From aaf2d08622ad1022351d0f20fc0d53bda35caf2b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 11:55:47 +0100 Subject: [PATCH 01/14] feat: configure leading dimensions explicitly --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index cbf18e0..7c9b328 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -118,10 +118,10 @@ class BlasCuda { } } - void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k) { - CheckAndAddLayout(k, m); - CheckAndAddLayout(k, n); - CheckAndAddLayout(m, n); + void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k, std::size_t lda, std::size_t ldb, std::size_t ldc) { + CheckAndAddLayout(k, m, lda); + CheckAndAddLayout(k, n, ldb); + CheckAndAddLayout(m, n, ldc); } template @@ -313,11 +313,10 @@ gemmrelu(char transa, char transb, const unsigned int m, private: alpaka::QueueCudaRtNonBlocking m_queue; - void CheckAndAddLayout(size_t rows, size_t cols) { + void CheckAndAddLayout(size_t rows, size_t cols, size_t ld) { auto key = std::make_pair(rows, cols); if (LayoutStore.find(key) == LayoutStore.end()) { cublasLtMatrixLayout_t temp = nullptr; - size_t ld = rows; CHECK_CUBLAS( cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); LayoutStore.emplace(key, temp); From 9aa4b884854ed82985667e8481710f3124d7dd48 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 14:31:55 +0100 Subject: [PATCH 02/14] fix: set stream while initializing handle --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 7c9b328..58c0f27 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -49,7 +49,6 @@ struct PairEq { class BlasCuda { cublasLtHandle_t ltHandle = nullptr; - cublasHandle_t handle = nullptr; cublasLtMatmulDesc_t operationDesc = nullptr; cublasLtMatmulPreference_t preference = nullptr; void *d_workspace = nullptr; @@ -72,7 +71,7 @@ class BlasCuda { BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); CHECK_CUBLAS(cublasLtCreate(<Handle)); - CHECK_CUBLAS(cublasCreate(&handle)); + CHECK_CUBLAS(cublasSetStream(ltHandle, stream)); heuristic = {}; CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From 2498197aec00d4c65f4c46eec23d75e253b468b6 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 14:52:00 +0100 Subject: [PATCH 03/14] feat: print requested workspace --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 58c0f27..1c9dd24 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -71,7 +71,6 @@ class BlasCuda { BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); CHECK_CUBLAS(cublasLtCreate(<Handle)); - CHECK_CUBLAS(cublasSetStream(ltHandle, stream)); heuristic = {}; CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); @@ -170,7 +169,8 @@ gemm(char transa, char transb, const unsigned int m, 1, &localHeuristic, &returnedResults)); - + std::cout << "Requested workspace: " + << localHeuristic.workspaceSize << std::endl; if (returnedResults == 0) { cublasLtMatmulDescDestroy(localDesc); std::cerr << "No suitable cuBLASLt algorithm found!\n"; @@ -237,7 +237,8 @@ gemmrelu(char transa, char transb, const unsigned int m, 1, &localHeuristic, &error_flag)); - + std::cout << "Requested workspace: " + << localHeuristic.workspaceSize << std::endl; if (error_flag == 0) { cublasLtMatmulDescDestroy(localDesc); std::cerr << "No suitable cuBLASLt algorithm found!\n"; From 901c5ae33fd687b7c3ad0b01babf0b0008846ca7 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 17:31:13 +0100 Subject: [PATCH 04/14] fix: provide layout order during adding --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 1c9dd24..e921ebb 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -319,6 +319,8 @@ gemmrelu(char transa, char transb, const unsigned int m, cublasLtMatrixLayout_t temp = nullptr; CHECK_CUBLAS( cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); + cublasLtMatrixLayoutSetAttribute( + temp, CUBLASLT_MATRIX_LAYOUT_ORDER, CUBLASLT_ORDER_COL, sizeof(int)); LayoutStore.emplace(key, temp); } } From 8a27ffe044d87c4f87269bd3e5d8772769edae7d Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 Mar 2026 10:38:08 +0100 Subject: [PATCH 05/14] fix: layout order configuration --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index e921ebb..1c9dd24 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -319,8 +319,6 @@ gemmrelu(char transa, char transb, const unsigned int m, cublasLtMatrixLayout_t temp = nullptr; CHECK_CUBLAS( cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); - cublasLtMatrixLayoutSetAttribute( - temp, CUBLASLT_MATRIX_LAYOUT_ORDER, CUBLASLT_ORDER_COL, sizeof(int)); LayoutStore.emplace(key, temp); } } From 54dce6320accc9aea63bb3401cb19da36dbef779 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 Mar 2026 10:46:56 +0100 Subject: [PATCH 06/14] fix: remove look for requested workspace --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 1c9dd24..47d4b98 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -169,8 +169,6 @@ gemm(char transa, char transb, const unsigned int m, 1, &localHeuristic, &returnedResults)); - std::cout << "Requested workspace: " - << localHeuristic.workspaceSize << std::endl; if (returnedResults == 0) { cublasLtMatmulDescDestroy(localDesc); std::cerr << "No suitable cuBLASLt algorithm found!\n"; From c5128e65d644826766fdaba0417432765d4d357b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 11:31:30 +0200 Subject: [PATCH 07/14] feat: matmul method for without bias matrix multiplication condition --- .../backends/cuda/sofieBLAS_cublas.hpp | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 47d4b98..831d0cb 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -308,6 +308,66 @@ gemmrelu(char transa, char transb, const unsigned int m, workspaceSize, stream)); } + // matmul without bias + template + inline void + matmul(char transa, char transb, const unsigned int m, + const unsigned int n, const unsigned int k, + const float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + const float beta, + alpaka::BufCudaRt, TIdx> &C) + { + cublasLtMatmulDesc_t localDesc = nullptr; + CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + + cublasOperation_t transB_op = charToCuBlasTranspose(transb); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); + + cublasOperation_t transA_op = charToCuBlasTranspose(transa); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); + + + cublasLtMatmulHeuristicResult_t localHeuristic{}; + int returnedResults = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + localDesc, + LayoutStore.at({k, m}), + LayoutStore.at({k, n}), + LayoutStore.at({m, n}), + LayoutStore.at({m, n}), + preference, + 1, + &localHeuristic, + &returnedResults)); + if (returnedResults == 0) { + cublasLtMatmulDescDestroy(localDesc); + std::cerr << "No suitable cuBLASLt algorithm found!\n"; + exit(EXIT_FAILURE); + } + + CHECK_CUBLAS(cublasLtMatmul( + ltHandle, + localDesc, + &alpha, + alpaka::getPtrNative(A), LayoutStore.at({k, m}), + alpaka::getPtrNative(B), LayoutStore.at({k, n}), + &beta, + alpaka::getPtrNative(C), LayoutStore.at({m, n}), + alpaka::getPtrNative(C), LayoutStore.at({m, n}), + &(localHeuristic.algo), + d_workspace, + workspaceSize, + stream)); + + cudaDeviceSynchronize(); + CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); + } + private: alpaka::QueueCudaRtNonBlocking m_queue; From 9282ba9770d9064f6d77e8571081957b4051c754 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 11:42:30 +0200 Subject: [PATCH 08/14] feat: option to pass gpu pointers directly --- .../backends/cuda/sofieBLAS_cublas.hpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 831d0cb..39d3b5b 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -308,6 +308,7 @@ gemmrelu(char transa, char transb, const unsigned int m, workspaceSize, stream)); } + // matmul without bias template inline void @@ -319,6 +320,24 @@ gemmrelu(char transa, char transb, const unsigned int m, const float beta, alpaka::BufCudaRt, TIdx> &C) { + + matmul(transa, transb, m, n, k, alpha, + reinterpret_cast(alpaka::getPtrNative(A)), + reinterpret_cast(alpaka::getPtrNative(B)), + beta, + reinterpret_cast(alpaka::getPtrNative(C))); + } + + template + inline void + matmul(char transa, char transb, const unsigned int m, + const unsigned int n, const unsigned int k, + const float alpha, + void const &A, + void const &B, + const float beta, + void &C) + { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); @@ -354,11 +373,11 @@ gemmrelu(char transa, char transb, const unsigned int m, ltHandle, localDesc, &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), + A, LayoutStore.at({k, m}), + B, LayoutStore.at({k, n}), &beta, - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), + C, LayoutStore.at({m, n}), + C, LayoutStore.at({m, n}), &(localHeuristic.algo), d_workspace, workspaceSize, From fc9073798360fabe61c6de31dc4e3d8c2da76049 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:08:02 +0200 Subject: [PATCH 09/14] fix: function signature for matmul method with void pointers --- .../sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 39d3b5b..c174ad3 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -322,21 +322,21 @@ gemmrelu(char transa, char transb, const unsigned int m, { matmul(transa, transb, m, n, k, alpha, - reinterpret_cast(alpaka::getPtrNative(A)), - reinterpret_cast(alpaka::getPtrNative(B)), + alpaka::getPtrNative(A), + alpaka::getPtrNative(B), beta, - reinterpret_cast(alpaka::getPtrNative(C))); + alpaka::getPtrNative(C)); } - + template inline void matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, const float alpha, - void const &A, - void const &B, + void const * A, + void const * B, const float beta, - void &C) + void * C) { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From b591c4a11085a480821077713205a8da06675fb3 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:16:20 +0200 Subject: [PATCH 10/14] fix: template types for arguments to function with pointer signatures --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index c174ad3..b8abe23 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -333,10 +333,10 @@ gemmrelu(char transa, char transb, const unsigned int m, matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, const float alpha, - void const * A, - void const * B, + T const * A, + T const * B, const float beta, - void * C) + T * C) { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From 19b5b3f4951f8caf46b742d965612f13d58696af Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:40:23 +0200 Subject: [PATCH 11/14] fix: use float signatures for pointer arguments for blascuda --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index b8abe23..9b9c5e4 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -333,10 +333,10 @@ gemmrelu(char transa, char transb, const unsigned int m, matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, const float alpha, - T const * A, - T const * B, + float * A, + float * B, const float beta, - T * C) + float * C) { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From c9954648aec4460d4a193296954df8201dbab3e7 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:50:40 +0200 Subject: [PATCH 12/14] fix: use explicit data type for signatures with pointer arguments --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 9b9c5e4..57da5e7 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -328,7 +328,6 @@ gemmrelu(char transa, char transb, const unsigned int m, alpaka::getPtrNative(C)); } - template inline void matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, From 75f75a4ef66a0654ff4e8380f0ecee19c208e62b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:58:37 +0200 Subject: [PATCH 13/14] fix: non transpose axis for matmul method --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 57da5e7..21269fc 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -354,7 +354,7 @@ gemmrelu(char transa, char transb, const unsigned int m, CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( ltHandle, localDesc, - LayoutStore.at({k, m}), + LayoutStore.at({m, k}), LayoutStore.at({k, n}), LayoutStore.at({m, n}), LayoutStore.at({m, n}), @@ -372,7 +372,7 @@ gemmrelu(char transa, char transb, const unsigned int m, ltHandle, localDesc, &alpha, - A, LayoutStore.at({k, m}), + A, LayoutStore.at({m, k}), B, LayoutStore.at({k, n}), &beta, C, LayoutStore.at({m, n}), From ed303fbc7edef361332afbc857f65f1e9e8c26b1 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 13:38:18 +0200 Subject: [PATCH 14/14] fix (experimental): layout shape order --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 21269fc..de36507 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -117,7 +117,7 @@ class BlasCuda { } void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k, std::size_t lda, std::size_t ldb, std::size_t ldc) { - CheckAndAddLayout(k, m, lda); + CheckAndAddLayout(m, k, lda); CheckAndAddLayout(k, n, ldb); CheckAndAddLayout(m, n, ldc); }