From e845e50ad3747f18ad680007b9cd64a05f3fce0c Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 16:48:33 +0300 Subject: [PATCH 01/32] +add SVE optimizations of function AbsDifferenceSum. --- docs/2026.html | 9 ++++ prj/vs2022/Sve1.vcxproj | 1 + prj/vs2022/Sve1.vcxproj.filters | 6 +++ src/Simd/SimdLib.cpp | 5 ++ src/Simd/SimdSve1.h | 2 + src/Simd/SimdSve1AbsDifferenceSum.cpp | 68 +++++++++++++++++++++++++++ src/Test/TestDifferenceSum.cpp | 6 ++- 7 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 src/Simd/SimdSve1AbsDifferenceSum.cpp diff --git a/docs/2026.html b/docs/2026.html index 32d449f5b7..7a10dd1896 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -35,6 +35,15 @@

Simd Library Release Notes (2026).

2013 +Home +
+

June X, 2026 (version 7.1.162)

+

Algorithms

+
New features
+ + Home

May 4, 2026 (version 7.1.161)

diff --git a/prj/vs2022/Sve1.vcxproj b/prj/vs2022/Sve1.vcxproj index 75c0879b23..93c5d19a12 100644 --- a/prj/vs2022/Sve1.vcxproj +++ b/prj/vs2022/Sve1.vcxproj @@ -22,6 +22,7 @@ + diff --git a/prj/vs2022/Sve1.vcxproj.filters b/prj/vs2022/Sve1.vcxproj.filters index 3ecc6a5937..3356e32b25 100644 --- a/prj/vs2022/Sve1.vcxproj.filters +++ b/prj/vs2022/Sve1.vcxproj.filters @@ -19,6 +19,9 @@ {9b881931-b5e2-4e02-85e0-c84c24f7eacb} + + {b123ae36-270f-4b5c-8b87-372f66f78ba6} + @@ -47,5 +50,8 @@ Sve1\Convert + + Sve1\Statistics + \ No newline at end of file diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 770c4525bf..9922a57a8c 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -333,6 +333,11 @@ SIMD_API void SimdAbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8 Sse41::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + Sve::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) Neon::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h index 165dc7b2ef..cd021477c5 100644 --- a/src/Simd/SimdSve1.h +++ b/src/Simd/SimdSve1.h @@ -35,6 +35,8 @@ namespace Simd { void AbsDifference(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, uint8_t* c, size_t cStride, size_t width, size_t height); + void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum); + void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride); void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp new file mode 100644 index 0000000000..4d7c0daa6a --- /dev/null 
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -0,0 +1,68 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2022 Yermalayeu Ihar, +* 2022-2022 Fabien Spindler, +* 2022-2022 Souriya Trinh. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdSve1.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SVE_ENABLE + namespace Sve + { + void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum) + { + size_t A = svlen(svuint8_t()); + size_t widthA = AlignLo(width, A); + const svbool_t body = svwhilelt_b8(size_t(0), A); + const svbool_t tail = svwhilelt_b8(widthA, width); + //svuint8_t _a + //svuint64_t _sum = svdup_n_u64(0); + uint64_t _sum = 0; + for (size_t row = 0; row < height; ++row) + { + //svuint32_t rowSum = svdup_n_u32(0); + size_t col = 0; + for (; col < widthA; col += A) + { + svuint8_t _a = svld1_u8(body, a + col); + svuint8_t _b = svld1_u8(body, b + col); + svuint8_t abd = svabd_x(body, _a, _b); + _sum += svaddv_u8(body, abd); + } + if (widthA < width) + { + svuint8_t _a = svld1_u8(tail, a + col); + svuint8_t _b = svld1_u8(tail, b + col); + svuint8_t abd = svabd_x(tail, _a, _b); + _sum += svaddv_u8(tail, abd); + } + a += aStride; + b += bStride; + } + *sum = _sum; + } + } +#endif +} diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp index 130955ac0e..10cd0257d7 100644 --- a/src/Test/TestDifferenceSum.cpp +++ b/src/Test/TestDifferenceSum.cpp @@ -215,7 +215,6 @@ namespace Test if (Simd::Neon::Enable && TestNeon(options)) result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Neon::SquaredDifferenceSum), FUNC_S(SimdSquaredDifferenceSum), 1); #endif - return result; } @@ -276,6 +275,11 @@ namespace Test result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Neon::AbsDifferenceSum), FUNC_S(SimdAbsDifferenceSum), 1); #endif +#ifdef SIMD_SVE_ENABLE + if (Simd::Sve::Enable && TestSve(options)) + result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Sve::AbsDifferenceSum), FUNC_S(SimdAbsDifferenceSum), 1); +#endif + #ifdef SIMD_HVX_ENABLE if (Simd::Hvx::Enable && TestHvx(options) && W >= Simd::Hvx::A) result = result && 
DifferenceSumsAutoTest(FUNC_S(Simd::Hvx::AbsDifferenceSum), FUNC_S(SimdAbsDifferenceSum), 1); From 99ab1443d53848de43a04ccc09e9c5c66d246431 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 17:02:09 +0300 Subject: [PATCH 02/32] +add SVE optimizations of function AbsDifferenceSum (version 2). --- src/Simd/SimdSve1AbsDifferenceSum.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 4d7c0daa6a..39afea2897 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -37,31 +37,30 @@ namespace Simd size_t widthA = AlignLo(width, A); const svbool_t body = svwhilelt_b8(size_t(0), A); const svbool_t tail = svwhilelt_b8(widthA, width); - //svuint8_t _a - //svuint64_t _sum = svdup_n_u64(0); - uint64_t _sum = 0; + svuint8_t _1 = svdup_n_u8(1); + *sum = 0; for (size_t row = 0; row < height; ++row) { - //svuint32_t rowSum = svdup_n_u32(0); size_t col = 0; + svuint32_t _sum = svdup_n_u32(0); for (; col < widthA; col += A) { svuint8_t _a = svld1_u8(body, a + col); svuint8_t _b = svld1_u8(body, b + col); svuint8_t abd = svabd_x(body, _a, _b); - _sum += svaddv_u8(body, abd); + _sum = svdot_u32(_sum, abd, _1); } if (widthA < width) { svuint8_t _a = svld1_u8(tail, a + col); svuint8_t _b = svld1_u8(tail, b + col); svuint8_t abd = svabd_x(tail, _a, _b); - _sum += svaddv_u8(tail, abd); + _sum = svdot_u32(_sum, abd, _1); } + *sum += svaddv_u32(svptrue_b32(), _sum); a += aStride; b += bStride; } - *sum = _sum; } } #endif From 5cb414677394fd4f8bd74bc7e2005f4394d13897 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 17:39:43 +0300 Subject: [PATCH 03/32] +add SVE optimizations of function AbsDifferenceSumMasked. 
--- docs/2026.html | 1 + src/Simd/SimdLib.cpp | 5 ++++ src/Simd/SimdNeonAbsDifferenceSum.cpp | 8 +++++- src/Simd/SimdSve1.h | 3 ++ src/Simd/SimdSve1AbsDifferenceSum.cpp | 40 +++++++++++++++++++++++++++ src/Test/TestDifferenceSum.cpp | 5 ++++ 6 files changed, 61 insertions(+), 1 deletion(-) diff --git a/docs/2026.html b/docs/2026.html index 7a10dd1896..5f8f4d4d71 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -42,6 +42,7 @@

Algorithms

New features
  • SVE optimizations of function AbsDifferenceSum.
  • +
  • SVE optimizations of function AbsDifferenceSumMasked.
Home diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 9922a57a8c..e304def840 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -370,6 +370,11 @@ SIMD_API void SimdAbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const Sse41::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + Sve::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) Neon::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); diff --git a/src/Simd/SimdNeonAbsDifferenceSum.cpp b/src/Simd/SimdNeonAbsDifferenceSum.cpp index c3b97857db..360a2aa4f4 100644 --- a/src/Simd/SimdNeonAbsDifferenceSum.cpp +++ b/src/Simd/SimdNeonAbsDifferenceSum.cpp @@ -75,6 +75,8 @@ namespace Simd AbsDifferenceSum(a, aStride, b, bStride, width, height, sum); } + //-------------------------------------------------------------------------------------------------- + template void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) { @@ -131,6 +133,8 @@ namespace Simd AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum); } + //-------------------------------------------------------------------------------------------------- + template void AbsDifferenceSums3(uint8x16_t current, const uint8_t * background, uint16x8_t sums[3]) { sums[0] = vaddq_u16(sums[0], vpaddlq_u8(vabdq_u8(current, Load(background - 1)))); @@ -228,6 +232,8 @@ namespace Simd AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); } + //-------------------------------------------------------------------------------------------------- + template void 
AbsDifferenceSums3Masked16(uint8x16_t current, const uint8_t * background, uint8x16_t mask, uint16x8_t sums[3]) { sums[0] = vaddq_u16(sums[0], vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load(background - 1))))); @@ -318,5 +324,5 @@ namespace Simd AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); } } -#endif// SIMD_NEON_ENABLE +#endif } diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h index cd021477c5..51dff21046 100644 --- a/src/Simd/SimdSve1.h +++ b/src/Simd/SimdSve1.h @@ -37,6 +37,9 @@ namespace Simd void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum); + void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, + const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum); + void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride); void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 39afea2897..e27146e917 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -62,6 +62,46 @@ namespace Simd b += bStride; } } + + //-------------------------------------------------------------------------------------------------- + + void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, + const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum) + { + size_t A = svlen(svuint8_t()); + size_t widthA = AlignLo(width, A); + const svbool_t body = svwhilelt_b8(size_t(0), A); + const svbool_t tail = svwhilelt_b8(widthA, width); + svuint8_t _i = svdup_n_u8(index), _1 = svdup_n_u8(1); + *sum = 0; + for 
(size_t row = 0; row < height; ++row) + { + size_t col = 0; + svuint32_t _sum = svdup_n_u32(0); + for (; col < widthA; col += A) + { + svuint8_t _a = svld1_u8(body, a + col); + svuint8_t _b = svld1_u8(body, b + col); + svuint8_t _m = svld1_u8(body, mask + col); + svbool_t _mask = svcmpeq_u8(body, _m, _i); + svuint8_t abd = svabd_x(_mask, _a, _b); + _sum = svdot_u32(_sum, abd, _1); + } + if (widthA < width) + { + svuint8_t _a = svld1_u8(tail, a + col); + svuint8_t _b = svld1_u8(tail, b + col); + svuint8_t _m = svld1_u8(tail, mask + col); + svbool_t _mask = svcmpeq_u8(tail, _m, _i); + svuint8_t abd = svabd_x(_mask, _a, _b); + _sum = svdot_u32(_sum, abd, _1); + } + *sum += svaddv_u32(svptrue_b32(), _sum); + a += aStride; + b += bStride; + mask += maskStride; + } + } } #endif } diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp index 10cd0257d7..003fdcc6f0 100644 --- a/src/Test/TestDifferenceSum.cpp +++ b/src/Test/TestDifferenceSum.cpp @@ -315,6 +315,11 @@ namespace Test result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Neon::AbsDifferenceSumMasked), FUNC_M(SimdAbsDifferenceSumMasked), 1); #endif +#ifdef SIMD_SVE_ENABLE + if (Simd::Sve::Enable && TestSve(options)) + result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Sve::AbsDifferenceSumMasked), FUNC_M(SimdAbsDifferenceSumMasked), 1); +#endif + return result; } From d763831fcd811d730583cbaa04044e6b36a3b805 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 17:50:39 +0300 Subject: [PATCH 04/32] *fix bug in SVE optimizations of function AbsDifferenceSumMasked. 
--- src/Simd/SimdSve1AbsDifferenceSum.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index e27146e917..f1b6201c62 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -84,7 +84,7 @@ namespace Simd svuint8_t _b = svld1_u8(body, b + col); svuint8_t _m = svld1_u8(body, mask + col); svbool_t _mask = svcmpeq_u8(body, _m, _i); - svuint8_t abd = svabd_x(_mask, _a, _b); + svuint8_t abd = svabd_z(_mask, _a, _b); _sum = svdot_u32(_sum, abd, _1); } if (widthA < width) @@ -93,7 +93,7 @@ namespace Simd svuint8_t _b = svld1_u8(tail, b + col); svuint8_t _m = svld1_u8(tail, mask + col); svbool_t _mask = svcmpeq_u8(tail, _m, _i); - svuint8_t abd = svabd_x(_mask, _a, _b); + svuint8_t abd = svabd_z(_mask, _a, _b); _sum = svdot_u32(_sum, abd, _1); } *sum += svaddv_u32(svptrue_b32(), _sum); From edb89d9cd31b5740470cf251f6ed2c786e6073f4 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 19:56:44 +0300 Subject: [PATCH 05/32] *refactoring of SVE optimizations of functions AbsDifferenceSum and AbsDifferenceSumMasked. 
--- src/Simd/SimdSve1AbsDifferenceSum.cpp | 52 ++++++++++++--------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index f1b6201c62..b2c8fcc6f5 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -31,6 +31,14 @@ namespace Simd #ifdef SIMD_SVE_ENABLE namespace Sve { + SIMD_INLINE void AbsDifferenceSum(const uint8_t* a, const uint8_t* b, const svuint8_t& _1, const svbool_t & mask, svuint32_t & sum) + { + svuint8_t _a = svld1_u8(mask, a); + svuint8_t _b = svld1_u8(mask, b); + svuint8_t abd = svabd_x(mask, _a, _b); + sum = svdot_u32(sum, abd, _1); + } + void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum) { size_t A = svlen(svuint8_t()); @@ -44,19 +52,9 @@ namespace Simd size_t col = 0; svuint32_t _sum = svdup_n_u32(0); for (; col < widthA; col += A) - { - svuint8_t _a = svld1_u8(body, a + col); - svuint8_t _b = svld1_u8(body, b + col); - svuint8_t abd = svabd_x(body, _a, _b); - _sum = svdot_u32(_sum, abd, _1); - } + AbsDifferenceSum(a + col, b + col, _1, body, _sum); if (widthA < width) - { - svuint8_t _a = svld1_u8(tail, a + col); - svuint8_t _b = svld1_u8(tail, b + col); - svuint8_t abd = svabd_x(tail, _a, _b); - _sum = svdot_u32(_sum, abd, _1); - } + AbsDifferenceSum(a + col, b + col, _1, tail, _sum); *sum += svaddv_u32(svptrue_b32(), _sum); a += aStride; b += bStride; @@ -65,6 +63,16 @@ namespace Simd //-------------------------------------------------------------------------------------------------- + SIMD_INLINE void AbsDifferenceSumMasked(const uint8_t* a, const uint8_t* b, const uint8_t* m, const svuint8_t& _1, const svuint8_t& index, const svbool_t& mask, svuint32_t& sum) + { + svuint8_t _a = svld1_u8(mask, a); + svuint8_t _b = svld1_u8(mask, b); + svuint8_t _m = svld1_u8(mask, m); + svbool_t _mask = svcmpeq_u8(mask, _m, index); + 
svuint8_t abd = svabd_z(_mask, _a, _b); + sum = svdot_u32(sum, abd, _1); + } + void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum) { @@ -72,30 +80,16 @@ namespace Simd size_t widthA = AlignLo(width, A); const svbool_t body = svwhilelt_b8(size_t(0), A); const svbool_t tail = svwhilelt_b8(widthA, width); - svuint8_t _i = svdup_n_u8(index), _1 = svdup_n_u8(1); + svuint8_t _index = svdup_n_u8(index), _1 = svdup_n_u8(1); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; svuint32_t _sum = svdup_n_u32(0); for (; col < widthA; col += A) - { - svuint8_t _a = svld1_u8(body, a + col); - svuint8_t _b = svld1_u8(body, b + col); - svuint8_t _m = svld1_u8(body, mask + col); - svbool_t _mask = svcmpeq_u8(body, _m, _i); - svuint8_t abd = svabd_z(_mask, _a, _b); - _sum = svdot_u32(_sum, abd, _1); - } + AbsDifferenceSumMasked(a + col, b + col, mask + col, _1, _index, body, _sum); if (widthA < width) - { - svuint8_t _a = svld1_u8(tail, a + col); - svuint8_t _b = svld1_u8(tail, b + col); - svuint8_t _m = svld1_u8(tail, mask + col); - svbool_t _mask = svcmpeq_u8(tail, _m, _i); - svuint8_t abd = svabd_z(_mask, _a, _b); - _sum = svdot_u32(_sum, abd, _1); - } + AbsDifferenceSumMasked(a + col, b + col, mask + col, _1, _index, tail, _sum); *sum += svaddv_u32(svptrue_b32(), _sum); a += aStride; b += bStride; From 720f4ee4eb7bb865a68e047d8d9a4268f28e562e Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 20:55:46 +0300 Subject: [PATCH 06/32] +add SVE optimizations of function AbsDifferenceSums3x3. 
--- docs/2026.html | 1 + src/Simd/SimdLib.cpp | 5 +++ src/Simd/SimdSve1.h | 3 ++ src/Simd/SimdSve1AbsDifferenceSum.cpp | 60 +++++++++++++++++++++++++++ src/Test/TestDifferenceSum.cpp | 5 +++ 5 files changed, 74 insertions(+) diff --git a/docs/2026.html b/docs/2026.html index 5f8f4d4d71..c09e0834de 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -43,6 +43,7 @@
New features
  • SVE optimizations of function AbsDifferenceSum.
  • SVE optimizations of function AbsDifferenceSumMasked.
  • +
  • SVE optimizations of function AbsDifferenceSums3x3.
Home diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index e304def840..7db27e4646 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -402,6 +402,11 @@ SIMD_API void SimdAbsDifferenceSums3x3(const uint8_t *current, size_t currentStr Sse41::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + Sve::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A + 2) Neon::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums); diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h index 51dff21046..ad8f3f3fc9 100644 --- a/src/Simd/SimdSve1.h +++ b/src/Simd/SimdSve1.h @@ -40,6 +40,9 @@ namespace Simd void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum); + void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, + size_t width, size_t height, uint64_t* sums); + void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride); void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index b2c8fcc6f5..1a24e8c683 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -96,6 +96,66 @@ namespace Simd mask += maskStride; } } + + //-------------------------------------------------------------------------------------------------- + + SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& 
mask, svuint32x3_t sums) + { + svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1)); + svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1)); + svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1)); + } + + SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, + const svbool_t& mask, svuint32x3_t &sums0, svuint32x3_t &sums3, svuint32x3_t& sums6) + { + svuint8_t _current = svld1_u8(mask, current); + AbsDifferenceSums3(_current, background - stride, _1, mask, sums0); + AbsDifferenceSums3(_current, background, _1, mask, sums3); + AbsDifferenceSums3(_current, background + stride, _1, mask, sums6); + } + + SIMD_INLINE void AddRowSums3(svuint32x3_t src, uint64_t* dst) + { + dst[0] += svaddv_u32(svptrue_b32(), svget3(src, 0)); + dst[1] += svaddv_u32(svptrue_b32(), svget3(src, 1)); + dst[2] += svaddv_u32(svptrue_b32(), svget3(src, 2)); + } + + void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums) + { + assert(height > 2 && width > 2); + + width -= 2; + height -= 2; + current += 1 + currentStride; + background += 1 + backgroundStride; + + size_t A = svlen(svuint8_t()); + size_t widthA = AlignLo(width, A); + const svbool_t body = svwhilelt_b8(size_t(0), A); + const svbool_t tail = svwhilelt_b8(widthA, width); + svuint8_t _1 = svdup_n_u8(1); + + for (size_t i = 0; i < 9; ++i) + sums[i] = 0; + for (size_t row = 0; row < height; ++row) + { + svuint32x3_t sums0 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0)); + svuint32x3_t sums3 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0)); + svuint32x3_t sums6 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0)); + size_t col = 0; + for (; col < widthA; col += A) + 
AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, body, sums0, sums3, sums6); + if (widthA < width) + AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, tail, sums0, sums3, sums6); + AddRowSums3(sums0, sums + 0); + AddRowSums3(sums3, sums + 3); + AddRowSums3(sums6, sums + 6); + current += currentStride; + background += backgroundStride; + } + } } #endif } diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp index 003fdcc6f0..99ad5c3272 100644 --- a/src/Test/TestDifferenceSum.cpp +++ b/src/Test/TestDifferenceSum.cpp @@ -350,6 +350,11 @@ namespace Test result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Neon::AbsDifferenceSums3x3), FUNC_S(SimdAbsDifferenceSums3x3), 9); #endif +#ifdef SIMD_SVE_ENABLE + if (Simd::Sve::Enable && TestSve(options)) + result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Sve::AbsDifferenceSums3x3), FUNC_S(SimdAbsDifferenceSums3x3), 9); +#endif + return result; } From fbceae9c926879998bad5fc9f0fca7255cba741a Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 4 May 2026 21:01:50 +0300 Subject: [PATCH 07/32] *fix bug in Sve::AbsDifferenceSums3x3. 
--- src/Simd/SimdSve1AbsDifferenceSum.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 1a24e8c683..5a7f4728d8 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -99,11 +99,11 @@ namespace Simd //-------------------------------------------------------------------------------------------------- - SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t sums) + SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t & sums) { - svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1)); - svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1)); - svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1)); + sums = svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1)); + sums = svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1)); + sums = svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1)); } SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, From 1293ef7dfbd5668fb5e80bf7d710cd85bdfb2920 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 09:08:18 +0300 Subject: [PATCH 08/32] *fix bug in Sve::AbsDifferenceSums3x3 (part 2). 
--- src/Simd/SimdSve1AbsDifferenceSum.cpp | 51 ++++++++++++++++----------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 5a7f4728d8..8545b979ff 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -99,27 +99,35 @@ namespace Simd //-------------------------------------------------------------------------------------------------- - SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t & sums) + SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, + svuint32_t &sum0, svuint32_t& sum1, svuint32_t& sum2) { - sums = svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1)); - sums = svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1)); - sums = svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1)); + sum0 = svdot_u32(sum0, svabd_x(mask, current, svld1_u8(mask, background - 1)), _1); + sum1 = svdot_u32(sum1, svabd_x(mask, current, svld1_u8(mask, background)), _1); + sum2 = svdot_u32(sum2, svabd_x(mask, current, svld1_u8(mask, background + 1)), _1); } - SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, - const svbool_t& mask, svuint32x3_t &sums0, svuint32x3_t &sums3, svuint32x3_t& sums6) + SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, const svbool_t& mask, + svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2, svuint32_t& sum3, svuint32_t& sum4, svuint32_t& sum5, svuint32_t& sum6, svuint32_t& sum7, svuint32_t& sum8) { svuint8_t _current = svld1_u8(mask, current); - 
AbsDifferenceSums3(_current, background - stride, _1, mask, sums0); - AbsDifferenceSums3(_current, background, _1, mask, sums3); - AbsDifferenceSums3(_current, background + stride, _1, mask, sums6); + AbsDifferenceSums3(_current, background - stride, _1, mask, sum0, sum1, sum2); + AbsDifferenceSums3(_current, background, _1, mask, sum3, sum4, sum5); + AbsDifferenceSums3(_current, background + stride, _1, mask, sum6, sum7, sum8); } - SIMD_INLINE void AddRowSums3(svuint32x3_t src, uint64_t* dst) + SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2) { - dst[0] += svaddv_u32(svptrue_b32(), svget3(src, 0)); - dst[1] += svaddv_u32(svptrue_b32(), svget3(src, 1)); - dst[2] += svaddv_u32(svptrue_b32(), svget3(src, 2)); + sum0 = svdup_n_u32(0); + sum1 = svdup_n_u32(0); + sum2 = svdup_n_u32(0); + } + + SIMD_INLINE void AddSums(const svuint32_t& sum0, const svuint32_t& sum1, const svuint32_t& sum2, uint64_t* sums) + { + sums[0] += svaddv_u32(svptrue_b32(), sum0); + sums[1] += svaddv_u32(svptrue_b32(), sum1); + sums[2] += svaddv_u32(svptrue_b32(), sum2); } void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums) @@ -139,19 +147,20 @@ namespace Simd for (size_t i = 0; i < 9; ++i) sums[i] = 0; + svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8; for (size_t row = 0; row < height; ++row) { - svuint32x3_t sums0 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0)); - svuint32x3_t sums3 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0)); - svuint32x3_t sums6 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0)); + ClearSums(s0, s1, s3); + ClearSums(s3, s4, s5); + ClearSums(s6, s7, s8); size_t col = 0; for (; col < widthA; col += A) - AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, body, sums0, sums3, sums6); + AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, 
body, s0, s1, s2, s3, s4, s5, s6, s7, s8); if (widthA < width) - AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, tail, sums0, sums3, sums6); - AddRowSums3(sums0, sums + 0); - AddRowSums3(sums3, sums + 3); - AddRowSums3(sums6, sums + 6); + AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, tail, s0, s1, s2, s3, s4, s5, s6, s7, s8); + AddSums(s0, s1, s2, sums + 0); + AddSums(s3, s4, s5, sums + 3); + AddSums(s6, s7, s8, sums + 6); current += currentStride; background += backgroundStride; } From 50a2f75592d68768b23983b2db4cd0a6a67f112e Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 09:13:41 +0300 Subject: [PATCH 09/32] *fix bug in Sve::AbsDifferenceSums3x3 (part 3). --- src/Simd/SimdSve1AbsDifferenceSum.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 8545b979ff..6d55d57823 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -108,12 +108,12 @@ namespace Simd } SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, const svbool_t& mask, - svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2, svuint32_t& sum3, svuint32_t& sum4, svuint32_t& sum5, svuint32_t& sum6, svuint32_t& sum7, svuint32_t& sum8) + svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5, svuint32_t& s6, svuint32_t& s7, svuint32_t& s8) { svuint8_t _current = svld1_u8(mask, current); - AbsDifferenceSums3(_current, background - stride, _1, mask, sum0, sum1, sum2); - AbsDifferenceSums3(_current, background, _1, mask, sum3, sum4, sum5); - AbsDifferenceSums3(_current, background + stride, _1, mask, sum6, sum7, sum8); + AbsDifferenceSums3(_current, background - stride, _1, mask, s0, s1, s2); + AbsDifferenceSums3(_current, background, _1, mask, s3, s4, s5); + 
AbsDifferenceSums3(_current, background + stride, _1, mask, s6, s7, s8); } SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2) @@ -150,7 +150,7 @@ namespace Simd svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8; for (size_t row = 0; row < height; ++row) { - ClearSums(s0, s1, s3); + ClearSums(s0, s1, s2); ClearSums(s3, s4, s5); ClearSums(s6, s7, s8); size_t col = 0; From 9a91bcfefa3ac9f1a550fdb62965e36ae8147bfd Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 10:02:08 +0300 Subject: [PATCH 10/32] +add SVE optimizations of function AbsDifferenceSums3x3Masked. --- docs/2026.html | 1 + src/Simd/SimdLib.cpp | 5 +++ src/Simd/SimdSve1.h | 3 ++ src/Simd/SimdSve1AbsDifferenceSum.cpp | 60 +++++++++++++++++++++++++++ src/Test/TestDifferenceSum.cpp | 5 +++ 5 files changed, 74 insertions(+) diff --git a/docs/2026.html b/docs/2026.html index c09e0834de..d35a59563b 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -44,6 +44,7 @@
New features
  • SVE optimizations of function AbsDifferenceSum.
  • SVE optimizations of function AbsDifferenceSumMasked.
  • SVE optimizations of function AbsDifferenceSums3x3.
  • +
  • SVE optimizations of function AbsDifferenceSums3x3Masked.
  • Home diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 7db27e4646..6afbc92956 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -434,6 +434,11 @@ SIMD_API void SimdAbsDifferenceSums3x3Masked(const uint8_t *current, size_t curr Sse41::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + Sve::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A + 2) Neon::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums); diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h index ad8f3f3fc9..293d6bf276 100644 --- a/src/Simd/SimdSve1.h +++ b/src/Simd/SimdSve1.h @@ -43,6 +43,9 @@ namespace Simd void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums); + void AbsDifferenceSums3x3Masked(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, + const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sums); + void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride); void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 6d55d57823..6efa0d0a94 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -165,6 +165,66 @@ namespace Simd background += backgroundStride; } } + + 
//-------------------------------------------------------------------------------------------------- + + SIMD_INLINE void AbsDifferenceSums3Masked(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, + svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2) + { + sum0 = svdot_u32(sum0, svabd_z(mask, current, svld1_u8(mask, background - 1)), _1); + sum1 = svdot_u32(sum1, svabd_z(mask, current, svld1_u8(mask, background)), _1); + sum2 = svdot_u32(sum2, svabd_z(mask, current, svld1_u8(mask, background + 1)), _1); + } + + SIMD_INLINE void AbsDifferenceSums3x3Masked(const uint8_t* c, const uint8_t* b, size_t stride, const uint8_t* m, const svuint8_t& _1, const svuint8_t& i, const svbool_t& mask, + svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5, svuint32_t& s6, svuint32_t& s7, svuint32_t& s8) + { + svuint8_t _c = svld1_u8(mask, c); + svuint8_t _m = svld1_u8(mask, m); + svbool_t _mask = svcmpeq_u8(mask, _m, i); + AbsDifferenceSums3Masked(_c, b - stride, _1, _mask, s0, s1, s2); + AbsDifferenceSums3Masked(_c, b, _1, _mask, s3, s4, s5); + AbsDifferenceSums3Masked(_c, b + stride, _1, _mask, s6, s7, s8); + } + + void AbsDifferenceSums3x3Masked(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, + const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sums) + { + assert(height > 2 && width > 2); + + width -= 2; + height -= 2; + current += 1 + currentStride; + background += 1 + backgroundStride; + mask += 1 + maskStride; + + size_t A = svlen(svuint8_t()); + size_t widthA = AlignLo(width, A); + const svbool_t body = svwhilelt_b8(size_t(0), A); + const svbool_t tail = svwhilelt_b8(widthA, width); + svuint8_t _index = svdup_n_u8(index), _1 = svdup_n_u8(1); + + for (size_t i = 0; i < 9; ++i) + sums[i] = 0; + svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + for (size_t row = 0; row < height; ++row) + { + 
ClearSums(s0, s1, s2); + ClearSums(s3, s4, s5); + ClearSums(s6, s7, s8); + size_t col = 0; + for (; col < widthA; col += A) + AbsDifferenceSums3x3Masked(current + col, background + col, backgroundStride, mask + col, _1, _index, body, s0, s1, s2, s3, s4, s5, s6, s7, s8); + if (widthA < width) + AbsDifferenceSums3x3Masked(current + col, background + col, backgroundStride, mask + col, _1, _index, tail, s0, s1, s2, s3, s4, s5, s6, s7, s8); + AddSums(s0, s1, s2, sums + 0); + AddSums(s3, s4, s5, sums + 3); + AddSums(s6, s7, s8, sums + 6); + current += currentStride; + background += backgroundStride; + mask += maskStride; + } + } } #endif } diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp index 99ad5c3272..983efaa2bd 100644 --- a/src/Test/TestDifferenceSum.cpp +++ b/src/Test/TestDifferenceSum.cpp @@ -385,6 +385,11 @@ namespace Test result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Neon::AbsDifferenceSums3x3Masked), FUNC_M(SimdAbsDifferenceSums3x3Masked), 9); #endif +#ifdef SIMD_SVE_ENABLE + if (Simd::Sve::Enable && TestSve(options)) + result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Sve::AbsDifferenceSums3x3Masked), FUNC_M(SimdAbsDifferenceSums3x3Masked), 9); +#endif + return result; } From e11754104a3b0b6abc3fa7b9eb1aedfba8f7f5c8 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 10:27:47 +0300 Subject: [PATCH 11/32] *improve SVE optimizations of function AbsDifferenceSums3x3. 
--- src/Simd/SimdSve1AbsDifferenceSum.cpp | 60 +++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp index 6efa0d0a94..d7c425b876 100644 --- a/src/Simd/SimdSve1AbsDifferenceSum.cpp +++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp @@ -99,6 +99,20 @@ namespace Simd //-------------------------------------------------------------------------------------------------- + SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2) + { + sum0 = svdup_n_u32(0); + sum1 = svdup_n_u32(0); + sum2 = svdup_n_u32(0); + } + + SIMD_INLINE void AddSums(const svuint32_t& sum0, const svuint32_t& sum1, const svuint32_t& sum2, uint64_t* sums) + { + sums[0] += svaddv_u32(svptrue_b32(), sum0); + sums[1] += svaddv_u32(svptrue_b32(), sum1); + sums[2] += svaddv_u32(svptrue_b32(), sum2); + } + SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32_t &sum0, svuint32_t& sum1, svuint32_t& sum2) { @@ -116,18 +130,28 @@ namespace Simd AbsDifferenceSums3(_current, background + stride, _1, mask, s6, s7, s8); } - SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2) + SIMD_INLINE void AbsDifferenceSums3x2(const svuint8_t& c0, const svuint8_t& c1, const uint8_t* b, const svuint8_t& _1, const svbool_t& mask, + svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5) { - sum0 = svdup_n_u32(0); - sum1 = svdup_n_u32(0); - sum2 = svdup_n_u32(0); + svuint8_t b0 = svld1_u8(mask, b - 1); + s0 = svdot_u32(s0, svabd_x(mask, c0, b0), _1); + s3 = svdot_u32(s3, svabd_x(mask, c1, b0), _1); + svuint8_t b1 = svld1_u8(mask, b); + s1 = svdot_u32(s1, svabd_x(mask, c0, b1), _1); + s4 = svdot_u32(s4, svabd_x(mask, c1, b1), _1); + svuint8_t b2 = svld1_u8(mask, b + 1); + s2 = svdot_u32(s2, svabd_x(mask, c0, b2), _1); + s5 = svdot_u32(s5, 
svabd_x(mask, c1, b2), _1); } - SIMD_INLINE void AddSums(const svuint32_t& sum0, const svuint32_t& sum1, const svuint32_t& sum2, uint64_t* sums) + SIMD_INLINE void AbsDifferenceSums3x3x2(const uint8_t* c, size_t cStride, const uint8_t* b, size_t bStride, const svuint8_t& _1, const svbool_t& mask, + svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5, svuint32_t& s6, svuint32_t& s7, svuint32_t& s8) { - sums[0] += svaddv_u32(svptrue_b32(), sum0); - sums[1] += svaddv_u32(svptrue_b32(), sum1); - sums[2] += svaddv_u32(svptrue_b32(), sum2); + svuint8_t c0 = svld1_u8(mask, c), c1 = svld1_u8(mask, c + cStride); + AbsDifferenceSums3(c0, b - bStride, _1, mask, s0, s1, s2); + AbsDifferenceSums3x2(c1, c0, b, _1, mask, s0, s1, s2, s3, s4, s5); + AbsDifferenceSums3x2(c1, c0, b + bStride, _1, mask, s3, s4, s5, s6, s7, s8); + AbsDifferenceSums3(c1, b + 2 * bStride, _1, mask, s6, s7, s8); } void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums) @@ -141,6 +165,7 @@ namespace Simd size_t A = svlen(svuint8_t()); size_t widthA = AlignLo(width, A); + size_t height2 = AlignLo(height, 2); const svbool_t body = svwhilelt_b8(size_t(0), A); const svbool_t tail = svwhilelt_b8(widthA, width); svuint8_t _1 = svdup_n_u8(1); @@ -148,7 +173,24 @@ namespace Simd for (size_t i = 0; i < 9; ++i) sums[i] = 0; svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - for (size_t row = 0; row < height; ++row) + size_t row = 0; + for (; row < height2; row += 2) + { + ClearSums(s0, s1, s2); + ClearSums(s3, s4, s5); + ClearSums(s6, s7, s8); + size_t col = 0; + for (; col < widthA; col += A) + AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, _1, body, s0, s1, s2, s3, s4, s5, s6, s7, s8); + if (widthA < width) + AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, _1, tail, s0, s1, s2, s3, 
s4, s5, s6, s7, s8); + AddSums(s0, s1, s2, sums + 0); + AddSums(s3, s4, s5, sums + 3); + AddSums(s6, s7, s8, sums + 6); + current += 2 * currentStride; + background += 2 * backgroundStride; + } + for (; row < height; ++row) { ClearSums(s0, s1, s2); ClearSums(s3, s4, s5); From b7ac1869b9285d7b7370a9eca26b3b8dda636c34 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 12:27:19 +0300 Subject: [PATCH 12/32] +add SVE optimizations of function BackgroundGrowRangeSlow. --- docs/2026.html | 1 + prj/vs2022/Sve1.vcxproj | 1 + prj/vs2022/Sve1.vcxproj.filters | 6 +++ src/Simd/SimdLib.cpp | 5 +++ src/Simd/SimdSve1.h | 3 ++ src/Simd/SimdSve1Background.cpp | 66 +++++++++++++++++++++++++++++++++ src/Test/TestBackground.cpp | 5 +++ 7 files changed, 87 insertions(+) create mode 100644 src/Simd/SimdSve1Background.cpp diff --git a/docs/2026.html b/docs/2026.html index d35a59563b..5f125b2243 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -45,6 +45,7 @@
    New features
  • SVE optimizations of function AbsDifferenceSumMasked.
  • SVE optimizations of function AbsDifferenceSums3x3.
  • SVE optimizations of function AbsDifferenceSums3x3Masked.
  • +
  • SVE optimizations of function BackgroundGrowRangeSlow.
  • Home diff --git a/prj/vs2022/Sve1.vcxproj b/prj/vs2022/Sve1.vcxproj index 93c5d19a12..a960799e61 100644 --- a/prj/vs2022/Sve1.vcxproj +++ b/prj/vs2022/Sve1.vcxproj @@ -24,6 +24,7 @@ + diff --git a/prj/vs2022/Sve1.vcxproj.filters b/prj/vs2022/Sve1.vcxproj.filters index 3356e32b25..87fc1e9732 100644 --- a/prj/vs2022/Sve1.vcxproj.filters +++ b/prj/vs2022/Sve1.vcxproj.filters @@ -22,6 +22,9 @@ {b123ae36-270f-4b5c-8b87-372f66f78ba6} + + {c617af02-4208-4080-9349-d24bcf67c558} + @@ -53,5 +56,8 @@ Sve1\Statistics + + Sve1\Motion + \ No newline at end of file diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 6afbc92956..d7e8c70752 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -722,6 +722,11 @@ SIMD_API void SimdBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStr Sse41::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + Sve::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) Neon::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride); diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h index 293d6bf276..a1133666de 100644 --- a/src/Simd/SimdSve1.h +++ b/src/Simd/SimdSve1.h @@ -48,6 +48,9 @@ namespace Simd void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride); + void BackgroundGrowRangeSlow(const uint8_t* value, size_t valueStride, size_t width, size_t height, + uint8_t* lo, size_t loStride, uint8_t* hi, size_t hiStride); + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); void DeinterleaveUv(const uint8_t* uv, size_t uvStride, size_t width, size_t height, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride); diff --git a/src/Simd/SimdSve1Background.cpp 
b/src/Simd/SimdSve1Background.cpp new file mode 100644 index 0000000000..fcb76d4d38 --- /dev/null +++ b/src/Simd/SimdSve1Background.cpp @@ -0,0 +1,66 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2017 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SVE_ENABLE + namespace Sve + { + SIMD_INLINE void BackgroundGrowRangeSlow(const uint8_t * value, uint8_t * lo, uint8_t * hi, const svuint8_t& _1, const svbool_t & mask) + { + svuint8_t _value = svld1_u8(mask, value); + svuint8_t _lo = svld1_u8(mask, lo); + svuint8_t _hi = svld1_u8(mask, hi); + + svbool_t inc = svcmpgt_u8(mask, _value, _hi); + svbool_t dec = svcmpgt_u8(mask, _value, _lo); + + svst1_u8(mask, lo, svqsub_u8(_lo, svand_u8_z(dec, _1, _1))); + svst1_u8(mask, hi, svqadd_u8(_hi, svand_u8_z(inc, _1, _1))); + } + + void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride) + { + size_t A = svlen(svuint8_t()); + size_t widthA = AlignLo(width, A); + const svbool_t body = svwhilelt_b8(size_t(0), A); + const svbool_t tail = svwhilelt_b8(widthA, width); + svuint8_t _1 = svdup_n_u8(1); + for (size_t row = 0; row < height; ++row) + { + size_t col = 0; + for (; col < widthA; col += A) + BackgroundGrowRangeSlow(value + col, lo + col, hi + col, _1, body); + if (widthA < width) + BackgroundGrowRangeSlow(value + col, lo + col, hi + col, _1, tail); + value += valueStride; + lo += loStride; + hi += hiStride; + } + } + } +#endif +} diff --git a/src/Test/TestBackground.cpp b/src/Test/TestBackground.cpp index fe509fa3db..51d9351f56 100644 --- a/src/Test/TestBackground.cpp +++ b/src/Test/TestBackground.cpp @@ -464,6 +464,11 @@ namespace Test result = result && BackgroundChangeRangeAutoTest(FUNC1(Simd::Neon::BackgroundGrowRangeSlow), FUNC1(SimdBackgroundGrowRangeSlow)); #endif +#ifdef SIMD_SVE_ENABLE + if (Simd::Sve::Enable && TestSve(options)) + result = result && BackgroundChangeRangeAutoTest(FUNC1(Simd::Sve::BackgroundGrowRangeSlow), FUNC1(SimdBackgroundGrowRangeSlow)); +#endif + return result; } From 89bebe810e2f533e20faf0902a1ae317b70bd93d Mon Sep 17 00:00:00 2001 From: 
Yermalayeu Ihar Date: Tue, 5 May 2026 12:35:22 +0300 Subject: [PATCH 13/32] *fix bug in SVE optimizations of function BackgroundGrowRangeSlow. --- src/Simd/SimdSve1Background.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Simd/SimdSve1Background.cpp b/src/Simd/SimdSve1Background.cpp index fcb76d4d38..5ffafb7ff0 100644 --- a/src/Simd/SimdSve1Background.cpp +++ b/src/Simd/SimdSve1Background.cpp @@ -36,7 +36,7 @@ namespace Simd svuint8_t _hi = svld1_u8(mask, hi); svbool_t inc = svcmpgt_u8(mask, _value, _hi); - svbool_t dec = svcmpgt_u8(mask, _value, _lo); + svbool_t dec = svcmplt_u8(mask, _value, _lo); svst1_u8(mask, lo, svqsub_u8(_lo, svand_u8_z(dec, _1, _1))); svst1_u8(mask, hi, svqadd_u8(_hi, svand_u8_z(inc, _1, _1))); From 0452f8838ce6f7be222df237ea6cd2f8dbacec15 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 12:48:22 +0300 Subject: [PATCH 14/32] *fix bug: Error in function SimdAlignment for SVE (ARM). --- docs/2026.html | 4 ++++ src/Simd/SimdAlignment.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/docs/2026.html b/docs/2026.html index 5f125b2243..14493f65b6 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -47,6 +47,10 @@
    New features
  • SVE optimizations of function AbsDifferenceSums3x3Masked.
  • SVE optimizations of function BackgroundGrowRangeSlow.
  • +
    Bug fixing
    +
      +
    • Error in function SimdAlignment for SVE (ARM).
    • +
    Home
    diff --git a/src/Simd/SimdAlignment.h b/src/Simd/SimdAlignment.h index 837884b1f5..738c230552 100644 --- a/src/Simd/SimdAlignment.h +++ b/src/Simd/SimdAlignment.h @@ -51,6 +51,11 @@ namespace Simd return sizeof(__m128i); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + return Sve::SveSize; + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable) return sizeof(uint8x16_t); From 56d807e60314e807ce6f3f4a92a7d97a855d1da2 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 13:10:58 +0300 Subject: [PATCH 15/32] *update help. --- README.md | 3 +- docs/2026.html | 13 ++++ docs/help/group__c__types.html | 2 +- docs/help/group__descrint.html | 22 +++--- docs/help/group__drawing.html | 10 +-- docs/help/group__gaussian__filter.html | 4 +- docs/help/group__image__io.html | 10 +-- docs/help/group__info.html | 42 ++++++++-- docs/help/group__memory.html | 78 +++++++++++++------ docs/help/group__object__detection.html | 6 +- .../group__recursive__bilateral__filter.html | 4 +- docs/help/group__resizing.html | 4 +- docs/help/group__shifting.html | 6 +- docs/help/group__synet__add.html | 4 +- .../help/group__synet__convolution__bf16.html | 10 +-- .../help/group__synet__convolution__fp32.html | 12 +-- .../help/group__synet__convolution__int8.html | 12 +-- .../group__synet__deconvolution__bf16.html | 10 +-- .../group__synet__deconvolution__fp32.html | 12 +-- docs/help/group__synet__gather__elements.html | 8 +- docs/help/group__synet__grid__sample.html | 6 +- docs/help/group__synet__inner__product.html | 10 +-- .../group__synet__inner__product__bf16.html | 10 +-- ...oup__synet__merged__convolution__bf16.html | 12 +-- ...oup__synet__merged__convolution__fp32.html | 12 +-- ...oup__synet__merged__convolution__int8.html | 12 +-- docs/help/group__synet__permute.html | 6 +- docs/help/group__synet__quantized__add.html | 4 +- .../group__synet__quantized__convolution.html | 12 +-- ...oup__synet__quantized__inner__product.html | 12 +-- 
...synet__quantized__merged__convolution.html | 12 +-- docs/help/group__synet__scale.html | 12 +-- docs/help/group__warp__affine.html | 4 +- docs/help/index.html | 3 +- docs/help/struct_simd_1_1_view.html | 2 +- docs/index.html | 2 +- prj/txt/DoxygenOverview.txt | 3 +- src/Simd/SimdLib.h | 3 +- 38 files changed, 242 insertions(+), 167 deletions(-) diff --git a/README.md b/README.md index 8ed8dc5505..d3892b2c73 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ object detection and classification, neural network. The algorithms are optimized with using of different SIMD CPU extensions. In particular the library supports following CPU extensions: -SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM, HVX for Hexagon. +SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon. The Simd Library has C API and also contains useful C++ classes and functions to facilitate access to C API. The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, @@ -98,6 +98,7 @@ There are addition build parameters: * `SIMD_AVX512` - Enable of AVX-512 (AVX-512F, AVX-512CD, AVX-512VL, AVX-512DQ, AVX-512BW) CPU extensions. It is switched on by default. * `SIMD_AVX512VNNI` - Enable of AVX-512-VNNI CPU extensions. It is switched on by default. * `SIMD_AMXBF16` - Enable of AMX-BF16, AMX-INT8 and AVX-512-BF16 CPU extensions. It is switched off by default. +* `SIMD_SVE` - Enable of SVE CPU extension. It is switched off by default. * `SIMD_TEST` - Build test framework. It is switched on by default. * `SIMD_INFO` - Print build information. It is switched on by default. * `SIMD_PERF` - Enable of internal performance statistic. It is switched off by default. diff --git a/docs/2026.html b/docs/2026.html index 14493f65b6..76f760826e 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -52,6 +52,19 @@
    Bug fixing
  • Error in function SimdAlignment for SVE (ARM).
  • +

    Documentation

    +
    Improving
    +
      +
    • Description of function SimdVersion.
    • +
    • Description of function SimdCpuDesc.
    • +
    • Description of function SimdCpuInfo.
    • +
    • Description of function SimdAllocate.
    • +
    • Description of function SimdFree.
    • +
    • Description of function SimdAlign.
    • +
    • Description of function SimdAlignment.
    • +
    • Description of function SimdRelease.
    • +
    + Home

    May 4, 2026 (version 7.1.161)

    diff --git a/docs/help/group__c__types.html b/docs/help/group__c__types.html index c41e21addc..036bf3d69d 100644 --- a/docs/help/group__c__types.html +++ b/docs/help/group__c__types.html @@ -325,7 +325,7 @@

    -

    Describes type of description which can return function SimdCpuDesc.

    +

    Describes the type of description that function SimdCpuDesc can return.

    diff --git a/docs/help/group__descrint.html b/docs/help/group__descrint.html index 815597d487..b1c61b1f65 100644 --- a/docs/help/group__descrint.html +++ b/docs/help/group__descrint.html @@ -121,7 +121,7 @@

    Returns
    a pointer to Integer Descriptor Engine context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdDescrIntEncodedSize, SimdDescrIntDecodedSize, SimdDescrIntEncode32f, SimdDescrIntEncode16f, SimdDescrIntDecode32f, SimdDescrIntDecode16f, SimdDescrIntCosineDistance, SimdDescrIntCosineDistancesMxNa, SimdDescrIntCosineDistancesMxNp, SimdDescrIntVectorNorm.
    +
    Returns
    a pointer to Integer Descriptor Engine context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdDescrIntEncodedSize, SimdDescrIntDecodedSize, SimdDescrIntEncode32f, SimdDescrIntEncode16f, SimdDescrIntDecode32f, SimdDescrIntDecode16f, SimdDescrIntCosineDistance, SimdDescrIntCosineDistancesMxNa, SimdDescrIntCosineDistancesMxNp, SimdDescrIntVectorNorm.
    @@ -144,7 +144,7 @@

    Parameters

    Enumerator
    SimdCpuDescModel 

    A CPU model name.

    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    @@ -171,7 +171,7 @@

    Parameters
    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    @@ -214,7 +214,7 @@

    Parameters
    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]src- a pointer to original 32-bit float descriptor. Its length can be determined by function SimdDescrIntDecodedSize.
    [out]dst- a pointer to encoded integer descriptor. Its size in bytes can be determined by function SimdDescrIntEncodedSize.
    @@ -258,7 +258,7 @@

    Parameters
    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]src- a pointer to original 16-bit float descriptor. Its length can be determined by function SimdDescrIntDecodedSize.
    [out]dst- a pointer to encoded integer descriptor. Its size in bytes can be determined by function SimdDescrIntEncodedSize.
    @@ -302,7 +302,7 @@

    Parameters
    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]src- a pointer to encoded integer descriptor. Its size in bytes can be determined by function SimdDescrIntEncodedSize.
    [out]dst- a pointer to output 32-bit float descriptor. Its length can be determined by function SimdDescrIntDecodedSize.
    @@ -346,7 +346,7 @@

    Parameters
    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]src- a pointer to encoded integer descriptor. Its size in bytes can be determined by function SimdDescrIntEncodedSize.
    [out]dst- a pointer to output 16-bit float descriptor. Its length can be determined by function SimdDescrIntDecodedSize.
    @@ -397,7 +397,7 @@

    Note
    Integer descriptor can be received with using of functions SimdDescrIntEncode32f or SimdDescrIntEncode16f. Its size in bytes is determined by function SimdDescrIntEncodedSize.
    Parameters
    - + @@ -461,7 +461,7 @@

    Note
    Integer descriptor can be received with using of functions SimdDescrIntEncode32f or SimdDescrIntEncode16f. Its size in bytes is determined by function SimdDescrIntEncodedSize.
    Parameters

    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]a- a pointer to the first integer descriptor.
    [in]b- a pointer to the second integer descriptor.
    [out]distance- a pointer to 32-bit float with cosine distance.
    - + @@ -527,7 +527,7 @@

    Note
    Integer descriptor can be received with using of functions SimdDescrIntEncode32f or SimdDescrIntEncode16f. Its size in bytes is determined by function SimdDescrIntEncodedSize.
    Parameters

    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]M- a number of A arrays.
    [in]N- a number of B arrays.
    [in]A- a pointer to the first array with pointers to integer descriptors.
    - + @@ -575,7 +575,7 @@

    Note
    Integer descriptor can be received with using of functions SimdDescrIntEncode32f or SimdDescrIntEncode16f. Its size in bytes is determined by function SimdDescrIntEncodedSize.
    Parameters

    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]M- a number of A arrays.
    [in]N- a number of B arrays.
    [in]A- a pointer to the first array with integer descriptors.
    - +
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]context- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
    [in]a- a pointer to integer descriptor.
    [out]norm- a pointer to result 32-bit float norm.
    diff --git a/docs/help/group__drawing.html b/docs/help/group__drawing.html index 2e24e725fd..d881a6ea7d 100644 --- a/docs/help/group__drawing.html +++ b/docs/help/group__drawing.html @@ -977,7 +977,7 @@

    Creates font context.

    -
    Returns
    a pointer to font context. On error it returns NULL. This pointer is used in functions SimdFontResize, SimdFontHeight. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to font context. On error it returns NULL. This pointer is used in functions SimdFontResize, SimdFontHeight. It must be released using function SimdRelease.
    @@ -1010,7 +1010,7 @@

    Parameters
    - +
    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [in]height- a new height of font.
    @@ -1038,7 +1038,7 @@

    Parameters
    - +
    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.

    @@ -1087,7 +1087,7 @@

    Parameters
    - + @@ -1174,7 +1174,7 @@

    Parameters

    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [in]text- a pointer to text.
    [out]width- a measured width of region need to draw this text.
    [out]height- a measured height of region need to draw this text.
    - + diff --git a/docs/help/group__gaussian__filter.html b/docs/help/group__gaussian__filter.html index 5daa7769ed..b477c7b11c 100644 --- a/docs/help/group__gaussian__filter.html +++ b/docs/help/group__gaussian__filter.html @@ -208,7 +208,7 @@

    Returns
    a pointer to filter context. On error it returns NULL. This pointer is used in functions SimdGaussianBlurRun. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to filter context. On error it returns NULL. This pointer is used in function SimdGaussianBlurRun. It must be released using function SimdRelease.
    @@ -270,7 +270,7 @@

    Parameters

    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [in]context- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
    [out]canvas- a pointer to pixels data of canvas image.
    [in]stride- a row size of canvas image.
    [in]width- a width of canvas image.
    - + diff --git a/docs/help/group__image__io.html b/docs/help/group__image__io.html index dd6f8c003d..1d79c01973 100644 --- a/docs/help/group__image__io.html +++ b/docs/help/group__image__io.html @@ -148,7 +148,7 @@

    Returns
    a pointer to memory buffer with output image file. It has to be deleted after use by function SimdFree. On error it returns NULL.
    +
    Returns
    a pointer to memory buffer with output image file. It has to be deleted after use by function SimdFree. On error it returns NULL.
    @@ -316,7 +316,7 @@

    Returns
    a pointer to memory buffer with output image file. It has to be deleted after use by function SimdFree. On error it returns NULL.
    +
    Returns
    a pointer to memory buffer with output image file. It has to be deleted after use by function SimdFree. On error it returns NULL.
    @@ -417,7 +417,7 @@

    Returns
    a pointer to memory buffer with output image file. It has to be deleted after use by function SimdFree. On error it returns NULL.
    +
    Returns
    a pointer to memory buffer with output image file. It has to be deleted after use by function SimdFree. On error it returns NULL.
    @@ -484,7 +484,7 @@

    Returns
    a pointer to pixels data of output image. It has to be deleted after use by function SimdFree. On error it returns NULL.
    +
    Returns
    a pointer to pixels data of output image. It has to be deleted after use by function SimdFree. On error it returns NULL.
    @@ -544,7 +544,7 @@

    Returns
    a pointer to pixels data of output image. It has to be deleted after use by function SimdFree. On error it returns NULL.
    +
    Returns
    a pointer to pixels data of output image. It has to be deleted after use by function SimdFree. On error it returns NULL.
    diff --git a/docs/help/group__info.html b/docs/help/group__info.html index 64ee6cb630..3e213b7e9f 100644 --- a/docs/help/group__info.html +++ b/docs/help/group__info.html @@ -54,7 +54,7 @@

    Simd Library Documentation.

    - + @@ -86,7 +86,18 @@

    Gets version of Simd Library.

    -
    Returns
    string with version of Simd Library (major version number, minor version number, release number, number of SVN's commits).
    +

    Returns a pointer to a null-terminated, statically allocated string that encodes the library version. The format of the string is:

    major.minor.release[.branch-sha]
    +

    where major, minor and release are numeric components taken from the library's version file, and the optional branch and sha suffix identify the Git branch name and short commit hash at build time (e.g. "7.1.161.main-a1b2c3d"). When version information is not available at build time the function returns "unknown".

    +

    The returned pointer is valid for the lifetime of the process and must not be freed.

    +

    Using example:

    #include "Simd/SimdLib.h"
    +#include <iostream>
    +
    +int main()
    +{
    +    std::cout << "Simd Library version: " << SimdVersion() << std::endl;
    +    return 0;
    +}
    +
    Returns
    a pointer to a static null-terminated string with the version of Simd Library.
    @@ -106,23 +117,27 @@

    -

    Gets description of CPU and Simd Library.

    -
    Note
    See enumeration SimdCpuDescType.
    +

    Gets a text description of the CPU.

    +

    Returns a pointer to a null-terminated string whose content depends on the requested SimdCpuDescType:

      +
    • SimdCpuDescModel — the CPU brand/model name string (e.g. "Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz"). On x86 it is read from the CPUID brand-string leaves; on Linux/ARM it is obtained via lscpu. An empty string is returned on platforms where the model name is not available (Apple, Android).
    • +
    +

    The returned pointer is valid for the lifetime of the process and must not be freed. For an unknown or unsupported type value the function returns NULL.

    +
    Note
    See enumeration SimdCpuDescType for the full list of supported types.

    Using example:

    #include "Simd/SimdLib.h"
     #include <iostream>
     
     int main()
     {
    -    std::cout << "CPU: " << SimdCpuDesc(SimdCpuDescModel) << std::endl;
    +    std::cout << "CPU model: " << SimdCpuDesc(SimdCpuDescModel) << std::endl;
         return 0;
     }
     
    Parameters

    [in]filter- a filter context. It must be created by function SimdGaussianBlurInit and released by function SimdRelease.
    [in]filter- a filter context. It must be created by function SimdGaussianBlurInit and released by function SimdRelease.
    [in]src- a pointer to pixels data of the original input image.
    [in]srcStride- a row size (in bytes) of the input image.
    [out]dst- a pointer to pixels data of the filtered output image.
     Gets version of Simd Library. More...
     
    SIMD_API const char * SimdCpuDesc (SimdCpuDescType type)
     Gets description of CPU and Simd Library. More...
     Gets a text description of the CPU. More...
     
    SIMD_API uint64_t SimdCpuInfo (SimdCpuInfoType type)
     Gets information about CPU and Simd Library. More...
    - +
    [in]type- a type of required description.
    [in]type- a type of required description. See SimdCpuDescType.
    -
    Returns
    a value which contains description of CPU and Simd Library.
    +
    Returns
    a pointer to a static null-terminated string with the requested CPU description, or NULL if type is not supported.
    @@ -143,6 +158,13 @@

    Gets information about CPU and Simd Library.

    +

    Depending on the requested SimdCpuInfoType, the function returns one of the following kinds of values:

      +
    • CPU topology: number of sockets, physical cores, or logical threads.
    • +
    • Cache / RAM sizes in bytes (L1 data cache, L2 cache, L3 cache, physical RAM).
    • +
    • SIMD extension availability: 1 if the extension is supported and enabled by the library, 0 otherwise. The extensions covered are SSE4.1 (and below), AVX2 (and FMA/AVX), AVX-512BW (and AVX-512F), AVX-512VNNI, AMX-BF16 (and AMX-INT8/AVX-512VBMI/AVX-512FP16), NEON, SVE, and HVX.
    • +
    • SVE vector width in bytes (SimdCpuInfoSveSize).
    • +
    • Current CPU core frequency in Hz (SimdCpuInfoCurrentFrequency); returns 0 if unavailable on the platform.
    • +
    Note
    See enumeration SimdCpuInfoType.

    Using example:

    #include "Simd/SimdLib.h"
     #include <iostream>
    @@ -162,6 +184,10 @@ 

    Parameters
    @@ -170,7 +196,7 @@

    Returns
    a value which contains information about CPU and Simd Library.

    +
    Returns
    a value whose meaning depends on type: a count (topology), size in bytes (cache/RAM), 1 or 0 (SIMD availability), size in bytes (SVE vector width), or frequency in Hz (current CPU frequency).
    diff --git a/docs/help/group__memory.html b/docs/help/group__memory.html index 3eac0f5f61..9e5c3568e6 100644 --- a/docs/help/group__memory.html +++ b/docs/help/group__memory.html @@ -51,19 +51,19 @@

    Simd Library Documentation.

    Functions

    SIMD_API void * SimdAllocate (size_t size, size_t align) - Allocates aligned memory block. More...
    + Allocates an aligned memory block. More...
      SIMD_API void SimdFree (void *ptr) - Frees aligned memory block. More...
    + Frees an aligned memory block previously allocated by SimdAllocate. More...
      SIMD_API size_t SimdAlign (size_t size, size_t align) - Gets aligned size. More...
    + Rounds a size value up to the nearest multiple of a given alignment. More...
      SIMD_API size_t SimdAlignment (void) - Gets alignment required for the most productive work of Simd Library. More...
    + Returns the optimal memory alignment for the current platform. More...
      SIMD_API void SimdRelease (void *context) - Releases context created with using of Simd Library API. More...
    + Destroys an opaque context object created by the Simd Library API. More...
      SIMD_INLINE void LitterCpuCache (size_t k=2)  It creates a large buffer and fills it. More...
    @@ -98,16 +98,31 @@

    -

    Allocates aligned memory block.

    -
    Note
    The memory allocated by this function is must be deleted by function SimdFree.
    -
    Parameters
    +

    Allocates an aligned memory block.

    +

    Allocates a contiguous memory block of at least size bytes whose start address is a multiple of align. The alignment value must be a power of two and, on POSIX platforms (GCC), is rounded up to at least sizeof(void*) internally. The actual allocation is performed via the platform-appropriate aligned allocator: _aligned_malloc (MSVC), __mingw_aligned_malloc (MinGW), posix_memalign (GCC), or plain malloc on platforms that do not support aligned allocation.

    +

    The block must be released with SimdFree — passing it to the standard free or delete is undefined behaviour.

    +

    Using example:

    #include "Simd/SimdLib.h"
    +
    +int main()
    +{
    +    const size_t size  = 1024;
    +    const size_t align = SimdAlignment();
    +    uint8_t * data = (uint8_t *)SimdAllocate(size, align);
    +    if (data)
    +    {
    +        // use data ...
    +        SimdFree(data);
    +    }
    +    return 0;
    +}
    +
    Parameters
    - - + +
    [in]size- a size of memory block.
    [in]align- a required alignment of memory block.
    [in]size- the number of bytes to allocate. Must be greater than zero.
    [in]align- the required alignment of the allocated block in bytes. Must be a power of two. Use SimdAlignment to obtain the optimal alignment for the current platform.
    -
    Returns
    a pointer to allocated memory.
    +
    Returns
    a pointer to the newly allocated aligned memory block, or NULL if the allocation fails.
    @@ -127,11 +142,13 @@

    -

    Frees aligned memory block.

    -
    Note
    This function frees a memory allocated by function SimdAllocate.
    +

    Frees an aligned memory block previously allocated by SimdAllocate.

    +

    Releases the memory block pointed to by ptr, which must have been returned by a prior call to SimdAllocate. Passing a pointer obtained from any other allocator (e.g. malloc, new, or _aligned_malloc) is undefined behaviour.

    +

    Passing NULL is safe and has no effect, consistent with the behaviour of the standard free function.

    +

    The underlying release call matches the allocator used by SimdAllocate for the current platform: _aligned_free (MSVC), __mingw_aligned_free (MinGW), or free (GCC and others).

    Parameters
    - +
    [in]ptr- a pointer to the memory to be deleted.
    [in]ptr- a pointer to the memory block to free. Must have been returned by SimdAllocate, or NULL (in which case the call has no effect).
    @@ -164,15 +181,17 @@

    -

    Gets aligned size.

    +

    Rounds a size value up to the nearest multiple of a given alignment.

    +

    Returns the smallest value that is both a multiple of align and greater than or equal to size. If size is already a multiple of align, it is returned unchanged.

    +

    The function uses the bitwise formula (size + align - 1) & ~(align - 1), which requires align to be a positive power of two.

    Parameters
    - - + +
    [in]size- an original size.
    [in]align- a required alignment.
    [in]size- the original size in bytes (or elements) to be aligned.
    [in]align- the required alignment in bytes. Must be a positive power of two. Use SimdAlignment to obtain the optimal alignment for the current platform.
    -
    Returns
    an aligned size.
    +
    Returns
    the smallest multiple of align that is greater than or equal to size.
    @@ -192,8 +211,18 @@

    -

    Gets alignment required for the most productive work of Simd Library.

    -
    Returns
    a required alignment.
    +

    Returns the optimal memory alignment for the current platform.

    +

    Returns the byte-width of the widest SIMD register available at runtime, which is the recommended alignment value to pass to SimdAllocate and SimdAlign in order to achieve best performance.

    +

    The value is determined once at library initialization time by probing the active SIMD extensions and is constant for the lifetime of the process:

      +
    • 128 bytes — HVX (Qualcomm Hexagon)
    • +
    • 64 bytes — AVX-512 (x86, when either AVX-512BW or AVX-512VNNI is available)
    • +
    • 32 bytes — AVX2 (x86)
    • +
    • 16 bytes — SSE4.1 (x86) or NEON (ARM)
    • +
    • sizeof(void*) — scalar fallback (no SIMD extensions detected)
    • +
    • the SVE vector size of the current CPU, in bytes — SVE (ARM), when available
    • +
    +

    The returned value is always a power of two and equals the value of the SIMD_ALIGN compile-time constant used internally by the library.

    +
    Returns
    the optimal alignment in bytes for the current platform.
    @@ -213,11 +242,14 @@

    -

    Releases context created with using of Simd Library API.

    -
    Note
    This function releases a context created by functions SimdDetectionLoadA and SimdDetectionInit.
    +

    Destroys an opaque context object created by the Simd Library API.

    +

    Releases any context object returned by a Simd Library context-creation function. These are typically functions whose name ends in Init (such as SimdGaussianBlurInit, SimdResizerInit, SimdWarpAffineInit, SimdDescrIntInit, SimdFontInit, SimdSynetConvolution32fInit, and others), but also include functions such as SimdShiftDetectorInitBuffers and SimdDetectionLoadA.

    +

    Internally the function performs a polymorphic delete through the virtual destructor of the internal Deletable base class, ensuring that the correct destructor is always invoked regardless of the actual context type.

    +

    Passing NULL is safe and has no effect, consistent with the behaviour of a C++ delete expression on a null pointer.

    +
    Note
    Passing a pointer that was not returned by a Simd Library context-creation function (for example a pointer from SimdAllocate, malloc, or new) is undefined behaviour.
    Parameters
    - +
    [in]context- a context to be released.
    [in]context- a pointer to the context to be released, or NULL.
    diff --git a/docs/help/group__object__detection.html b/docs/help/group__object__detection.html index a14762f74a..a113feab58 100644 --- a/docs/help/group__object__detection.html +++ b/docs/help/group__object__detection.html @@ -112,7 +112,7 @@

    Returns
    a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions SimdDetectionInfo and SimdDetectionInit, and must be released with using of function SimdRelease.

    +
    Returns
    a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions SimdDetectionInfo and SimdDetectionInit, and must be released with using of function SimdRelease.
    @@ -141,7 +141,7 @@

    Returns
    a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions SimdDetectionInfo and SimdDetectionInit, and must be released with using of function SimdRelease.
    +
    Returns
    a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions SimdDetectionInfo and SimdDetectionInit, and must be released with using of function SimdRelease.
    @@ -295,7 +295,7 @@

    Returns
    a pointer to hidden cascade. On error it returns NULL. This pointer is used in functions SimdDetectionPrepare, SimdDetectionHaarDetect32fp, SimdDetectionHaarDetect32fi, SimdDetectionLbpDetect32fp, SimdDetectionLbpDetect32fi, SimdDetectionLbpDetect16ip and SimdDetectionLbpDetect16ii. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to hidden cascade. On error it returns NULL. This pointer is used in functions SimdDetectionPrepare, SimdDetectionHaarDetect32fp, SimdDetectionHaarDetect32fi, SimdDetectionLbpDetect32fp, SimdDetectionLbpDetect32fi, SimdDetectionLbpDetect16ip and SimdDetectionLbpDetect16ii. It must be released with using of function SimdRelease.
    diff --git a/docs/help/group__recursive__bilateral__filter.html b/docs/help/group__recursive__bilateral__filter.html index 36c4d4fbdf..b4953b49f6 100644 --- a/docs/help/group__recursive__bilateral__filter.html +++ b/docs/help/group__recursive__bilateral__filter.html @@ -186,7 +186,7 @@

    Returns
    a pointer to filter context. On error it returns NULL. This pointer is used in functions SimdRecursiveBilateralFilterRun. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to filter context. On error it returns NULL. This pointer is used in functions SimdRecursiveBilateralFilterRun. It must be released with using of function SimdRelease.
    @@ -237,7 +237,7 @@

    Parameters
    - + diff --git a/docs/help/group__resizing.html b/docs/help/group__resizing.html index f6a8988737..c9b808fa89 100644 --- a/docs/help/group__resizing.html +++ b/docs/help/group__resizing.html @@ -732,7 +732,7 @@

    Returns
    a pointer to resize context. On error it returns NULL. This pointer is used in functions SimdResizerRun. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to resize context. On error it returns NULL. This pointer is used in functions SimdResizerRun. It must be released with using of function SimdRelease.
    @@ -783,7 +783,7 @@

    Parameters

    [in]filter- a filter context. It must be created by function SimdRecursiveBilateralFilterInit and released by function SimdRelease.
    [in]filter- a filter context. It must be created by function SimdRecursiveBilateralFilterInit and released by function SimdRelease.
    [in]src- a pointer to pixels data of the original input image.
    [in]srcStride- a row size (in bytes) of the input image.
    [out]dst- a pointer to pixels data of the filtered output image.
    - + diff --git a/docs/help/group__shifting.html b/docs/help/group__shifting.html index cbfc7cc7f6..62be61fe85 100644 --- a/docs/help/group__shifting.html +++ b/docs/help/group__shifting.html @@ -313,7 +313,7 @@

    Returns
    a pointer to shift detector context. On error it returns NULL. This pointer is used in functions SimdShiftDetectorSetBackground, SimdShiftDetectorEstimate. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to shift detector context. On error it returns NULL. This pointer is used in functions SimdShiftDetectorSetBackground, SimdShiftDetectorEstimate. It must be released with using of function SimdRelease.
    @@ -358,7 +358,7 @@

    Note
    This function used in class Simd::ShiftDetector.
    Parameters

    [in]resizer- a resize context. It must be created by function SimdResizerInit and released by function SimdRelease.
    [in]resizer- a resize context. It must be created by function SimdResizerInit and released by function SimdRelease.
    [in]src- a pointer to pixels data of the original input image.
    [in]srcStride- a row size (in bytes) of the input image.
    [out]dst- a pointer to pixels data of the resized output image.
    - + @@ -451,7 +451,7 @@

    Note
    This function used in class Simd::ShiftDetector.
    Parameters

    [in]context- a shift detector context. It must be created by function SimdShiftDetectorInitBuffers and released by function SimdRelease.
    [in]context- a shift detector context. It must be created by function SimdShiftDetectorInitBuffers and released by function SimdRelease.
    [in]bkg- a pointer to pixels data of background image.
    [in]bkgStride- a row size of the background image.
    [in]makeCopy- if true, copy of the background will be created.
    - + diff --git a/docs/help/group__synet__add.html b/docs/help/group__synet__add.html index 08ba7d3ad0..439325d0ea 100644 --- a/docs/help/group__synet__add.html +++ b/docs/help/group__synet__add.html @@ -143,7 +143,7 @@

    Returns
    a pointer to add context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in function SimdSynetAdd16bForward.
    +
    Returns
    a pointer to add context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in function SimdSynetAdd16bForward.
    @@ -188,7 +188,7 @@

    Parameters

    [in]context- a shift detector context. It must be created by function SimdShiftDetectorInitBuffers and released by function SimdRelease.
    [in]context- a shift detector context. It must be created by function SimdShiftDetectorInitBuffers and released by function SimdRelease.
    [in]curr- a pointer to pixels data of current image.
    [in]currStride- a row size of the current image.
    [in]currWidth- a width of current image.
    - + diff --git a/docs/help/group__synet__convolution__bf16.html b/docs/help/group__synet__convolution__bf16.html index f1997e4b9b..c204d40b41 100644 --- a/docs/help/group__synet__convolution__bf16.html +++ b/docs/help/group__synet__convolution__bf16.html @@ -110,7 +110,7 @@

    Returns
    a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetConvolution16bExternalBufferSize, SimdSynetConvolution16bInternalBufferSize, SimdSynetConvolution16bInfo, SimdSynetConvolution16bSetParams and SimdSynetConvolution16bForward.
    +
    Returns
    a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetConvolution16bExternalBufferSize, SimdSynetConvolution16bInternalBufferSize, SimdSynetConvolution16bInfo, SimdSynetConvolution16bSetParams and SimdSynetConvolution16bForward.
    @@ -133,7 +133,7 @@

    Parameters

    [in]context- a pointer to add context. It must be created by function SimdSynetAdd16bInit and released by function SimdRelease.
    [in]context- a pointer to add context. It must be created by function SimdSynetAdd16bInit and released by function SimdRelease.
    [in]a- a pointer to input A tensor.
    [in]b- a pointer to input B tensor.
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    @@ -160,7 +160,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    @@ -187,7 +187,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    @@ -236,7 +236,7 @@

    Parameters
    - + diff --git a/docs/help/group__synet__convolution__fp32.html b/docs/help/group__synet__convolution__fp32.html index b5dd9836a0..3c76b35396 100644 --- a/docs/help/group__synet__convolution__fp32.html +++ b/docs/help/group__synet__convolution__fp32.html @@ -106,7 +106,7 @@

    Returns
    a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetConvolution32fExternalBufferSize, SimdSynetConvolution32fInternalBufferSize, SimdSynetConvolution32fInfo, SimdSynetConvolution32fSetParams and SimdSynetConvolution32fForward.
    +
    Returns
    a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetConvolution32fExternalBufferSize, SimdSynetConvolution32fInternalBufferSize, SimdSynetConvolution32fInfo, SimdSynetConvolution32fSetParams and SimdSynetConvolution32fForward.
    @@ -129,7 +129,7 @@

    Parameters

    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 convolution context. It must be created by function SimdSynetConvolution16bInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    @@ -156,7 +156,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    @@ -183,7 +183,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    @@ -238,7 +238,7 @@

    Parameters
    - + @@ -290,7 +290,7 @@

    Parameters

    [in,out]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in,out]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in]weight- a pointer to convolution weights.
    [out]internal- a flag signalizing that weight is stored in the internal buffer. Can be NULL.
    [in]bias- a pointer to bias. Can be NULL.
    - + diff --git a/docs/help/group__synet__convolution__int8.html b/docs/help/group__synet__convolution__int8.html index 0913d872d4..9bd02e654a 100644 --- a/docs/help/group__synet__convolution__int8.html +++ b/docs/help/group__synet__convolution__int8.html @@ -113,7 +113,7 @@

    Returns
    a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetConvolution8iExternalBufferSize, SimdSynetConvolution8iInternalBufferSize, SimdSynetConvolution8iInfo, SimdSynetConvolution8iSetParams and SimdSynetConvolution8iForward.
    +
    Returns
    a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetConvolution8iExternalBufferSize, SimdSynetConvolution8iInternalBufferSize, SimdSynetConvolution8iInfo, SimdSynetConvolution8iSetParams and SimdSynetConvolution8iForward.
    @@ -136,7 +136,7 @@

    Parameters

    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 convolution context. It must be created by function SimdSynetConvolution32fInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    @@ -163,7 +163,7 @@

    Parameters
    - +
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    @@ -190,7 +190,7 @@

    Parameters
    - +
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    @@ -245,7 +245,7 @@

    Parameters
    - + @@ -297,7 +297,7 @@

    Parameters

    [in,out]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in,out]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in]weight- a pointer to original (32-bit float point) convolution weights.
    [in]bias- a pointer to original (32-bit float point) bias. Can be NULL.
    [in]params- a pointer to original (32-bit float point) parameters of activation functions (see SimdConvolutionActivationType). Can be NULL.
    - + diff --git a/docs/help/group__synet__deconvolution__bf16.html b/docs/help/group__synet__deconvolution__bf16.html index 6f8a6bf5f7..ccfa6c96c5 100644 --- a/docs/help/group__synet__deconvolution__bf16.html +++ b/docs/help/group__synet__deconvolution__bf16.html @@ -110,7 +110,7 @@

    Returns
    a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetDeconvolution16bExternalBufferSize, SimdSynetDeconvolution16bInternalBufferSize, SimdSynetDeconvolution16bInfo, SimdSynetDeconvolution16bSetParams and SimdSynetDeconvolution16bForward.
    +
    Returns
    a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetDeconvolution16bExternalBufferSize, SimdSynetDeconvolution16bInternalBufferSize, SimdSynetDeconvolution16bInfo, SimdSynetDeconvolution16bSetParams and SimdSynetDeconvolution16bForward.
    @@ -133,7 +133,7 @@

    Parameters

    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 convolution context. It must be created by function SimdSynetConvolution8iInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    @@ -160,7 +160,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    @@ -187,7 +187,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    @@ -236,7 +236,7 @@

    Parameters
    - + diff --git a/docs/help/group__synet__deconvolution__fp32.html b/docs/help/group__synet__deconvolution__fp32.html index 0c865b2a24..f8c8c25b9e 100644 --- a/docs/help/group__synet__deconvolution__fp32.html +++ b/docs/help/group__synet__deconvolution__fp32.html @@ -113,7 +113,7 @@

    Returns
    a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetDeconvolution32fExternalBufferSize, SimdSynetDeconvolution32fInternalBufferSize, SimdSynetDeconvolution32fInfo, SimdSynetDeconvolution32fSetParams and SimdSynetDeconvolution32fForward.
    +
    Returns
    a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetDeconvolution32fExternalBufferSize, SimdSynetDeconvolution32fInternalBufferSize, SimdSynetDeconvolution32fInfo, SimdSynetDeconvolution32fSetParams and SimdSynetDeconvolution32fForward.
    @@ -136,7 +136,7 @@

    Parameters

    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 deconvolution context. It must be created by function SimdSynetDeconvolution16bInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetDeconvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    @@ -163,7 +163,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    @@ -190,7 +190,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    @@ -245,7 +245,7 @@

    Parameters
    - + @@ -297,7 +297,7 @@

    Parameters

    [in,out]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in,out]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in]weight- a pointer to deconvolution weights.
    [out]internal- a flag signaling that the weight is stored in the internal buffer. Can be NULL.
    [in]bias- a pointer to bias. Can be NULL.
    - + diff --git a/docs/help/group__synet__gather__elements.html b/docs/help/group__synet__gather__elements.html index 3cc92ba300..0590991246 100644 --- a/docs/help/group__synet__gather__elements.html +++ b/docs/help/group__synet__gather__elements.html @@ -153,7 +153,7 @@

    Returns
    a pointer to gather elements context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetGatherElementsSetIndex, SimdSynetGatherElementsInternalBufferSize, and SimdSynetGatherElementsForward.
    +
    Returns
    a pointer to gather elements context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetGatherElementsSetIndex, SimdSynetGatherElementsInternalBufferSize, and SimdSynetGatherElementsForward.
    @@ -186,7 +186,7 @@

    Parameters

    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 deconvolution context. It must be created by function SimdSynetDeconvolution32fInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetDeconvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to gather elements context. It must be created by function SimdSynetGatherElementsInit and released by function SimdRelease.
    [in]context- a pointer to gather elements context. It must be created by function SimdSynetGatherElementsInit and released by function SimdRelease.
    [in]idx- a pointer to tensor with indexes. It can be INT32 or INT64. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner.
    @@ -213,7 +213,7 @@

    Parameters
    - +
    [in]context- a pointer to gather elements context. It must be created by function SimdSynetGatherElementsInit and released by function SimdRelease.
    [in]context- a pointer to gather elements context. It must be created by function SimdSynetGatherElementsInit and released by function SimdRelease.
    @@ -262,7 +262,7 @@

    Parameters
    - + diff --git a/docs/help/group__synet__grid__sample.html b/docs/help/group__synet__grid__sample.html index 3979a126b8..48334c5ad3 100644 --- a/docs/help/group__synet__grid__sample.html +++ b/docs/help/group__synet__grid__sample.html @@ -222,7 +222,7 @@

    Returns
    a pointer to grid sample 2D context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetGridSample2dInternalBufferSize and SimdSynetGridSample2dForward.
    +
    Returns
    a pointer to grid sample 2D context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetGridSample2dInternalBufferSize and SimdSynetGridSample2dForward.
    @@ -245,7 +245,7 @@

    Parameters

    [in]context- a pointer to gather elements context. It must be created by function SimdSynetGatherElementsInit and released by function SimdRelease.
    [in]context- a pointer to gather elements context. It must be created by function SimdSynetGatherElementsInit and released by function SimdRelease.
    [in]src- a pointer to input tensor. Its size = outer[0] * .. * outer[outerSize - 1] * srcCount * inner.
    [in]idx- a pointer to index tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner.
    [out]dst- a pointer to output tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner.
    - +
    [in]context- a pointer to grid sample 2D context. It must be created by function SimdSynetGridSample2dInit and released by function SimdRelease.
    [in]context- a pointer to grid sample 2D context. It must be created by function SimdSynetGridSample2dInit and released by function SimdRelease.
    @@ -294,7 +294,7 @@

    Parameters
    - + diff --git a/docs/help/group__synet__inner__product.html b/docs/help/group__synet__inner__product.html index 7d5b9f31d0..279ea25165 100644 --- a/docs/help/group__synet__inner__product.html +++ b/docs/help/group__synet__inner__product.html @@ -144,7 +144,7 @@

    Returns
    a pointer to FP32 inner product context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetInnerProduct32fInternalBufferSize, SimdSynetInnerProduct32fExternalBufferSize, SimdSynetInnerProduct32fSetParams and SimdSynetInnerProduct32fForward.
    +
    Returns
    a pointer to FP32 inner product context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetInnerProduct32fInternalBufferSize, SimdSynetInnerProduct32fExternalBufferSize, SimdSynetInnerProduct32fSetParams and SimdSynetInnerProduct32fForward.
    @@ -167,7 +167,7 @@

    Parameters

    [in]context- a pointer to grid sample 2D context. It must be created by function SimdSynetGridSample2dInit and released by function SimdRelease.
    [in]context- a pointer to grid sample 2D context. It must be created by function SimdSynetGridSample2dInit and released by function SimdRelease.
    [in]src- a pointer to input tensor. It has size = batch * channels * srcH * srcW.
    [in]grd- a pointer to grid tensor. It has size = batch * dstH * dstW * 2.
    [out]dst- a pointer to output tensor. It has size = batch * channels * dstH * dstW.
    - +
    [in]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    @@ -194,7 +194,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    @@ -249,7 +249,7 @@

    Parameters
    - + @@ -307,7 +307,7 @@

    Parameters

    [in,out]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    [in,out]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    [in]weight- a pointer to inner product weights.
    [out]internal- a flag signaling that the weight is stored in the internal buffer. Can be NULL.
    [in]bias- a pointer to bias. Can be NULL.
    - + diff --git a/docs/help/group__synet__inner__product__bf16.html b/docs/help/group__synet__inner__product__bf16.html index 6123b7750c..058dc69610 100644 --- a/docs/help/group__synet__inner__product__bf16.html +++ b/docs/help/group__synet__inner__product__bf16.html @@ -166,7 +166,7 @@

    Returns
    a pointer to BF16 inner product context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetInnerProduct16bInternalBufferSize, SimdSynetInnerProduct16bExternalBufferSize, SimdSynetInnerProduct16bInfo, SimdSynetInnerProduct16bSetParams and SimdSynetInnerProduct16bForward.
    +
    Returns
    a pointer to BF16 inner product context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetInnerProduct16bInternalBufferSize, SimdSynetInnerProduct16bExternalBufferSize, SimdSynetInnerProduct16bInfo, SimdSynetInnerProduct16bSetParams and SimdSynetInnerProduct16bForward.
    @@ -189,7 +189,7 @@

    Parameters

    [in]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 inner product context. It must be created by function SimdSynetInnerProduct32fInit and released by function SimdRelease.
    [in]A- a pointer to A matrix.
    [in]B- a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams.
    [out]buf- a pointer to external buffer. The size of the external temporary buffer is determined by function SimdSynetInnerProduct16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    - +
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    @@ -216,7 +216,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    @@ -243,7 +243,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    @@ -298,7 +298,7 @@

    Parameters
    - + diff --git a/docs/help/group__synet__merged__convolution__bf16.html b/docs/help/group__synet__merged__convolution__bf16.html index 8b75f74f21..91acb6f329 100644 --- a/docs/help/group__synet__merged__convolution__bf16.html +++ b/docs/help/group__synet__merged__convolution__bf16.html @@ -120,7 +120,7 @@

    Returns
    a pointer to BF16 merged convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetMergedConvolution16bExternalBufferSize, SimdSynetMergedConvolution16bInternalBufferSize, SimdSynetMergedConvolution16bInfo, SimdSynetMergedConvolution16bSetParams and SimdSynetMergedConvolution16bForward.
    +
    Returns
    a pointer to BF16 merged convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetMergedConvolution16bExternalBufferSize, SimdSynetMergedConvolution16bInternalBufferSize, SimdSynetMergedConvolution16bInfo, SimdSynetMergedConvolution16bSetParams and SimdSynetMergedConvolution16bForward.
    @@ -143,7 +143,7 @@

    Parameters

    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 inner product context. It must be created by function SimdSynetInnerProduct16bInit and released by function SimdRelease.
    [in]A- a pointer to A matrix.
    [in]B- a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams.
    [out]buf- a pointer to external buffer. The size of the external temporary buffer is determined by function SimdSynetInnerProduct16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    - +
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    @@ -170,7 +170,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    @@ -197,7 +197,7 @@

    Parameters
    - +
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    @@ -246,7 +246,7 @@

    Parameters
    - + @@ -297,7 +297,7 @@

    Parameters

    [in,out]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in,out]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in]weight- a pointer to the array with pointers to convolution weights. The array size is determined by number of merged convolutions.
    [in]bias- a pointer to the array with pointers to bias. The array size is determined by number of merged convolutions. Can be NULL.
    [in]params- a pointer to the array with pointers to parameters of the activation functions (see SimdConvolutionActivationType). The array size is determined by number of merged convolutions. Can be NULL.
    - + diff --git a/docs/help/group__synet__merged__convolution__fp32.html b/docs/help/group__synet__merged__convolution__fp32.html index 569861f746..2aeefb11f9 100644 --- a/docs/help/group__synet__merged__convolution__fp32.html +++ b/docs/help/group__synet__merged__convolution__fp32.html @@ -120,7 +120,7 @@

    Returns
    a pointer to FP32 merged convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetMergedConvolution32fExternalBufferSize, SimdSynetMergedConvolution32fInternalBufferSize, SimdSynetMergedConvolution32fInfo, SimdSynetMergedConvolution32fSetParams and SimdSynetMergedConvolution32fForward.
    +
    Returns
    a pointer to FP32 merged convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetMergedConvolution32fExternalBufferSize, SimdSynetMergedConvolution32fInternalBufferSize, SimdSynetMergedConvolution32fInfo, SimdSynetMergedConvolution32fSetParams and SimdSynetMergedConvolution32fForward.
    @@ -143,7 +143,7 @@

    Parameters

    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in]context- a pointer to BF16 merged convolution context. It must be created by function SimdSynetMergedConvolution16bInit and released by function SimdRelease.
    [in]src- a pointer to input image.
    [out]buf- a pointer to external temporary buffer. The size in bytes of the external temporary buffer is determined by function SimdSynetMergedConvolution16bExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output image.
    - +
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    @@ -170,7 +170,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    @@ -197,7 +197,7 @@

    Parameters
    - +
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    @@ -252,7 +252,7 @@

    Parameters
    - + @@ -304,7 +304,7 @@

    Parameters

    [in,out]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in,out]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in]weight- a pointer to the array with pointers to convolution weights. The array size is determined by number of merged convolutions.
    [out]internal- a pointer to the array of flags signaling that weights are stored in the internal buffer. The array size is determined by number of merged convolutions. Can be NULL.
    [in]bias- a pointer to the array with pointers to bias. The array size is determined by number of merged convolutions. Can be NULL.
    - + diff --git a/docs/help/group__synet__merged__convolution__int8.html b/docs/help/group__synet__merged__convolution__int8.html index bc02accf06..a1b0e9aefd 100644 --- a/docs/help/group__synet__merged__convolution__int8.html +++ b/docs/help/group__synet__merged__convolution__int8.html @@ -120,7 +120,7 @@

    Returns
    a pointer to INT8 merged convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetMergedConvolution8iExternalBufferSize, SimdSynetMergedConvolution8iInternalBufferSize, SimdSynetMergedConvolution8iInfo, SimdSynetMergedConvolution8iSetParams and SimdSynetMergedConvolution8iForward.
    +
    Returns
    a pointer to INT8 merged convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetMergedConvolution8iExternalBufferSize, SimdSynetMergedConvolution8iInternalBufferSize, SimdSynetMergedConvolution8iInfo, SimdSynetMergedConvolution8iSetParams and SimdSynetMergedConvolution8iForward.
    @@ -143,7 +143,7 @@

    Parameters

    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in]context- a pointer to FP32 merged convolution context. It must be created by function SimdSynetMergedConvolution32fInit and released by function SimdRelease.
    [in]src- a pointer to input image.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetMergedConvolution32fExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output image.
    - +
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    @@ -170,7 +170,7 @@

    Parameters
    - +
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    @@ -197,7 +197,7 @@

    Parameters
    - +
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    @@ -258,7 +258,7 @@

    Parameters
    - + @@ -311,7 +311,7 @@

    Parameters

    [in,out]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in,out]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in]weight- a pointer to the array with pointers to convolution weights. The array size is determined by number of merged convolutions.
    [out]internal- a pointer to the array of flags signaling that weights are stored in the internal buffer. The array size is determined by number of merged convolutions. Can be NULL.
    [in]bias- a pointer to the array with pointers to bias. The array size is determined by number of merged convolutions. Can be NULL.
    - + diff --git a/docs/help/group__synet__permute.html b/docs/help/group__synet__permute.html index e60ed152c8..3c931ecfa9 100644 --- a/docs/help/group__synet__permute.html +++ b/docs/help/group__synet__permute.html @@ -111,7 +111,7 @@

    Returns
    a pointer to permute context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetPermuteInternalBufferSize and SimdSynetPermuteForward.
    +
    Returns
    a pointer to permute context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetPermuteInternalBufferSize and SimdSynetPermuteForward.
    @@ -134,7 +134,7 @@

    Parameters

    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 merged convolution context. It must be created by function SimdSynetMergedConvolution8iInit and released by function SimdRelease.
    [in]src- a pointer to input image.
    [out]buf- a pointer to external temporary buffer. The size in bytes of the external temporary buffer is determined by function SimdSynetMergedConvolution8iExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output image.
    - +
    [in]context- a pointer to permute context. It must be created by function SimdSynetPermuteInit and released by function SimdRelease.
    [in]context- a pointer to permute context. It must be created by function SimdSynetPermuteInit and released by function SimdRelease.
    @@ -177,7 +177,7 @@

    Parameters
    - +
    [in]context- a pointer to permute context. It must be created by function SimdSynetPermuteInit and released by function SimdRelease.
    [in]context- a pointer to permute context. It must be created by function SimdSynetPermuteInit and released by function SimdRelease.
    [in]src- a pointer to input image.
    [out]dst- a pointer to output image.
    diff --git a/docs/help/group__synet__quantized__add.html b/docs/help/group__synet__quantized__add.html index d11a5370e2..bf9e58e033 100644 --- a/docs/help/group__synet__quantized__add.html +++ b/docs/help/group__synet__quantized__add.html @@ -185,7 +185,7 @@

    Returns
    a pointer to quantized addition context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in function SimdSynetQuantizedAddForward.
    +
    Returns
    a pointer to quantized addition context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in function SimdSynetQuantizedAddForward.
    @@ -230,7 +230,7 @@

    Parameters
    - + diff --git a/docs/help/group__synet__quantized__convolution.html b/docs/help/group__synet__quantized__convolution.html index 36237e90b6..a269e8fdaf 100644 --- a/docs/help/group__synet__quantized__convolution.html +++ b/docs/help/group__synet__quantized__convolution.html @@ -106,7 +106,7 @@

    Returns
    a pointer to Quantized convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetQuantizedConvolutionExternalBufferSize, SimdSynetQuantizedConvolutionInternalBufferSize, SimdSynetQuantizedConvolutionInfo, SimdSynetQuantizedConvolutionSetParams and SimdSynetQuantizedConvolutionForward.
    +
    Returns
    a pointer to Quantized convolution context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetQuantizedConvolutionExternalBufferSize, SimdSynetQuantizedConvolutionInternalBufferSize, SimdSynetQuantizedConvolutionInfo, SimdSynetQuantizedConvolutionSetParams and SimdSynetQuantizedConvolutionForward.
    @@ -129,7 +129,7 @@

    Parameters

    [in]context- a pointer to quantized addition context. It must be created by function SimdSynetQuantizedAddInit and released by function SimdRelease.
    [in]context- a pointer to quantized addition context. It must be created by function SimdSynetQuantizedAddInit and released by function SimdRelease.
    [in]a- a pointer to input A tensor.
    [in]b- a pointer to input B tensor.
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    @@ -156,7 +156,7 @@

    Parameters
    - +
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    @@ -183,7 +183,7 @@

    Parameters
    - +
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    @@ -250,7 +250,7 @@

    Parameters
    - + @@ -304,7 +304,7 @@

    Parameters

    [in,out]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in,out]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in]ioScale- a pointer to 32-bit floating-point input/output tensor scales.
    [in]ioZero- a pointer to 8-bit unsigned integer input/output tensor zeros.
    [in]weight- a pointer to 8-bit integer convolution weight.
    - + diff --git a/docs/help/group__synet__quantized__inner__product.html b/docs/help/group__synet__quantized__inner__product.html index 0288fd7331..df809ebae3 100644 --- a/docs/help/group__synet__quantized__inner__product.html +++ b/docs/help/group__synet__quantized__inner__product.html @@ -162,7 +162,7 @@

    Returns
    a pointer to quantized inner product context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetQuantizedInnerProductInternalBufferSize, SimdSynetQuantizedInnerProductExternalBufferSize, SimdSynetQuantizedInnerProductInfo, SimdSynetQuantizedInnerProductSetParams and SimdSynetQuantizedInnerProductForward.
    +
    Returns
    a pointer to quantized inner product context. On error it returns NULL. It must be released using function SimdRelease. This pointer is used in functions SimdSynetQuantizedInnerProductInternalBufferSize, SimdSynetQuantizedInnerProductExternalBufferSize, SimdSynetQuantizedInnerProductInfo, SimdSynetQuantizedInnerProductSetParams and SimdSynetQuantizedInnerProductForward.
    @@ -185,7 +185,7 @@

    Parameters

    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized convolution context. It must be created by function SimdSynetQuantizedConvolutionInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetQuantizedConvolutionExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - +
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    @@ -212,7 +212,7 @@

    Parameters
    - +
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    @@ -239,7 +239,7 @@

    Parameters
    - +
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    @@ -312,7 +312,7 @@

    Parameters
    - + @@ -373,7 +373,7 @@

    Parameters

    [in,out]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in,out]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in]aScale- a pointer to 32-bit float point input A tensor scale.
    [in]aZero- a pointer to 8-bit unsigned integer input A tensor zero.
    [in]b- a pointer to 8-bit integer input B tensor. Can be NULL.
    - + diff --git a/docs/help/group__synet__quantized__merged__convolution.html b/docs/help/group__synet__quantized__merged__convolution.html index 269bf31a86..2730802ca8 100644 --- a/docs/help/group__synet__quantized__merged__convolution.html +++ b/docs/help/group__synet__quantized__merged__convolution.html @@ -120,7 +120,7 @@

    Returns
    a pointer to Quantized merged convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetQuantizedMergedConvolutionExternalBufferSize, SimdSynetQuantizedMergedConvolutionInternalBufferSize, SimdSynetQuantizedMergedConvolutionInfo, SimdSynetQuantizedMergedConvolutionSetParams and SimdSynetQuantizedMergedConvolutionForward.
    +
    Returns
    a pointer to Quantized merged convolution context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetQuantizedMergedConvolutionExternalBufferSize, SimdSynetQuantizedMergedConvolutionInternalBufferSize, SimdSynetQuantizedMergedConvolutionInfo, SimdSynetQuantizedMergedConvolutionSetParams and SimdSynetQuantizedMergedConvolutionForward.
    @@ -143,7 +143,7 @@

    Parameters

    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in]context- a pointer to quantized inner product context. It must be created by function SimdSynetQuantizedInnerProductInit and released by function SimdRelease.
    [in]A- a pointer to A matrix.
    [in]B- a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B in function SimdSynetQuantizedInnerProductSetParams.
    [out]buf- a pointer to external buffer. The size of the external temporary buffer is determined by function SimdSynetQuantizedInnerProductExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    - +
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    @@ -170,7 +170,7 @@

    Parameters
    - +
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    @@ -197,7 +197,7 @@

    Parameters
    - +
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    @@ -258,7 +258,7 @@

    Parameters
    - + @@ -311,7 +311,7 @@

    Parameters

    [in,out]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in,out]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in]ioScale- a pointer to 32-bit float point input/output tensors scales.
    [in]ioZero- a pointer to 8-bit unsigned integer input/output tensors zeros.
    [in]weight- a pointer to 8-bit integer convolution weights.
    - + diff --git a/docs/help/group__synet__scale.html b/docs/help/group__synet__scale.html index 4ebd36a39f..d8632881f6 100644 --- a/docs/help/group__synet__scale.html +++ b/docs/help/group__synet__scale.html @@ -144,7 +144,7 @@

    Returns
    a pointer to scale context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in function SimdSynetScale16bForward.
    +
    Returns
    a pointer to scale context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in function SimdSynetScale16bForward.
    @@ -195,7 +195,7 @@

    Parameters

    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in]context- a pointer to Quantized merged convolution context. It must be created by function SimdSynetQuantizedMergedConvolutionInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]buf- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function SimdSynetQuantizedMergedConvolutionExternalBufferSize. Can be NULL (it causes usage of internal buffer).
    [out]dst- a pointer to output tensor.
    - + @@ -366,7 +366,7 @@

    Returns
    a pointer to INT8 scale context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetScale8iInternalBufferSize, SimdSynetScale8iSetParams and SimdSynetScale8iForward.
    +
    Returns
    a pointer to INT8 scale context. On error it returns NULL. It must be released with using of function SimdRelease. This pointer is used in functions SimdSynetScale8iInternalBufferSize, SimdSynetScale8iSetParams and SimdSynetScale8iForward.
    @@ -389,7 +389,7 @@

    Parameters

    [in]context- a pointer to scale context. It must be created by function SimdSynetScale16bInit and released by function SimdRelease.
    [in]context- a pointer to scale context. It must be created by function SimdSynetScale16bInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [in]norm- a pointer to FP32 array with scale coefficients. Can be NULL.
    [in]bias- a pointer to FP32 array with shift coefficients. Can be NULL.
    - +
    [in]context- a pointer to INT8 scale context. It must be created by function SimdSynetScale8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 scale context. It must be created by function SimdSynetScale8iInit and released by function SimdRelease.
    @@ -438,7 +438,7 @@

    Parameters
    - + @@ -483,7 +483,7 @@

    Parameters

    [in,out]context- a pointer to INT8 convolution context. It must be created by function SimdSynetScale8iInit and released by function SimdRelease.
    [in,out]context- a pointer to INT8 convolution context. It must be created by function SimdSynetScale8iInit and released by function SimdRelease.
    [in]scale- a pointer to original (32-bit float point) scale.
    [in]bias- a pointer to original (32-bit float point) bias. Can be NULL.
    [in]stats- a pointer to pointers with statistics of input(min - stats[0], max - stats[1]) and output(min - stats[2], max - stats[3]) tensors. Can be NULL for subsequent calls of this function.
    - +
    [in]context- a pointer to INT8 scale context. It must be created by function SimdSynetScale8iInit and released by function SimdRelease.
    [in]context- a pointer to INT8 scale context. It must be created by function SimdSynetScale8iInit and released by function SimdRelease.
    [in]src- a pointer to input tensor.
    [out]dst- a pointer to output tensor.
    diff --git a/docs/help/group__warp__affine.html b/docs/help/group__warp__affine.html index bc08e16fc7..cd3a67230b 100644 --- a/docs/help/group__warp__affine.html +++ b/docs/help/group__warp__affine.html @@ -228,7 +228,7 @@

    Returns
    a pointer to warp affine context. On error it returns NULL. This pointer is used in functions SimdWarpAffineRun. It must be released with using of function SimdRelease.
    +
    Returns
    a pointer to warp affine context. On error it returns NULL. This pointer is used in functions SimdWarpAffineRun. It must be released with using of function SimdRelease.
    @@ -268,7 +268,7 @@

    Note
    This function has a C++ wrapper Simd::WarpAffine(const View& src, const float * mat, View& dst, SimdWarpAffineFlags flags = SimdWarpAffineInterpBilinear | SimdWarpAffineBorderConstant, const uint8_t* border = NULL).
    Parameters
    - +
    [in]context- a warp affine context. It must be created by function SimdWarpAffineInit and released by function SimdRelease.
    [in]context- a warp affine context. It must be created by function SimdWarpAffineInit and released by function SimdRelease.
    [in]src- a pointer to pixels data of the original input image.
    [out]dst- a pointer to pixels data of the filtered output image.
    diff --git a/docs/help/index.html b/docs/help/index.html index bf3ea684f4..5d9f2d4a19 100644 --- a/docs/help/index.html +++ b/docs/help/index.html @@ -45,7 +45,7 @@

    Simd Library Documentation.

    Introduction

    The Simd Library is a free open source image processing library and machine learning, designed for C and C++ programmers. It provides many useful high performance algorithms for image processing and machine learning such as: pixel format conversion, image scaling and filtration, extraction of statistic information from images, motion detection, object detection and classification, neural network.

    -

    The algorithms are optimized with using of different SIMD CPU extensions. In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM.

    +

    The algorithms are optimized with using of different SIMD CPU extensions. In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon.

    The Simd Library has C API and also contains useful C++ classes and functions to facilitate access to C API. The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, MSVS, G++ and Clang compilers, MSVS project and CMake build systems.

    Library folder's structure

    @@ -94,6 +94,7 @@

  • SIMD_AVX512 - Enable of AVX-512 (AVX-512F, AVX-512CD, AVX-512VL, AVX-512DQ, AVX-512BW) CPU extensions. It is switched on by default.
  • SIMD_AVX512VNNI - Enable of AVX-512-VNNI CPU extensions. It is switched on by default.
  • SIMD_AMXBF16 - Enable of AMX-BF16, AMX-INT8 and AVX-512-BF16 CPU extensions. It is switched off by default.
  • +
  • SIMD_SVE - Enable of SVE CPU extension. It is switched off by default.
  • SIMD_TEST - Build test framework. It is switched on by default.
  • SIMD_INFO - Print build information. It is switched on by default.
  • SIMD_PERF - Enable of internal performance statistic. It is switched off by default.
  • diff --git a/docs/help/struct_simd_1_1_view.html b/docs/help/struct_simd_1_1_view.html index 2ed1b35bde..2762b3820a 100644 --- a/docs/help/struct_simd_1_1_view.html +++ b/docs/help/struct_simd_1_1_view.html @@ -1827,7 +1827,7 @@

    Returns
    - a released pointer to pixel data. It must be deleted by function SimdFree.

    +
    Returns
    - a released pointer to pixel data. It must be deleted by function SimdFree.
    diff --git a/docs/index.html b/docs/index.html index c28c5f7ec9..0104f85d91 100644 --- a/docs/index.html +++ b/docs/index.html @@ -26,7 +26,7 @@

    Description

    pixel format conversion, image scaling and filtration, extraction of statistic information from images, motion detection, object detection and classification, neural network.

    The algorithms are optimized with using of different SIMD CPU extensions. -In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM, HVX for Hexagon.

    +In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon.

    The Simd Library has C API and also contains useful C++ and Python wrapper classes and functions to facilitate access to C API. The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, MSVS, G++ and Clang compilers, MSVS project and CMake build systems.

    diff --git a/prj/txt/DoxygenOverview.txt b/prj/txt/DoxygenOverview.txt index 9a2c44c3ec..d7330a507f 100644 --- a/prj/txt/DoxygenOverview.txt +++ b/prj/txt/DoxygenOverview.txt @@ -8,7 +8,7 @@ extraction of statistic information from images, motion detection, object detection and classification, neural network. The algorithms are optimized with using of different SIMD CPU extensions. - In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM. + In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon. The %Simd Library has C API and also contains useful C++ classes and functions to facilitate access to C API. The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, MSVS, G++ and Clang compilers, MSVS project and CMake build systems. @@ -82,6 +82,7 @@ - `SIMD_AVX512` - Enable of AVX-512 (AVX-512F, AVX-512CD, AVX-512VL, AVX-512DQ, AVX-512BW) CPU extensions. It is switched on by default. - `SIMD_AVX512VNNI` - Enable of AVX-512-VNNI CPU extensions. It is switched on by default. - `SIMD_AMXBF16` - Enable of AMX-BF16, AMX-INT8 and AVX-512-BF16 CPU extensions. It is switched off by default. + - `SIMD_SVE` - Enable of SVE CPU extension. It is switched off by default. - `SIMD_TEST` - Build test framework. It is switched on by default. - `SIMD_INFO` - Print build information. It is switched on by default. - `SIMD_PERF` - Enable of internal performance statistic. It is switched off by default. 
diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h index b889e52df0..1357a6eb55 100644 --- a/src/Simd/SimdLib.h +++ b/src/Simd/SimdLib.h @@ -1002,11 +1002,12 @@ extern "C" The value is determined once at library initialization time by probing the active SIMD extensions and is constant for the lifetime of the process: + - \b 128 bytes — HVX (Qualcomm Hexagon) - \b 64 bytes — AVX-512 (x86, when either AVX-512BW or AVX-512VNNI is available) - \b 32 bytes — AVX2 (x86) - \b 16 bytes — SSE4.1 (x86) or NEON (ARM) - - sizeof(HVX_Vector) — HVX (Qualcomm Hexagon) - sizeof(void*) — scalar fallback (no SIMD extensions detected) + - \b SVE vector size for current CPU in bytes — when SVE is available. The returned value is always a power of two and equals the value of the \c SIMD_ALIGN compile-time constant used internally by the library. From 55792300be79e0ea5ded36fd616f34de1d6343f9 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 5 May 2026 17:38:33 +0300 Subject: [PATCH 16/32] +add SVE optimizations of function BackgroundGrowRangeFast. --- docs/2026.html | 1 + src/Simd/SimdLib.cpp | 5 +++++ src/Simd/SimdLib.h | 12 ++++++++++++ src/Simd/SimdSve1.h | 3 +++ src/Simd/SimdSve1Background.cpp | 31 +++++++++++++++++++++++++++++++ src/Test/TestBackground.cpp | 6 ++++++ 6 files changed, 58 insertions(+) diff --git a/docs/2026.html b/docs/2026.html index 76f760826e..6e4767bbd5 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -46,6 +46,7 @@
    New features
  • SVE optimizations of function AbsDifferenceSums3x3.
  • SVE optimizations of function AbsDifferenceSums3x3Masked.
  • SVE optimizations of function BackgroundGrowRangeSlow.
  • +
  • SVE optimizations of function BackgroundGrowRangeFast.
  • Bug fixing
    Bug fixing
      diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h index 1a99d24773..1fe2b46321 100644 --- a/src/Simd/SimdSynetConvolution16b.h +++ b/src/Simd/SimdSynetConvolution16b.h @@ -377,6 +377,51 @@ namespace Simd //------------------------------------------------------------------------------------------------- + class SynetConvolution16bNhwcSpecV3 : public SynetConvolution16b + { + public: + SynetConvolution16bNhwcSpecV3(const ConvParam& p); + virtual String Ext() const { return "Base"; } + virtual String Desc() const; + virtual size_t ExternalBufferSize() const; + virtual void SetParams(const float* weight, const float* bias, const float* params); + virtual void Forward(const uint8_t* src, uint8_t* buf, uint8_t* dst); + + static bool Preferable(const ConvParam& p); + + struct AlgParam + { + size_t F, microD, microS, microC; + size_t batch, srcC, srcH, srcW, dstC, K; + size_t padV, padH, padE, gapV, gapH, kA; + size_t macroD, macroH, macroC; + size_t bufS, bufD, elem; + }; + + typedef void(*PreprocessPtr)(const uint8_t* src, const ConvParam& p, const AlgParam& a, size_t dyBeg, size_t dyEnd, int end, uint16_t* dst); + + typedef void(*BodyConvPtr)(const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* srcOffs, + size_t dstC, size_t dstS, size_t nK, int zero, const uint16_t* weight, float* sum); + + typedef void(*LastConvPtr)(const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* srcOffs, size_t dstC, size_t dstS, size_t nK, int zero, + const uint16_t* weight, float* sum, const float* bias, const float* params, const int* dstMask, const int* dstOffs, uint8_t* dst); + + protected: + void SetAlgParam(size_t F, size_t microD, size_t microS, size_t microC, size_t L1, size_t L2, size_t L3); + virtual void SetWeight(const float* weight); + + void ForwardSingle(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst); + void ForwardBatch(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst); + + 
AlgParam _alg; + Array32i _srcOffs, _dstMask, _nK, _maBufOffs, _maSumOffs, _miDstOffs; + PreprocessPtr _preprocess; + BodyConvPtr _bodyConv; + LastConvPtr _lastConv; + }; + + //------------------------------------------------------------------------------------------------- + class SynetConvolution16bNhwcDepthwise : public SynetConvolution16b { public: diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp index 5fcafb58e1..0cd899620b 100644 --- a/src/Test/TestSynetConvolution16b.cpp +++ b/src/Test/TestSynetConvolution16b.cpp @@ -364,21 +364,6 @@ namespace Test result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 999, 6, 6, 999, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 125, 116, 116, 125, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); #endif -#if 0 - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 224, _3, _1, _1, 
_1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); -#endif #if 0 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 321, 321, 16, _2, _1, _1, _0, _0, 1, aId, tT, b16, f32), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 16, 320, 320, 32, _2, _1, _1, _0, _1, 1, aId, tT, b16, f32), c, f1, f2); @@ -564,6 +549,21 @@ namespace Test result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 96, 96, 96, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2); #endif #if 1 + //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + //result = result && 
SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); + //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); +#endif +#if 0 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _1, _1, _2, _0, _0, 1, aId, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _1, _1, _2, _0, _0, 1, aId, tT, b16, b16), c, f1, f2); From 8ea37f38a01888d0933c8847c4916ce3e1c053f8 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Wed, 20 May 2026 17:22:13 +0300 Subject: [PATCH 20/32] +add Base implementation of class SynetConvolution16bNhwcSpecV3. 
--- docs/2026.html | 1 + prj/vs2022/Base.vcxproj | 1 + prj/vs2022/Base.vcxproj.filters | 3 + .../SimdBaseSynetConvolution16bNhwcSpecV3.cpp | 314 ++++++++++++++++++ src/Simd/SimdSynetConvolution16b.h | 2 +- 5 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp diff --git a/docs/2026.html b/docs/2026.html index ce089c4482..fb1b130286 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -49,6 +49,7 @@
      New features
    • SVE optimizations of function BackgroundGrowRangeFast.
    • Method View::Copy.
    • Method Frame::Copy.
    • +
    • Base implementation of class SynetConvolution16bNhwcSpecV3.
    Bug fixing
      diff --git a/prj/vs2022/Base.vcxproj b/prj/vs2022/Base.vcxproj index 5cc450ff14..4ef731618e 100644 --- a/prj/vs2022/Base.vcxproj +++ b/prj/vs2022/Base.vcxproj @@ -226,6 +226,7 @@ + diff --git a/prj/vs2022/Base.vcxproj.filters b/prj/vs2022/Base.vcxproj.filters index 4da71c92cf..1bf70407f7 100644 --- a/prj/vs2022/Base.vcxproj.filters +++ b/prj/vs2022/Base.vcxproj.filters @@ -472,6 +472,9 @@ Base\Synet\Scale + + Base\Synet\Convolution + diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp new file mode 100644 index 0000000000..79c1cf8c2d --- /dev/null +++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp @@ -0,0 +1,314 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2026 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdSynetConvolution16b.h" +#include "Simd/SimdSynetConvolution32f.h" +#include "Simd/SimdSynetConvolution32fCommon.h" +#include "Simd/SimdSynet.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdBFloat16.h" +#include "Simd/SimdAlignment.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ +#if defined(SIMD_SYNET_ENABLE) + namespace Base + { + SynetConvolution16bNhwcSpecV3::SynetConvolution16bNhwcSpecV3(const ConvParam& p) + : SynetConvolution16b(p) + { + _preprocess = 0; + _bodyConv = 0; + _lastConv = 0; + } + + String SynetConvolution16bNhwcSpecV3::Desc() const + { + std::stringstream desc; + desc << Ext() << "::NhwcSpecV3"; + if (_alg.batch > 1) + desc << "-" << _alg.batch; + return desc.str(); + } + + void SynetConvolution16bNhwcSpecV3::SetAlgParam() + { + const ConvParam& p = _param; + AlgParam& a = _alg; + + int L1 = int(Base::AlgCacheL1() * (p.IsKernel(5) ? 1.05 : 1.00)), L2 = int(Base::AlgCacheL2() * 0.5), L3 = int(Base::AlgCacheL3()); + + a.F = 16; + a.microD = 32; + a.microS = 32; + a.microC = 32; + a.srcC = AlignHi(p.srcC, a.microC); + a.padV = Simd::Max(p.padY, p.padH); + a.padH = Simd::Max(p.padX, p.padW); + a.srcH = p.srcH + a.padV; + a.srcW = p.srcW + a.padH; + a.gapV = a.srcH - p.dstH; + a.gapH = a.srcW - p.dstW; + a.dstC = AlignHi(p.dstC, a.microD); + a.kA = p.kernelX * p.kernelY; + a.K = a.srcC * a.kA; + a.padE = a.srcW * a.padV + a.padH * Simd::Max(1, a.padV); + + a.macroC = Simd::RestrictRange(AlignLo(L1 / a.microD / a.kA / 2, a.microC), a.microC, a.srcC); + a.batch = 1; + size_t bufSize = a.srcC * a.srcH * a.srcW * 2; + if (bufSize * 2 <= L2 && p.batch > 1) + { + for (size_t batch = 1; batch <= p.batch; ++batch) + if (p.batch % batch == 0 && batch * bufSize <= L2) + a.batch = batch; + } + a.macroH = Simd::RestrictRange(L2 / a.macroC / a.srcW / 2, size_t(1), p.dstH * a.batch); + a.macroD = Simd::RestrictRange(AlignLoAny(L3 / a.macroC / a.kA / 2, a.microD), a.microD, AlignHiAny(p.dstC, a.microD)); + a.macroD = 
Simd::Min(a.macroD, a.microD * 4); + + a.bufD = AlignHi(a.batch * a.srcH * a.srcW, a.microS) * a.macroD; + + a.elem = _elemD; + a.bufS = (a.batch * a.srcH * a.srcW + a.padE + a.microS) * a.srcC; + + _stepS = p.srcH * p.srcW * p.srcC * a.batch * _elemS; + _stepD = p.dstH * p.dstW * p.dstC * a.batch * _elemD; + + int dX = (int)a.microC, dY = (int)a.srcW * dX, dC = int(a.batch * a.srcH * a.srcW + a.padE) * dX; + _srcOffs.Resize(DivHi(a.K, a.microC)); + for (size_t c = 0, offsS = 0, i = 0; c < a.srcC; c += dX, offsS += dC) + for (size_t y = 0, offsY = offsS; y < p.kernelY; y += 1, offsY += dY) + for (size_t offsX = offsY, endX = offsY + p.kernelX * dX; offsX < endX; offsX += dX, i++) + _srcOffs[i] = (int)offsX; + + _dstMask.Resize(AlignHi((a.srcH * a.batch - a.gapV) * a.srcW - a.padH, a.microS)); + size_t i = 0; + for (size_t b = 0; b < a.batch; b++) + { + for (size_t y = 0; y < p.dstH; y++) + { + for (size_t x = 0; x < p.dstW; x++, i++) + _dstMask[i] = -1; + for (size_t x = 0; x < a.gapH; x++, i++) + _dstMask[i] = 0; + } + for (size_t y = 0, gapI = a.gapV * a.srcW; y < gapI && i < _dstMask.size; y++, i++) + _dstMask[i] = 0; + } + for (; i < _dstMask.size; i++) + _dstMask[i] = 0; + + _nK.Resize(DivHi(a.srcC, a.macroC)); + for (size_t o = 0, c = 0; o < _nK.size; o++, c += a.macroC) + { + size_t macroC = Simd::Min(a.srcC, c + a.macroC) - c; + _nK[o] = int(DivHi(macroC, a.microC) * a.kA); + } + if (_nK.size > 1 && _nK[_nK.size - 1] < _nK[_nK.size - 2]) + Simd::Swap(_nK[_nK.size - 1], _nK[_nK.size - 2]); + + size_t n = DivHi(a.batch * p.dstH, a.macroH); + _maBufOffs.Resize(n); + _maSumOffs.Resize(n + 1); + _miDstOffs.Resize(DivHi(_dstMask.size, a.microS)); + for (size_t i = 0; i <= n; ++i) + { + if (i == n) + _maSumOffs[i] = int((a.srcH * a.batch - a.gapV) * a.srcW - a.padH); + else + { + size_t dy = i * a.macroH; + size_t sumOffs = Simd::Max(dy * a.srcW - a.gapH, 0); + _maSumOffs[i] = int(AlignLo(sumOffs, a.microS)); + _maBufOffs[i] = _maSumOffs[i]; + } + } + 
_miDstOffs[0] = 0; + for (size_t i = 1; i < _miDstOffs.size; ++i) + { + _miDstOffs[i] = _miDstOffs[i - 1]; + for (size_t j = (i - 1) * a.microS, m = i * a.microS; j < m; ++j) + if (_dstMask[j]) + _miDstOffs[i]++; + } + } + + size_t SynetConvolution16bNhwcSpecV3::ExternalBufferSize() const + { + const AlgParam& a = _alg; + size_t size = 0; + size += a.bufS * sizeof(uint16_t); + size += a.bufD * sizeof(float); + return size; + } + + void SynetConvolution16bNhwcSpecV3::SetParams(const float* weight, const float* bias, const float* params) + { + SetWeight(weight); + SynetConvolution16b::SetBias(bias, _alg.microD); + SynetConvolution16b::SetParams(params, _alg.microD); + } + + void SynetConvolution16bNhwcSpecV3::SetWeight(const float* weight) + { + const ConvParam& p = _param; + const AlgParam& a = _alg; + _weight.Resize(a.K * a.dstC, true); + uint16_t* dst = _weight.data; + const size_t microC = a.microC, F = a.F; + for (size_t mad = 0; mad < p.dstC; mad += F) + { + for (size_t mac = 0; mac < p.srcC; mac += microC) + { + for (size_t k = 0; k < a.kA; k++) + { + for (size_t c = 0; c < microC; c += 2) + { + const float* src = weight + (k * p.srcC + mac + c) * p.dstC + mad; + for (size_t d = 0; d < F; ++d) + { + for (size_t i = 0; i < 2; ++i) + { + if (mad + d < p.dstC && mac + c + i < p.srcC) + *(dst++) = Float32ToBFloat16(src[i * p.dstC]); + else + *(dst++) = 0; + } + src++; + } + } + } + } + } + } + + void SynetConvolution16bNhwcSpecV3::Forward(const uint8_t* src, uint8_t* buf8, uint8_t* dst) + { + const ConvParam& p = _param; + const AlgParam& a = _alg; + buf8 = Buffer(buf8); + uint16_t* bufS = a.bufS ? Allocate(buf8, a.bufS) : NULL; + float* bufD = a.bufD ? Allocate(buf8, a.bufD) : NULL; + for (size_t b = 0; b < p.batch; b += a.batch) + { + uint16_t* buf = bufS ? bufS : (uint16_t*)src; + float* sum = bufD ? 
bufD : (float*)dst; + if(a.batch == 1) + ForwardSingle(src, buf, sum, dst); + else + ForwardBatch(src, buf, sum, dst); + src += _stepS; + dst += _stepD; + } + } + + void SynetConvolution16bNhwcSpecV3::ForwardSingle(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst) + { + const ConvParam& p = _param; + const AlgParam& a = _alg; + const float* bias = _bias.data, * params = _params.data; + size_t dS = a.microC, dB = a.macroD, dD = p.dstC * _elemD; + size_t bufOffs = ((a.padV - p.padY) * a.srcW + (a.padH - p.padX)) * dS; + for (size_t mad = 0; mad < p.dstC; mad += a.macroD) + { + size_t macroD = Simd::Min(p.dstC, mad + a.macroD) - mad; + const uint16_t* weight = _weight.data + mad * a.K; + const int* srcOffs = _srcOffs.data; + for (size_t nk = 0; nk < _nK.size; ++nk) + { + int zero = nk == 0 ? 1 : 0; + size_t nK = _nK[nk]; + for (size_t dyBeg = 0, dyN = 0; dyBeg < p.dstH; dyN++) + { + size_t dyEnd = Simd::Min(dyBeg + a.macroH, p.dstH); + size_t dstS = _maSumOffs[dyN + 1] - _maSumOffs[dyN]; + size_t miIdx = _maSumOffs[dyN] / a.microS; + if (mad == 0 && zero) + _preprocess(src, p, a, dyBeg, dyEnd, dyEnd == p.dstH ? 
1 : 0, buf); + if (nk == _nK.size - 1) + _lastConv(buf + bufOffs + _maBufOffs[dyN] * dS, p, a, srcOffs, macroD, dstS, nK, zero, weight, + sum + _maSumOffs[dyN] * dB, bias, params, _dstMask.data + _maSumOffs[dyN], _miDstOffs.data + miIdx, dst + _miDstOffs[miIdx] * dD); + else + _bodyConv(buf + bufOffs + _maBufOffs[dyN] * dS, p, a, srcOffs, macroD, dstS, nK, zero, weight, sum + _maSumOffs[dyN] * dB); + dyBeg = dyEnd; + } + srcOffs += nK; + weight += nK * a.microC * a.F; + } + bias += macroD; + if (p.activation == ::SimdConvolutionActivationPrelu) + params += macroD; + dst += macroD * _elemD; + } + } + + void SynetConvolution16bNhwcSpecV3::ForwardBatch(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst) + { + const ConvParam& p = _param; + const AlgParam& a = _alg; + const float* bias = _bias.data, * params = _params.data; + const int* mask = _dstMask.data; + size_t dstH = p.dstH * a.batch, dstS = _maSumOffs[1] - _maSumOffs[0]; + size_t bufOffs = ((a.padV - p.padY) * a.srcW + (a.padH - p.padX)) * a.microC; + for (size_t mad = 0; mad < p.dstC; mad += a.macroD) + { + size_t macroD = Simd::Min(p.dstC, mad + a.macroD) - mad; + const uint16_t* weight = _weight.data + mad * a.K; + const int* srcOffs = _srcOffs.data; + for (size_t nk = 0; nk < _nK.size; ++nk) + { + int zero = nk == 0 ? 1 : 0; + size_t nK = _nK[nk]; + if (mad == 0 && zero) + { + size_t dS = p.srcH * p.srcW * p.srcC * _elemS; + size_t dB = a.srcH * a.srcW * a.microC; + for (size_t b = 0; b < a.batch; ++b) + _preprocess(src + b * dS, p, a, 0, p.dstH, b == a.batch - 1 ? 
1 : 0, buf + b * dB); + } + if (nk == _nK.size - 1) + _lastConv(buf + bufOffs, p, a, srcOffs, macroD, dstS, nK, zero, weight, sum, bias, params, mask, _miDstOffs.data, dst); + else + _bodyConv(buf + bufOffs, p, a, srcOffs, macroD, dstS, nK, zero, weight, sum); + srcOffs += nK; + weight += nK * a.microC * a.F; + } + bias += macroD; + if (p.activation == ::SimdConvolutionActivationPrelu) + params += macroD; + dst += macroD * _elemD; + } + } + + bool SynetConvolution16bNhwcSpecV3::Preferable(const ConvParam& p) + { + const size_t M = p.dstH * p.dstW; + static int choise = 0; + return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4 + && p.srcC >= 9 && p.srcC <= 128 && M >= 16;// && (choise++) & 0; + } + } +#endif +} diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h index 1fe2b46321..e481b76dd0 100644 --- a/src/Simd/SimdSynetConvolution16b.h +++ b/src/Simd/SimdSynetConvolution16b.h @@ -407,7 +407,7 @@ namespace Simd const uint16_t* weight, float* sum, const float* bias, const float* params, const int* dstMask, const int* dstOffs, uint8_t* dst); protected: - void SetAlgParam(size_t F, size_t microD, size_t microS, size_t microC, size_t L1, size_t L2, size_t L3); + void SetAlgParam(); virtual void SetWeight(const float* weight); void ForwardSingle(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst); From b0b21219bafb2d2c683e1d5763cedf66584e75c4 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Thu, 21 May 2026 12:02:20 +0300 Subject: [PATCH 21/32] +add AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3. 
--- docs/2026.html | 2 +- prj/vs2022/AmxBf16.vcxproj | 1 + prj/vs2022/AmxBf16.vcxproj.filters | 3 + src/Simd/SimdAmxBf16SynetConvolution16b.cpp | 2 + ...mdAmxBf16SynetConvolution16bNhwcSpecV3.cpp | 695 ++++++++++++++++++ .../SimdBaseSynetConvolution16bNhwcSpecV3.cpp | 4 +- src/Simd/SimdSynetConvolution16b.h | 8 + 7 files changed, 712 insertions(+), 3 deletions(-) create mode 100644 src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp diff --git a/docs/2026.html b/docs/2026.html index fb1b130286..cf0499a982 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -49,7 +49,7 @@
      New features
    • SVE optimizations of function BackgroundGrowRangeFast.
    • Method View::Copy.
    • Method Frame::Copy.
    • -
    • Base implementation of class SynetConvolution16bNhwcSpecV3.
    • +
    • Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.
    Bug fixing
      diff --git a/prj/vs2022/AmxBf16.vcxproj b/prj/vs2022/AmxBf16.vcxproj index 11bea96861..fadbbffa28 100644 --- a/prj/vs2022/AmxBf16.vcxproj +++ b/prj/vs2022/AmxBf16.vcxproj @@ -88,6 +88,7 @@ + diff --git a/prj/vs2022/AmxBf16.vcxproj.filters b/prj/vs2022/AmxBf16.vcxproj.filters index cd85d24155..31ef806543 100644 --- a/prj/vs2022/AmxBf16.vcxproj.filters +++ b/prj/vs2022/AmxBf16.vcxproj.filters @@ -353,5 +353,8 @@ AmxBf16\Synet\Convolution + + AmxBf16\Synet\Convolution + \ No newline at end of file diff --git a/src/Simd/SimdAmxBf16SynetConvolution16b.cpp b/src/Simd/SimdAmxBf16SynetConvolution16b.cpp index 85589ae25c..a080a9c765 100644 --- a/src/Simd/SimdAmxBf16SynetConvolution16b.cpp +++ b/src/Simd/SimdAmxBf16SynetConvolution16b.cpp @@ -33,6 +33,8 @@ namespace Simd ConvParam param(batch, conv, compatibility); if (!param.Valid(SimdTensorData32f, SimdTensorData16b)) return NULL; + if (SynetConvolution16bNhwcSpecV3::Preferable(param)) + return new AmxBf16::SynetConvolution16bNhwcSpecV3(param); if (SynetConvolution16bNhwcSpecV2::Preferable(param)) return new AmxBf16::SynetConvolution16bNhwcSpecV2(param); //if (SynetConvolution16bNhwcSpecV1::Preferable(param)) diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp new file mode 100644 index 0000000000..98137ca067 --- /dev/null +++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp @@ -0,0 +1,695 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2026 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdSynetConvolution16b.h" +#include "Simd/SimdSynetConvolution16bCommon.h" +#include "Simd/SimdBFloat16.h" +#include "Simd/SimdSynet.h" +#include "Simd/SimdAmxBf16.h" +#include "Simd/SimdSet.h" +#include "Simd/SimdCopy.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdTile.h" + +namespace Simd +{ +#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE))) + namespace AmxBf16 + { + typedef Base::SynetConvolution16bNhwcSpecV3::AlgParam AlgParam; + typedef Base::SynetConvolution16bNhwcSpecV3::LastConvPtr LastConvPtr; + + //------------------------------------------------------------------------------------------------- + + static void Convert16bNhwcSpecV3(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t dyBeg, size_t dyEnd, int end, uint16_t* dst) + { + assert(a.microC == DF); + const float* src = (float*)src8; + size_t srcCDF = Simd::AlignLo(p.srcC, DF); + __mmask32 tailC = TailMask32(p.srcC - srcCDF); + size_t syPad = p.kernelY - 1 - p.padY, syBeg, syEnd = (dyEnd == p.dstH ? 
p.srcH : dyEnd + syPad); + size_t cD = a.batch * a.srcH * a.srcW + a.padE, sD = a.microC; + if (dyBeg == 0) + { + for (size_t s = 0, n = a.padV * a.srcW; s < n; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + dst += a.padV * a.srcW * sD; + syBeg = 0; + } + else + { + syBeg = dyBeg + syPad; + src += syBeg * p.srcW * p.srcC; + dst += (dyBeg + p.kernelY - 1 + a.padV - p.padY) * a.srcW * sD; + } + for (size_t sy = syBeg; sy < syEnd; ++sy) + { + if (a.padH) + { + for (size_t s = 0; s < a.padH; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + dst += a.padH * sD; + } + for (size_t sx = 0; sx < p.srcW; ++sx) + { + size_t sc = 0; + for (; sc < srcCDF; sc += DF) + AmxBf16::Float32ToBFloat16(src + sc, dst + sc * cD); + if (tailC) + AmxBf16::Float32ToBFloat16(src + sc, dst + sc * cD, tailC); + src += p.srcC; + dst += sD; + } + } + if (end) + { + for (size_t s = 0, n = a.padE; s < n; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + } + else if (dyEnd != p.dstH) + { + for (size_t s = 0, n = a.padH; s < n; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + } + } + + static void Reorder16bNhwcSpecV3(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t dyBeg, size_t dyEnd, int end, uint16_t* dst) + { + assert(a.microC == DF); + const uint16_t* src = (uint16_t*)src8; + size_t srcCDF = Simd::AlignLo(p.srcC, DF); + __mmask32 tailC = TailMask32(p.srcC - srcCDF); + size_t syPad = p.kernelY - 1 - p.padY, syBeg, syEnd = (dyEnd == p.dstH ? 
p.srcH : dyEnd + syPad); + size_t cD = a.batch * a.srcH * a.srcW + a.padE, sD = a.microC; + if (dyBeg == 0) + { + for (size_t s = 0, n = a.padV * a.srcW; s < n; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + dst += a.padV * a.srcW * sD; + syBeg = 0; + } + else + { + syBeg = dyBeg + syPad; + src += syBeg * p.srcW * p.srcC; + dst += (dyBeg + p.kernelY - 1 + a.padV - p.padY) * a.srcW * sD; + } + for (size_t sy = syBeg; sy < syEnd; ++sy) + { + if (a.padH) + { + for (size_t s = 0; s < a.padH; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + dst += a.padH * sD; + } + for (size_t sx = 0; sx < p.srcW; ++sx) + { + size_t sc = 0; + for (; sc < srcCDF; sc += DF) + Avx512bw::Copy(src + sc, dst + sc * cD); + if (tailC) + Avx512bw::Copy(src + sc, dst + sc * cD, tailC); + src += p.srcC; + dst += sD; + } + } + if (end) + { + for (size_t s = 0, n = a.padE; s < n; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + } + else if (dyEnd != p.dstH) + { + for (size_t s = 0, n = a.padH; s < n; ++s) + for (size_t c = 0; c < a.srcC; c += a.microC) + Avx512bw::SetZero(dst + c * cD + s * sD); + } + } + + //------------------------------------------------------------------------------------------------- + + SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) + { + int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; + const uint16_t* weight1 = weight0 + a.K * F; + const uint16_t* src1 = src0 + 16 * dS; + float* buf1 = buf0 + 16 * dB; + + if (zero) + { + _tile_zero(0); + _tile_zero(1); + _tile_zero(2); + _tile_zero(3); + } + else + { + _tile_stream_loadd(0, buf0 + 0, strideB); + _tile_stream_loadd(1, buf0 + F, strideB); + _tile_stream_loadd(2, buf1 + 0, strideB); + 
_tile_stream_loadd(3, buf1 + F, strideB); + } + + int n1 = (int)nK - 1, o = offs[0]; + _tile_stream_loadd(4, src0 + o, strideS); + _tile_loadd(6, weight0, strideW); + for (int i = 0; i < n1; ++i, weight1 += dW) + { + _tile_stream_loadd(5, src1 + o, strideS); + _tile_loadd(7, weight1, strideW); + _tile_dpbf16ps(0, 4, 6); + _tile_dpbf16ps(1, 4, 7); + o = offs[i + 1]; + _tile_stream_loadd(4, src0 + o, strideS); + _tile_dpbf16ps(2, 5, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(3, 5, 7); + } + _tile_loadd(7, weight1, strideW); + _tile_stream_loadd(5, src1 + offs[n1], strideS); + + _tile_dpbf16ps(0, 4, 6); + _tile_stored(0, buf0 + 0, strideB); + TileMoveToMemory(buf0 + 0, dB); + + _tile_dpbf16ps(1, 4, 7); + _tile_stored(1, buf0 + F, strideB); + TileMoveToMemory(buf0 + F, dB); + + _tile_dpbf16ps(2, 5, 6); + _tile_stored(2, buf1 + 0, strideB); + TileMoveToMemory(buf1 + 0, dB); + + _tile_dpbf16ps(3, 5, 7); + _tile_stored(3, buf1 + F, strideB); + TileMoveToMemory(buf1 + F, dB); + } + + SIMD_INLINE void Convolution16bNhwcSpecV3Body32x16(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) + { + int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; + const uint16_t* src1 = src0 + 16 * dS; + float* buf1 = buf0 + 16 * dB; + + if (zero) + { + _tile_zero(0); + _tile_zero(2); + } + else + { + _tile_stream_loadd(0, buf0 + 0, strideB); + _tile_stream_loadd(2, buf1 + 0, strideB); + } + + int n1 = (int)nK - 1, o = offs[0]; + _tile_loadd(4, src0 + o, strideS); + for (int i = 0; i < n1; ++i) + { + _tile_stream_loadd(6, weight0, strideW); + _tile_loadd(5, src1 + o, strideS); + _tile_dpbf16ps(0, 4, 6); + o = offs[i + 1]; + _tile_loadd(4, src0 + o, strideS); + _tile_dpbf16ps(2, 5, 6); + weight0 += dW; + } + _tile_stream_loadd(6, weight0, strideW); + _tile_loadd(5, src1 + offs[n1], strideS); + + _tile_dpbf16ps(0, 4, 6); + 
_tile_stored(0, buf0 + 0, strideB); + TileMoveToMemory(buf0 + 0, dB); + + _tile_dpbf16ps(2, 5, 6); + _tile_stored(2, buf1 + 0, strideB); + TileMoveToMemory(buf1 + 0, dB); + } + + SIMD_INLINE void Convolution16bNhwcSpecV3Body16x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) + { + int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; + const uint16_t* weight1 = weight0 + a.K * F; + + if (zero) + { + _tile_zero(0); + _tile_zero(1); + } + else + { + _tile_stream_loadd(0, buf0 + 0, strideB); + _tile_stream_loadd(1, buf0 + F, strideB); + } + + int n1 = (int)nK - 1; + _tile_loadd(6, weight0, strideW); + for (int i = 0; i < n1; ++i, weight1 += dW) + { + _tile_stream_loadd(4, src0 + offs[i], strideS); + _tile_loadd(7, weight1, strideW); + _tile_dpbf16ps(0, 4, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(1, 4, 7); + } + _tile_stream_loadd(4, src0 + offs[n1], strideS); + _tile_loadd(7, weight1, strideW); + + _tile_dpbf16ps(0, 4, 6); + _tile_stored(0, buf0 + 0, strideB); + TileMoveToMemory(buf0 + 0, dB); + + _tile_dpbf16ps(1, 4, 7); + _tile_stored(1, buf0 + F, strideB); + TileMoveToMemory(buf0 + F, dB); + } + + SIMD_INLINE void Convolution16bNhwcSpecV3Body16x16(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) + { + int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; + + if (zero) + { + _tile_zero(0); + } + else + { + _tile_stream_loadd(0, buf0 + 0, strideB); + } + + int n = (int)nK; + for (int i = 0; i < n; ++i) + { + _tile_stream_loadd(4, src0 + offs[i], strideS); + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(0, 4, 6); + weight0 += dW; + } + + _tile_stored(0, buf0 + 0, strideB); + TileMoveToMemory(buf0 + 0, dB); + } + + typedef void 
(*Convolution16bNhwcSpecV3BodyPtr)(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offset, size_t nK, int zero, const uint16_t* weight0, float* buf0); + + static void Convolution16bNhwcSpecV3Body(const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* offs, size_t dstC, size_t dstS, size_t nK, int zero, const uint16_t* weight, float* buf) + { + size_t n1 = AlignHi(dstS, 16), n = 32; + size_t nn = AlignLo(n1, n), m = n1 - nn, dW = a.K * DF; + size_t dB = a.macroD, dS = a.microC; + + SetTileConfFull(); + for (size_t dc = 0; dc < dstC; dc += DF) + { + size_t dC = Simd::Min(DF, dstC - dc); + size_t i = 0; + if (dC > F) + { + for (; i < nn; i += n) + Convolution16bNhwcSpecV3Body32x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + if (m) + Convolution16bNhwcSpecV3Body16x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + } + else + { + for (; i < nn; i += n) + Convolution16bNhwcSpecV3Body32x16(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + if (m) + Convolution16bNhwcSpecV3Body16x16(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + } + weight += dW; + buf += DF; + } + } + + //------------------------------------------------------------------------------------------------- + + template static SIMD_INLINE void ApplyMx1( + uint8_t *& ptr, int dP, float* buf, const __m512* bias, const __m512* params, const int* mask, __mmask32 tail = __mmask32(-1)) + { + uint32_t msk = mask[0]; + tail = tail & msk; + if (M == 1) + { + __m512 f0 = Activate(_mm512_add_ps(_mm512_loadu_ps(buf), bias[0]), params, 0); + _mm_prefetch((const char*)buf + 0, _MM_HINT_NTA); + if (term == Term16bLast16b) + { + _mm256_mask_storeu_epi16((uint16_t*)ptr, (__mmask16)tail, (__m256i)_mm512_cvtneps_pbh(f0)); + if (flush == 1) + _mm_prefetch((const char*)ptr, _MM_HINT_NTA); + else if (flush == 2) + _m_prefetchw((char*)ptr); + } + else + { + _mm512_mask_storeu_ps((float*)ptr, (__mmask16)tail, f0); + if (flush == 1) + 
_mm_prefetch((const char*)ptr, _MM_HINT_NTA); + else if (flush == 2) + _m_prefetchw((char*)ptr); + } + } + else if (M == 2) + { + __m512 f0 = Activate(_mm512_add_ps(_mm512_loadu_ps(buf + 0), bias[0]), params, 0); + _mm_prefetch((const char*)buf + 0, _MM_HINT_NTA); + __m512 f1 = Activate(_mm512_add_ps(_mm512_loadu_ps(buf + F), bias[1]), params, 1); + _mm_prefetch((const char*)buf + A, _MM_HINT_NTA); + if (term == Term16bLast16b) + { + _mm512_mask_storeu_epi16((uint16_t*)ptr, tail, (__m512i)_mm512_cvtne2ps_pbh(f1, f0)); + if (flush == 1) + _mm_prefetch((const char*)ptr, _MM_HINT_NTA); + else if (flush == 2) + _m_prefetchw((char*)ptr); + } + else + { + _mm512_mask_storeu_ps((float*)ptr, (__mmask16)msk, f0); + if (flush == 1) + _mm_prefetch((const char*)ptr, _MM_HINT_NTA); + else if (flush == 2) + _m_prefetchw((char*)ptr + 0); + _mm512_mask_storeu_ps((float*)(ptr + A), (__mmask16)tail, f1); + if (flush == 1) + _mm_prefetch((const char*)(ptr + A), _MM_HINT_NTA); + else if (flush == 2) + _m_prefetchw((char*)ptr + A); + } + } + ptr += dP & msk; + } + + template static SIMD_INLINE void ApplyMxN( + uint8_t*& ptr, int dP, float* buf, int dB, const __m512* bias, const __m512* params, const int* mask, __mmask32 tail = __mmask32(-1)) + { + if (N > 0) ApplyMx1(ptr, dP, buf + 0 * dB, bias, params, mask + 0, tail); + if (N > 1) ApplyMx1(ptr, dP, buf + 1 * dB, bias, params, mask + 1, tail); + if (N > 2) ApplyMx1(ptr, dP, buf + 2 * dB, bias, params, mask + 2, tail); + if (N > 3) ApplyMx1(ptr, dP, buf + 3 * dB, bias, params, mask + 3, tail); + if (N > 4) ApplyMx1(ptr, dP, buf + 4 * dB, bias, params, mask + 4, tail); + if (N > 5) ApplyMx1(ptr, dP, buf + 5 * dB, bias, params, mask + 5, tail); + if (N > 6) ApplyMx1(ptr, dP, buf + 6 * dB, bias, params, mask + 6, tail); + if (N > 7) ApplyMx1(ptr, dP, buf + 7 * dB, bias, params, mask + 7, tail); + } + + //------------------------------------------------------------------------------------------------- + + template void 
Convolution16bNhwcSpecV3_1x32x32( + const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, + const __m512* bias, const __m512* params, float* buf2, const int* mask, uint8_t * &dst, __mmask32 tail) + { + int dD = int(p.dstC * a.elem), dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; + const uint16_t* weight1 = weight0 + a.K * F; + const uint16_t* src1 = src0 + 16 * dS; + float* buf0 = buf2 - 32 * dB; + float* buf3 = buf2 + 16 * dB; + + if (zero) + { + if (M > 0) _tile_zero(0); + if (M > 1) _tile_zero(1); + if (M > 0) _tile_zero(2); + if (M > 1) _tile_zero(3); + } + else + { + if (M > 0) _tile_stream_loadd(0, buf2 + 0, strideB); + if (M > 1) _tile_stream_loadd(1, buf2 + F, strideB); + if (M > 0) _tile_stream_loadd(2, buf3 + 0, strideB); + if (M > 1) _tile_stream_loadd(3, buf3 + F, strideB); + } + + int n1 = (int)nK - 1, i = 0, o = offs[0], na = apply ? (8 / apply - 1) : 0, ds = 0; + _tile_stream_loadd(4, src0 + o, strideS); + if (M > 0) _tile_loadd(6, weight0, strideW); + for (; i < na; ++i, weight1 += dW) + { + if (M > 1) _tile_loadd(7, weight1, strideW); + if (M > 0) _tile_dpbf16ps(0, 4, 6); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + _tile_stream_loadd(5, src1 + o, strideS); + if (M > 1) _tile_dpbf16ps(1, 4, 7); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + o = offs[i + 1]; + _tile_stream_loadd(4, src0 + o, strideS); + if (M > 0) _tile_dpbf16ps(2, 5, 6); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + weight0 += dW; + if (M > 0) _tile_loadd(6, weight0, strideW); + if (M > 1) _tile_dpbf16ps(3, 5, 7); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + } + for (; i < n1; ++i, weight1 += dW) + { + if (M > 1) _tile_loadd(7, weight1, strideW); + if (M > 0) _tile_dpbf16ps(0, 4, 6); + 
_tile_stream_loadd(5, src1 + o, strideS); + if (M > 1) _tile_dpbf16ps(1, 4, 7); + o = offs[i + 1]; + _tile_stream_loadd(4, src0 + o, strideS); + if (M > 0) _tile_dpbf16ps(2, 5, 6); + weight0 += dW; + if (M > 0) _tile_loadd(6, weight0, strideW); + if (M > 1) _tile_dpbf16ps(3, 5, 7); + } + if (M > 1) _tile_loadd(7, weight1, strideW); + _tile_stream_loadd(5, src1 + offs[n1], strideS); + + if (M > 0) _tile_dpbf16ps(0, 4, 6); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + if (M > 0) _tile_stored(0, buf2 + 0, strideB); + + if (M > 1) _tile_dpbf16ps(1, 4, 7); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + if (M > 1) _tile_stored(1, buf2 + F, strideB); + + if (M > 0) _tile_dpbf16ps(2, 5, 6); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + if (M > 0) _tile_stored(2, buf3 + 0, strideB); + + if (M > 1) _tile_dpbf16ps(3, 5, 7); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply; + if (M > 1) _tile_stored(3, buf3 + F, strideB); + } + + template void Convolution16bNhwcSpecV3_1x16x32( + const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, + const __m512* bias, const __m512* params, float* buf2, const int* mask, uint8_t*& dst, __mmask32 tail) + { + int dD = int(p.dstC * a.elem), dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; + const uint16_t* weight1 = weight0 + a.K * F; + const uint16_t* src1 = src0 + 16 * dS; + float* buf0 = buf2 - 32 * dB; + + if (zero) + { + if (M > 0) _tile_zero(0); + if (M > 1) _tile_zero(1); + } + else + { + if (M > 0) _tile_stream_loadd(0, buf2 + 0, strideB); + if (M > 1) _tile_stream_loadd(1, buf2 + F, strideB); + } + + int n1 = (int)nK - 1, i = 0, o = offs[0], na = apply ? 
(8 / apply - 1) : 0, ds = 0; + _tile_stream_loadd(4, src0 + o, strideS); + if (M > 0) _tile_loadd(6, weight0, strideW); + for (; i < na; ++i, weight1 += dW) + { + if (M > 1) _tile_loadd(7, weight1, strideW); + if (M > 0) _tile_dpbf16ps(0, 4, 6); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply; + if (M > 1) _tile_dpbf16ps(1, 4, 7); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply; + o = offs[i + 1]; + _tile_stream_loadd(4, src0 + o, strideS); + weight0 += dW; + if (M > 0) _tile_loadd(6, weight0, strideW); + } + for (; i < n1; ++i, weight1 += dW) + { + if (M > 1) _tile_loadd(7, weight1, strideW); + if (M > 0) _tile_dpbf16ps(0, 4, 6); + if (M > 1) _tile_dpbf16ps(1, 4, 7); + o = offs[i + 1]; + _tile_stream_loadd(4, src0 + o, strideS); + weight0 += dW; + if (M > 0) _tile_loadd(6, weight0, strideW); + } + if (M > 1) _tile_loadd(7, weight1, strideW); + + if (M > 0) _tile_dpbf16ps(0, 4, 6); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply; + if (M > 0) _tile_stored(0, buf2 + 0, strideB); + + if (M > 1) _tile_dpbf16ps(1, 4, 7); + ApplyMxN(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply; + if (M > 1) _tile_stored(1, buf2 + F, strideB); + } + + template void Convolution16bNhwcSpecV3_Nx32x32M( + const uint16_t* src0, const ConvParam& p, const AlgParam& a, size_t dstS, const int* offs, size_t nK, int zero, const uint16_t* weight0, + const float* bias, const float* params, __m512* _params, float* buf, const int* mask, uint8_t* dst, __mmask32 tail) + { + int dB = (int)a.macroD, dD = int(p.dstC * a.elem), dS = (int)a.microC; + + __m512 _bias[2]; + if (M > 0) _bias[0] = _mm512_loadu_ps(bias + 0 * F); + if (M > 1) _bias[1] = _mm512_loadu_ps(bias + 1 * F); + if (type == SimdConvolutionActivationPrelu) + { + if (M > 0) _params[0] = _mm512_loadu_ps(params + 0 * F); + if (M > 1) _params[1] = _mm512_loadu_ps(params + 1 * F); + } + + size_t pds 
= 0; + Convolution16bNhwcSpecV3_1x32x32(src0, p, a, offs, nK, zero, weight0, _bias, _params, buf, mask, dst, tail); + for (size_t cds = 32; cds < dstS; pds += 32) + { + if (cds + 16 >= dstS) + { + Convolution16bNhwcSpecV3_1x16x32(src0 + cds * dS, p, a, offs, nK, zero, weight0, _bias, _params, buf + cds * dB, mask + pds, dst, tail); + cds += 16; + } + else + { + Convolution16bNhwcSpecV3_1x32x32(src0 + cds * dS, p, a, offs, nK, zero, weight0, _bias, _params, buf + cds * dB, mask + pds, dst, tail); + cds += 32; + } + } + size_t dstS8 = dstS & (~7); + for (; pds < dstS8; pds += 8) + { + ApplyMxN(dst, dD, buf + pds * dB, dB, _bias, _params, mask + pds, tail); + } + for (; pds < dstS; ++pds) + { + ApplyMxN(dst, dD, buf + pds * dB, dB, _bias, _params, mask + pds, tail); + } + } + + //------------------------------------------------------------------------------------------------- + + typedef void (*Convolution16bNhwcSpecV3LastPtr)(const uint16_t* src0, const ConvParam& p, const AlgParam& a, size_t dstS, const int* offs, size_t nK, int zero, + const uint16_t* weight0, const float* bias, const float* params, __m512* _params, float* buf, const int* mask, uint8_t* dst, __mmask32 tail); + + template void Convolution16bNhwcSpecV3Last( + const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* offs, size_t dstC, size_t dstS, size_t nK, int zero, + const uint16_t* weight, float* buf, const float* bias, const float* params, const int* mask, const int* dstOffs, uint8_t* dst) + { + size_t n = 256, n1 = dstS, nn = AlignLoAny(n1, n), dW = a.K * a.microD; + size_t dB = a.macroD, dD = p.dstC * a.elem, dS = a.microC; + + size_t dstC32 = AlignLo(dstC, 32), dstCt = dstC - dstC32; + __mmask32 tailD = term == Term16bLast16b ? TailMask32(dstCt) : (__mmask32)TailMask16(dstCt - AlignLo(dstCt - 1, 16)); + Convolution16bNhwcSpecV3LastPtr mainConv = Convolution16bNhwcSpecV3_Nx32x32M; + Convolution16bNhwcSpecV3LastPtr tailConv = dstCt > 16 ? 
Convolution16bNhwcSpecV3_Nx32x32M : + Convolution16bNhwcSpecV3_Nx32x32M; + + __m512 _params[2]; + _params[0] = _mm512_set1_ps(params[0]); + if (type == SimdConvolutionActivationRestrictRange || + type == SimdConvolutionActivationHswish || + type == SimdConvolutionActivationHardSigmoid) + _params[1] = _mm512_set1_ps(params[1]); + + SetTileConfFull(); + for (size_t i = 0; i < n1;) + { + size_t dn = (n1 - i >= n + 32 ? n : n1 - i); + const uint16_t* s = src + i * dS; + const uint16_t* w = weight; + float* b = buf + i * dB; + uint8_t* d = dst + (dstOffs[i/32] - dstOffs[0]) * dD; + size_t dc = 0; + for (; dc < dstC32; dc += DF, w += dW) + mainConv(s, p, a, dn, offs, nK, zero, w, bias + dc, params + dc, _params, b + dc, mask + i, d + dc * a.elem, __mmask32(-1)); + if (dc < dstC) + tailConv(s, p, a, dn, offs, nK, zero, w, bias + dc, params + dc, _params, b + dc, mask + i, d + dc * a.elem, tailD); + i += dn; + } + } + + //------------------------------------------------------------------------------------------------- + + template SIMD_INLINE void SetLastConvV3(const ConvParam& p, size_t nK, LastConvPtr& lastConv) + { + if (nK >= 8) + lastConv = Convolution16bNhwcSpecV3Last; + else if (nK >= 4) + lastConv = Convolution16bNhwcSpecV3Last; + else if (nK >= 2) + lastConv = Convolution16bNhwcSpecV3Last; + else + lastConv = NULL; + } + + template SIMD_INLINE void SetLastConvV3(const ConvParam& p, size_t nK, LastConvPtr& lastConv) + { + if (p.dstT == SimdTensorData16b) + SetLastConvV3(p, nK, lastConv); + else + SetLastConvV3(p, nK, lastConv); + } + + SynetConvolution16bNhwcSpecV3::SynetConvolution16bNhwcSpecV3(const ConvParam & p) + : Base::SynetConvolution16bNhwcSpecV3(p) + { + SetAlgParam(); + if (_src16b) + _preprocess = Reorder16bNhwcSpecV3; + else + _preprocess = Convert16bNhwcSpecV3; + _bodyConv = Convolution16bNhwcSpecV3Body; + size_t nK = _nK[_nK.size - 1]; + switch (p.activation) + { + case SimdConvolutionActivationIdentity: SetLastConvV3(p, nK, _lastConv); break; + case 
SimdConvolutionActivationRelu: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationLeakyRelu: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationRestrictRange: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationPrelu: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationElu: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationHswish: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationMish: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationHardSigmoid: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationSwish: SetLastConvV3(p, nK, _lastConv); break; + case SimdConvolutionActivationGelu: SetLastConvV3(p, nK, _lastConv); break; + default: assert(0); + } + } + } +#endif +} diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp index 79c1cf8c2d..c3ff94b096 100644 --- a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp +++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp @@ -306,8 +306,8 @@ namespace Simd { const size_t M = p.dstH * p.dstW; static int choise = 0; - return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4 - && p.srcC >= 9 && p.srcC <= 128 && M >= 16;// && (choise++) & 0; + return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && p.kernelX == 3 && p.dstC >= 4 + && p.srcC >= 9 && /*p.srcC <= 128 &&*/ M >= 16 && 1;// && (choise++) & 0; } } #endif diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h index e481b76dd0..2ac8b90d46 100644 --- a/src/Simd/SimdSynetConvolution16b.h +++ b/src/Simd/SimdSynetConvolution16b.h @@ -692,6 +692,14 @@ namespace Simd virtual String Ext() const { return "AmxBf16"; } }; + class SynetConvolution16bNhwcSpecV3 : public Base::SynetConvolution16bNhwcSpecV3 + { + public: + 
SynetConvolution16bNhwcSpecV3(const ConvParam& p); + + virtual String Ext() const { return "AmxBf16"; } + }; + class SynetConvolution16bNchwGemm : public Avx512bw::SynetConvolution16bNchwGemm { public: From b922645873ec5b2bfd32afa8c4f6160a21d71f37 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Thu, 21 May 2026 13:36:55 +0300 Subject: [PATCH 22/32] *extend using of SynetConvolution16bNhwcSpecV3. --- src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp | 2 +- src/Test/TestSynetConvolution16b.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp index c3ff94b096..e912315c11 100644 --- a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp +++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp @@ -306,7 +306,7 @@ namespace Simd { const size_t M = p.dstH * p.dstW; static int choise = 0; - return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && p.kernelX == 3 && p.dstC >= 4 + return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4 && p.srcC >= 9 && /*p.srcC <= 128 &&*/ M >= 16 && 1;// && (choise++) & 0; } } diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp index 0cd899620b..b8de4da8b2 100644 --- a/src/Test/TestSynetConvolution16b.cpp +++ b/src/Test/TestSynetConvolution16b.cpp @@ -563,6 +563,12 @@ namespace Test result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); #endif +#if 1 + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, 
Param(1, 64, 57, 57, 64, Size(7, 1), _1, _1, Size(3, 0), Size(3, 0), 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 13, 13, 160, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 160, 13, 13, 192, Size(7, 1), _1, _1, Size(3, 0), Size(3, 0), 1, aPr, tT, b16, b16), c, f1, f2); +#endif #if 0 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _1, _1, _2, _0, _0, 1, aId, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2); From b261443fdf1dd696037ed03041dc711a2b75b395 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Thu, 21 May 2026 19:49:43 +0300 Subject: [PATCH 23/32] +add kernel Convolution16bNhwcSpecV3Body32x32_Yx3. --- ...mdAmxBf16SynetConvolution16bNhwcSpecV3.cpp | 123 +++++++++++++++++- .../SimdBaseSynetConvolution16bNhwcSpecV3.cpp | 2 +- src/Test/TestSynetConvolution16b.cpp | 13 +- 3 files changed, 125 insertions(+), 13 deletions(-) diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp index 98137ca067..7154a8859f 100644 --- a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp +++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp @@ -155,7 +155,7 @@ namespace Simd //------------------------------------------------------------------------------------------------- - SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) + SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32_Any(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) { int dB = (int)a.macroD, dS = 
(int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; const uint16_t* weight1 = weight0 + a.K * F; @@ -213,6 +213,115 @@ namespace Simd TileMoveToMemory(buf1 + F, dB); } + SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32_Yx3(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) + { + int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 4, dW = 512, strideW = 64, strideB = dB * 8; + const uint16_t* weight1 = weight0 + a.K * F; + const uint16_t* src1 = src0 + dS; + float* buf1 = buf0 + dB; + + if (zero) + { + _tile_zero(0); + _tile_zero(1); + _tile_zero(2); + _tile_zero(3); + } + else + { + _tile_stream_loadd(0, buf0 + 0, strideB); + _tile_stream_loadd(1, buf0 + F, strideB); + _tile_stream_loadd(2, buf1 + 0, strideB); + _tile_stream_loadd(3, buf1 + F, strideB); + } + + int n3 = (int)nK - 3, i = 0, o = offs[i]; + _tile_stream_loadd(4, src0 + o, strideS); + _tile_loadd(6, weight0, strideW); + + for (; i < n3; i += 3) + { + _tile_stream_loadd(5, src1 + o, strideS); + _tile_loadd(7, weight1, strideW); + weight1 += dW; + _tile_dpbf16ps(0, 4, 6); + _tile_dpbf16ps(1, 4, 7); + o = offs[i + 1]; + _tile_stream_loadd(4, src1 + o, strideS); + _tile_dpbf16ps(2, 5, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(3, 5, 7); + + //_tile_stream_loadd(4, src1 + o, strideS); + _tile_loadd(7, weight1, strideW); + weight1 += dW; + _tile_dpbf16ps(0, 5, 6); + _tile_dpbf16ps(1, 5, 7); + o = offs[i + 2]; + _tile_stream_loadd(5, src1 + o, strideS); + _tile_dpbf16ps(2, 4, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(3, 4, 7); + + //_tile_stream_loadd(5, src1 + o, strideS); + _tile_loadd(7, weight1, strideW); + weight1 += dW; + _tile_dpbf16ps(0, 4, 6); + _tile_dpbf16ps(1, 4, 7); + o = offs[i + 3]; + _tile_stream_loadd(4, src0 + o, strideS); + _tile_dpbf16ps(2, 5, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + 
_tile_dpbf16ps(3, 5, 7); + } + + _tile_stream_loadd(5, src1 + o, strideS); + _tile_loadd(7, weight1, strideW); + weight1 += dW; + _tile_dpbf16ps(0, 4, 6); + _tile_dpbf16ps(1, 4, 7); + o = offs[i + 1]; + _tile_stream_loadd(4, src1 + o, strideS); + _tile_dpbf16ps(2, 5, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(3, 5, 7); + + //_tile_stream_loadd(5, src1 + o, strideS); + _tile_loadd(7, weight1, strideW); + weight1 += dW; + _tile_dpbf16ps(0, 5, 6); + _tile_dpbf16ps(1, 5, 7); + o = offs[i + 2]; + _tile_stream_loadd(5, src1 + o, strideS); + _tile_dpbf16ps(2, 4, 6); + weight0 += dW; + _tile_loadd(6, weight0, strideW); + _tile_dpbf16ps(3, 4, 7); + + _tile_loadd(7, weight1, strideW); + //_tile_stream_loadd(5, src1 + o, strideS); + + _tile_dpbf16ps(0, 4, 6); + _tile_stored(0, buf0 + 0, strideB); + TileMoveToMemory(buf0 + 0, dB); + + _tile_dpbf16ps(1, 4, 7); + _tile_stored(1, buf0 + F, strideB); + TileMoveToMemory(buf0 + F, dB); + + _tile_dpbf16ps(2, 5, 6); + _tile_stored(2, buf1 + 0, strideB); + TileMoveToMemory(buf1 + 0, dB); + + _tile_dpbf16ps(3, 5, 7); + _tile_stored(3, buf1 + F, strideB); + TileMoveToMemory(buf1 + F, dB); + } + SIMD_INLINE void Convolution16bNhwcSpecV3Body32x16(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0) { int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4; @@ -334,8 +443,16 @@ namespace Simd size_t i = 0; if (dC > F) { - for (; i < nn; i += n) - Convolution16bNhwcSpecV3Body32x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + if (p.kernelX == 3 && 1) + { + for (; i < nn; i += n) + Convolution16bNhwcSpecV3Body32x32_Yx3(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + } + else + { + for (; i < nn; i += n) + Convolution16bNhwcSpecV3Body32x32_Any(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); + } if (m) Convolution16bNhwcSpecV3Body16x32(src + i * 
dS, p, a, offs, nK, zero, weight, buf + i * dB); } diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp index e912315c11..1884736f45 100644 --- a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp +++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp @@ -304,7 +304,7 @@ namespace Simd bool SynetConvolution16bNhwcSpecV3::Preferable(const ConvParam& p) { - const size_t M = p.dstH * p.dstW; + const size_t M = p.batch * p.dstH * p.dstW; static int choise = 0; return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4 && p.srcC >= 9 && /*p.srcC <= 128 &&*/ M >= 16 && 1;// && (choise++) & 0; diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp index b8de4da8b2..af58e54b30 100644 --- a/src/Test/TestSynetConvolution16b.cpp +++ b/src/Test/TestSynetConvolution16b.cpp @@ -548,22 +548,17 @@ namespace Test result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 28, 24, 32, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 96, 96, 96, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2); #endif -#if 1 - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); +#if 0 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && 
SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); +#endif +#if 1 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); - //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2); #endif -#if 1 +#if 0 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(7, 1), _1, _1, Size(3, 0), Size(3, 0), 1, aPr, tT, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 13, 13, 160, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2); From 7598c32cfabd4eaf9a908c89aa58b4e2d269dc2a Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Thu, 21 May 2026 20:24:28 +0300 Subject: [PATCH 24/32] *disable using of Convolution16bNhwcSpecV3Body32x32_Yx3. 
--- src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp index 7154a8859f..56f0ccd608 100644 --- a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp +++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp @@ -443,7 +443,7 @@ namespace Simd size_t i = 0; if (dC > F) { - if (p.kernelX == 3 && 1) + if (p.kernelX == 3 && 0) { for (; i < nn; i += n) Convolution16bNhwcSpecV3Body32x32_Yx3(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB); From 3aa46dd3855dc1f509060a870aea1ce3d821ee13 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Fri, 22 May 2026 17:28:47 +0300 Subject: [PATCH 25/32] *update help. --- docs/2026.html | 6 +++ docs/help/functions_b.html | 2 +- docs/help/functions_c.html | 2 +- docs/help/functions_f.html | 7 ++- docs/help/functions_func_c.html | 2 +- docs/help/functions_i.html | 2 +- docs/help/functions_l.html | 2 +- docs/help/functions_r.html | 2 +- docs/help/functions_t.html | 3 +- docs/help/functions_u.html | 2 +- docs/help/group__cpu__flags.html | 69 +++++++++++++++++++----- docs/help/group__matrix.html | 4 +- docs/help/group__thread.html | 7 +-- docs/help/struct_simd_1_1_detection.html | 6 +-- docs/help/struct_simd_1_1_frame.html | 49 +++++++++++++++++ docs/help/struct_simd_1_1_view.html | 46 ++++++++++++++++ 16 files changed, 178 insertions(+), 33 deletions(-) diff --git a/docs/2026.html b/docs/2026.html index cf0499a982..cb35619654 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -67,6 +67,12 @@
      Improving
    • Description of function SimdAlign.
    • Description of function SimdAlignment.
    • Description of function SimdRelease.
    • +
    • Description of function SimdGetThreadNumber.
    • +
    • Description of function SimdSetThreadNumber.
    • +
    • Description of function SimdEmpty.
    • +
    • Description of function SimdGetFastMode.
    • +
    • Description of function SimdSetFastMode.
    • +
    • Description of function SimdSetAmxFull.
    Home diff --git a/docs/help/functions_b.html b/docs/help/functions_b.html index 1be8318359..3c8eadd3f7 100644 --- a/docs/help/functions_b.html +++ b/docs/help/functions_b.html @@ -78,8 +78,8 @@

    - b -

    Note
    This function supports multithreading (See functions SimdGetThreadNumber and SimdSetThreadNumber).
    +
    Note
    This function supports multithreading (See functions SimdGetThreadNumber and SimdSetThreadNumber).
    Parameters
    @@ -244,7 +244,7 @@

    C(M, N) = alpha*A(M, K)*Trans(B(N, K)) + beta*C(M, N); -
    Note
    This function supports multithreading (See functions SimdGetThreadNumber and SimdSetThreadNumber).
    +
    Note
    This function supports multithreading (See functions SimdGetThreadNumber and SimdSetThreadNumber).
    Parameters

    [in]M- a height of A and height of C matrices.
    diff --git a/docs/help/group__thread.html b/docs/help/group__thread.html index 51e7a79cfa..b9d0ddb3e8 100644 --- a/docs/help/group__thread.html +++ b/docs/help/group__thread.html @@ -51,7 +51,7 @@

    Simd Library Documentation.

    - + @@ -76,8 +76,9 @@

    -

    Gets number of threads used by Simd Library to parallelize some algorithms.

    -
    Returns
    current thread number.
    +

    Gets current global thread number configured for Simd Library parallel algorithms.

    +

    Returns the value set by SimdSetThreadNumber. By default this value is 1. When set, it is restricted to the range [1, std::thread::hardware_concurrency()].

    +
    Returns
    current configured thread number.
    diff --git a/docs/help/struct_simd_1_1_detection.html b/docs/help/struct_simd_1_1_detection.html index e0f42170b5..c24c6897af 100644 --- a/docs/help/struct_simd_1_1_detection.html +++ b/docs/help/struct_simd_1_1_detection.html @@ -139,9 +139,9 @@

    Simd Library Documentation.

    Detection()
    Definition: SimdDetection.hpp:211
    bool Init(const Size &imageSize, double scaleFactor=1.1, const Size &sizeMin=Size(0, 0), const Size &sizeMax=Size(INT_MAX, INT_MAX), const View &roi=View(), ptrdiff_t threadNumber=-1)
    Definition: SimdDetection.hpp:290
    The View structure provides storage and manipulation of images.
    Definition: SimdView.hpp:70
    -
    Point< ptrdiff_t > Size() const
    Definition: SimdView.hpp:1076
    -
    bool Load(const std::string &path, Format format=None)
    Definition: SimdView.hpp:1283
    -
    bool Save(const std::string &path, SimdImageFileType type=SimdImageFileUndefined, int quality=100) const
    Definition: SimdView.hpp:1307
    +
    Point< ptrdiff_t > Size() const
    Definition: SimdView.hpp:1105
    +
    bool Load(const std::string &path, Format format=None)
    Definition: SimdView.hpp:1312
    +
    bool Save(const std::string &path, SimdImageFileType type=SimdImageFileUndefined, int quality=100) const
    Definition: SimdView.hpp:1336

    Using example (face detection in the video captured by OpenCV):

    #include <iostream>
    #include <string>
    diff --git a/docs/help/struct_simd_1_1_frame.html b/docs/help/struct_simd_1_1_frame.html index 0c56a0aa63..cd195bfb3a 100644 --- a/docs/help/struct_simd_1_1_frame.html +++ b/docs/help/struct_simd_1_1_frame.html @@ -114,6 +114,10 @@

    Simd Library Documentation.

    + + + + @@ -754,6 +758,51 @@

    Returns
    a pointer to the new Frame structure (not owner). The user must free this pointer after usage.
    + + + +

    ◆ Copy() [1/2]

    + +
    +
    +

    [in]M- a height of A and height of C matrices.

    Functions

    SIMD_API size_t SimdGetThreadNumber (void)
     Gets number of threads used by Simd Library to parallelize some algorithms. More...
     Gets current global thread number configured for Simd Library parallel algorithms. More...
     
    SIMD_API void SimdSetThreadNumber (size_t threadNumber)
     Sets number of threads used by Simd Library to parallelize some algorithms. More...
     
    FrameClone (Frame &buffer) const
     
    Frame Copy () const
     
    Frame Copy (const Rectangle< ptrdiff_t > &rect) const
     
    Frameoperator= (const Frame &frame)
     
    Frameoperator= (Frame &&frame)
    + + + + + + +
    Frame Copy () const
    +
    +

    Gets a copy of current frame by value.

    +
    Returns
    a new Frame structure containing a copy of the frame.
    + +
    + + +

    ◆ Copy() [2/2]

    + +
    +
    + + + + + + + + +
    Frame Copy (const Rectangle< ptrdiff_t > & rect) const
    +
    +

    Gets a copy of region of current frame bounded by the rectangle with specified coordinates, by value.

    +
    Parameters
    + + +
    [in]rect- a rectangle which bounds the region.
    +
    +
    +
    Returns
    a new Frame structure containing a copy of the region.
    +
    diff --git a/docs/help/struct_simd_1_1_view.html b/docs/help/struct_simd_1_1_view.html index 2762b3820a..b5fba9c9be 100644 --- a/docs/help/struct_simd_1_1_view.html +++ b/docs/help/struct_simd_1_1_view.html @@ -158,6 +158,10 @@

    Simd Library Documentation.

      ViewClone (View &buffer) const   +View Copy () const +  +View Copy (const Rectangle< ptrdiff_t > &rect) const +  Viewoperator= (const View &view)   Viewoperator= (View &&view) @@ -822,6 +826,48 @@

    Returns
    a pointer to the new View structure (not owner). The user must free this pointer after usage.

    + + + +

    ◆ Copy() [1/2]

    + +
    +
    + + + + +
    SIMD_INLINE View< A > Copy
    +
    +

    Gets a copy of current image view by value.

    +
    Returns
    a new View structure containing a copy of the image.
    + +
    +
    + +

    ◆ Copy() [2/2]

    + +
    +
    + + + + + + + + +
    SIMD_INLINE View< A > Copy (const Rectangle< ptrdiff_t > & rect) const
    +
    +

    Gets a copy of region of current image view bounded by the rectangle with specified coordinates, by value.

    +
    Parameters
    + + +
    [in]rect- a rectangle which bounds the region.
    +
    +
    +
    Returns
    a new View structure containing a copy of the region.
    +
    From 03546b65f125d36fa8783a855edd26f08e160ffd Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 25 May 2026 16:43:14 +0300 Subject: [PATCH 26/32] +add NEON optimizations of function Crc32c. --- docs/2026.html | 1 + docs/index.html | 1 + prj/cmake/arm.cmake | 6 +++- prj/vs2022/Neon.vcxproj | 1 + prj/vs2022/Neon.vcxproj.filters | 3 ++ src/Simd/SimdLib.cpp | 5 +++ src/Simd/SimdNeon.h | 4 +++ src/Simd/SimdNeonCrc32.cpp | 62 +++++++++++++++++++++++++++++++++ src/Test/TestCrc32.cpp | 5 +++ 9 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 src/Simd/SimdNeonCrc32.cpp diff --git a/docs/2026.html b/docs/2026.html index cb35619654..b46393b7cb 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -50,6 +50,7 @@
    New features
  • Method View::Copy.
  • Method Frame::Copy.
  • Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.
  • +
  • NEON optimizations of function Crc32c.
  • Bug fixing
      diff --git a/docs/index.html b/docs/index.html index 0104f85d91..483b97dd6d 100644 --- a/docs/index.html +++ b/docs/index.html @@ -83,6 +83,7 @@

      Contributors

      2026-2026 Yu Changming,

      2026-2026 Evgeniy Efimov.

      2026-2026 Brian Cain.

      +

      2026-2026 metsw24-max.

      diff --git a/prj/cmake/arm.cmake b/prj/cmake/arm.cmake index a95bfa7531..5b93fd2b88 100644 --- a/prj/cmake/arm.cmake +++ b/prj/cmake/arm.cmake @@ -2,7 +2,7 @@ if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") AND (NOT(CMAKE_CXX_COMPILER_VERSION VER set(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} -Wno-psabi") endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") if( NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))) set(CXX_NEON_FLAG "-mfpu=neon -mfpu=neon-fp16") endif() @@ -13,6 +13,10 @@ else() set(CXX_NEON_FLAG "") endif() +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(CXX_NEON_FLAG "${CXX_NEON_FLAG} -march=armv8-a+crc") +endif() + if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")) add_definitions(-DSIMD_NEON_FP16_DISABLE) endif() diff --git a/prj/vs2022/Neon.vcxproj b/prj/vs2022/Neon.vcxproj index eec5e3984a..d355f0900c 100644 --- a/prj/vs2022/Neon.vcxproj +++ b/prj/vs2022/Neon.vcxproj @@ -46,6 +46,7 @@ + diff --git a/prj/vs2022/Neon.vcxproj.filters b/prj/vs2022/Neon.vcxproj.filters index 5f5e92078f..597fbb5761 100644 --- a/prj/vs2022/Neon.vcxproj.filters +++ b/prj/vs2022/Neon.vcxproj.filters @@ -361,6 +361,9 @@ Neon\Filter + + Neon\System + diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 38e12f405d..282024f180 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -273,6 +273,11 @@ SIMD_API uint32_t SimdCrc32c(const void * src, size_t size) if(Sse41::Enable) return Sse41::Crc32c(src, size); else +#endif +#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) + if (Neon::Enable) + return Neon::Crc32c(src, size); + else #endif return Base::Crc32c(src, size); } diff --git a/src/Simd/SimdNeon.h b/src/Simd/SimdNeon.h index 
2359e7c66e..30bda51a1b 100644 --- a/src/Simd/SimdNeon.h +++ b/src/Simd/SimdNeon.h @@ -33,6 +33,10 @@ namespace Simd #ifdef SIMD_NEON_ENABLE namespace Neon { +#if defined(SIMD_ARM64_ENABLE) + uint32_t Crc32c(const void* src, size_t size); +#endif + void AbsDifference(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, uint8_t* c, size_t cStride, size_t width, size_t height); diff --git a/src/Simd/SimdNeonCrc32.cpp b/src/Simd/SimdNeonCrc32.cpp new file mode 100644 index 0000000000..ab5956c547 --- /dev/null +++ b/src/Simd/SimdNeonCrc32.cpp @@ -0,0 +1,62 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2017 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) +#include +#endif + +namespace Simd +{ +#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) + namespace Neon + { + SIMD_INLINE void Crc32c(uint32_t& crc, const uint64_t* p, const uint64_t* end) + { + while (p < end) + crc = __crc32cd(crc, *p++); + } + + SIMD_INLINE void Crc32c(uint32_t& crc, const uint8_t* p, const uint8_t* end) + { + while (p < end) + crc = __crc32cb(crc, *p++); + } + + uint32_t Crc32c(const void* src, size_t size) + { + uint8_t* nose = (uint8_t*)src; + uint64_t* body = (uint64_t*)AlignHi(nose, sizeof(uint64_t)); + uint64_t* tail = (uint64_t*)AlignLo(nose + size, sizeof(uint64_t)); + + uint32_t crc = 0xFFFFFFFF; + Crc32c(crc, nose, (uint8_t*)body); + Crc32c(crc, body, tail); + Crc32c(crc, (uint8_t*)tail, nose + size); + return ~crc; + } + } +#endif +} diff --git a/src/Test/TestCrc32.cpp b/src/Test/TestCrc32.cpp index 6c029fde74..38260a1f4f 100644 --- a/src/Test/TestCrc32.cpp +++ b/src/Test/TestCrc32.cpp @@ -109,6 +109,11 @@ namespace Test result = result && Crc32AutoTest(FUNC(Simd::Sse41::Crc32c), FUNC(SimdCrc32c)); #endif +#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) + if (Simd::Neon::Enable && TestNeon(options)) + result = result && Crc32AutoTest(FUNC(Simd::Neon::Crc32c), FUNC(SimdCrc32c)); +#endif + return result; } } From 4027cee11dae4b20dddd5d4a3ce7359cc7264feb Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 25 May 2026 16:57:05 +0300 Subject: [PATCH 27/32] +add NEON optimizations of function Crc32. --- docs/2026.html | 1 + src/Simd/SimdLib.cpp | 8 ++++++-- src/Simd/SimdNeon.h | 2 ++ src/Simd/SimdNeonCrc32.cpp | 27 +++++++++++++++++++++++++++ src/Test/TestCrc32.cpp | 5 +++++ 5 files changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/2026.html b/docs/2026.html index b46393b7cb..1d572bf4cd 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -51,6 +51,7 @@
      New features
    • Method Frame::Copy.
    • Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.
    • NEON optimizations of function Crc32c.
    • +
    • NEON optimizations of function Crc32.
    Bug fixing
      diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 282024f180..06792097e7 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -260,10 +260,14 @@ SIMD_API void SimdSetAmxFull() #endif } - SIMD_API uint32_t SimdCrc32(const void* src, size_t size) { - return Base::Crc32(src, size); +#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) + if (Neon::Enable) + return Neon::Crc32(src, size); + else +#endif + return Base::Crc32(src, size); } SIMD_API uint32_t SimdCrc32c(const void * src, size_t size) diff --git a/src/Simd/SimdNeon.h b/src/Simd/SimdNeon.h index 30bda51a1b..16e68b1625 100644 --- a/src/Simd/SimdNeon.h +++ b/src/Simd/SimdNeon.h @@ -34,6 +34,8 @@ namespace Simd namespace Neon { #if defined(SIMD_ARM64_ENABLE) + uint32_t Crc32(const void* src, size_t size); + uint32_t Crc32c(const void* src, size_t size); #endif diff --git a/src/Simd/SimdNeonCrc32.cpp b/src/Simd/SimdNeonCrc32.cpp index ab5956c547..b7026b476e 100644 --- a/src/Simd/SimdNeonCrc32.cpp +++ b/src/Simd/SimdNeonCrc32.cpp @@ -33,6 +33,33 @@ namespace Simd #if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) namespace Neon { + SIMD_INLINE void Crc32(uint32_t& crc, const uint64_t* p, const uint64_t* end) + { + while (p < end) + crc = __crc32d(crc, *p++); + } + + SIMD_INLINE void Crc32(uint32_t& crc, const uint8_t* p, const uint8_t* end) + { + while (p < end) + crc = __crc32b(crc, *p++); + } + + uint32_t Crc32(const void* src, size_t size) + { + uint8_t* nose = (uint8_t*)src; + uint64_t* body = (uint64_t*)AlignHi(nose, sizeof(uint64_t)); + uint64_t* tail = (uint64_t*)AlignLo(nose + size, sizeof(uint64_t)); + + uint32_t crc = 0xFFFFFFFF; + Crc32(crc, nose, (uint8_t*)body); + Crc32(crc, body, tail); + Crc32(crc, (uint8_t*)tail, nose + size); + return ~crc; + } + + //-------------------------------------------------------------------------------------------------- + SIMD_INLINE void Crc32c(uint32_t& crc, const uint64_t* p, const uint64_t* end) { while (p < end) diff 
--git a/src/Test/TestCrc32.cpp b/src/Test/TestCrc32.cpp index 38260a1f4f..9d9ccb99d2 100644 --- a/src/Test/TestCrc32.cpp +++ b/src/Test/TestCrc32.cpp @@ -94,6 +94,11 @@ namespace Test if (TestBase(options)) result = result && Crc32AutoTest(FUNC(Simd::Base::Crc32), FUNC(SimdCrc32)); +#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE) + if (Simd::Neon::Enable && TestNeon(options)) + result = result && Crc32AutoTest(FUNC(Simd::Neon::Crc32), FUNC(SimdCrc32)); +#endif + return result; } From 0d40c7b1e8451012002e209aa5b0557a6adff573 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 26 May 2026 10:21:15 +0300 Subject: [PATCH 28/32] +add SVE optimizations of function GetStatistics. --- docs/2026.html | 22 ++++++++++ prj/vs2022/Sve1.vcxproj | 1 + prj/vs2022/Sve1.vcxproj.filters | 3 ++ src/Simd/SimdLib.cpp | 6 +++ src/Simd/SimdSve1.h | 2 + src/Simd/SimdSve1Statistic.cpp | 74 +++++++++++++++++++++++++++++++++ src/Test/TestStatistic.cpp | 5 +++ 7 files changed, 113 insertions(+) create mode 100644 src/Simd/SimdSve1Statistic.cpp diff --git a/docs/2026.html b/docs/2026.html index 1d572bf4cd..633d86ff9c 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -52,10 +52,16 @@
      New features
    • Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.
    • NEON optimizations of function Crc32c.
    • NEON optimizations of function Crc32.
    • +
    • Support of 8-bit BMP in function ImageLoadBmp.
    • +
    • SVE optimizations of function GetStatistics.
    Bug fixing
    • Error in function SimdAlignment for SVE (ARM).
    • +
    • Integer overflow in Base implementation of function JpegProcessFrameHeader.
    • +
    • Checking of correctness in Base::JpegHuffman::Build.
    • +
    • Checking of correctness in Base::JpegToRgba.
    • +
    • Error in ImagePngLoader::ReadTransparency.

    Documentation

    @@ -75,6 +81,22 @@
    Improving
  • Description of function SimdGetFastMode.
  • Description of function SimdSetFastMode.
  • Description of function SimdSetAmxFull.
  • +
  • Description of function SimdCrc32.
  • +
  • Description of function SimdCrc32c.
  • +
  • Description of function SimdAbsDifference.
  • +
  • Description of function SimdAbsDifferenceSum.
  • +
  • Description of function SimdAbsDifferenceSumMasked.
  • +
  • Description of function SimdAbsDifferenceSums3x3.
  • +
  • Description of function SimdAbsDifferenceSums3x3Masked.
  • +
  • Description of function SimdAbsGradientSaturatedSum.
  • +
  • Description of function SimdAddFeatureDifference.
  • +
  • Description of function SimdAlphaBlending.
  • +
  • Description of function SimdAlphaBlending2x.
  • +
  • Description of function SimdAlphaBlendingBgraToYuv420p.
  • +
  • Description of function SimdAlphaBlendingUniform.
  • +
  • Description of function SimdAlphaFilling.
  • +
  • Description of function SimdAlphaPremultiply.
  • +
  • Description of function SimdAlphaUnpremultiply.
  • Home diff --git a/prj/vs2022/Sve1.vcxproj b/prj/vs2022/Sve1.vcxproj index a960799e61..8dd4b60bf8 100644 --- a/prj/vs2022/Sve1.vcxproj +++ b/prj/vs2022/Sve1.vcxproj @@ -30,6 +30,7 @@ + diff --git a/prj/vs2022/Sve1.vcxproj.filters b/prj/vs2022/Sve1.vcxproj.filters index 87fc1e9732..6ccbb1ff48 100644 --- a/prj/vs2022/Sve1.vcxproj.filters +++ b/prj/vs2022/Sve1.vcxproj.filters @@ -59,5 +59,8 @@ Sve1\Motion + + Sve1\Statistics + \ No newline at end of file diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 06792097e7..115af6c345 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -252,6 +252,7 @@ SIMD_API void SimdEmpty() Sse41::Empty(); #endif } + SIMD_API void SimdSetAmxFull() { #ifdef SIMD_AMXBF16_ENABLE @@ -4954,6 +4955,11 @@ SIMD_API void SimdGetStatistic(const uint8_t * src, size_t stride, size_t width, Sse41::GetStatistic(src, stride, width, height, min, max, average); else #endif +#ifdef SIMD_SVE_ENABLE + if (Sve::Enable) + Sve::GetStatistic(src, stride, width, height, min, max, average); + else +#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) Neon::GetStatistic(src, stride, width, height, min, max, average); diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h index 853f9b4897..1e7d5274fd 100644 --- a/src/Simd/SimdSve1.h +++ b/src/Simd/SimdSve1.h @@ -75,6 +75,8 @@ namespace Simd void OperationBinary8u(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, size_t channelCount, uint8_t* dst, size_t dstStride, SimdOperationBinary8uType type); void OperationBinary16i(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint8_t* dst, size_t dstStride, SimdOperationBinary16iType type); + + void GetStatistic(const uint8_t* src, size_t stride, size_t width, size_t height, uint8_t* min, uint8_t* max, uint8_t* average); } #endif } diff --git a/src/Simd/SimdSve1Statistic.cpp b/src/Simd/SimdSve1Statistic.cpp new file mode 100644 index 
0000000000..f9d1e58c02 --- /dev/null +++ b/src/Simd/SimdSve1Statistic.cpp @@ -0,0 +1,74 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2023 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ +#ifdef SIMD_SVE_ENABLE + namespace Sve + { + SIMD_INLINE void UpdateStatistic(const uint8_t* src, svbool_t mask, svuint8_t _1, svuint8_t& min, svuint8_t& max, svuint32_t& sum) + { + svuint8_t val = svld1_u8(mask, src); + min = svmin_u8_m(mask, min, val); + max = svmax_u8_m(mask, max, val); + sum = svdot_u32(sum, val, _1); + } + + void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t * min, uint8_t * max, uint8_t * average) + { + assert(width*height); + + size_t A = svlen(svuint8_t()); + size_t widthA = AlignLo(width, A); + const svbool_t body = svptrue_b32(); + const svbool_t tail = svwhilelt_b8(widthA, width); + + svuint8_t _1 = svdup_n_u8(1); + svuint8_t _min = svdup_n_u8(255); + svuint8_t _max = svdup_n_u8(0); + uint64_t sum = 0; + for (size_t row = 0; row < height; ++row) + { + size_t col = 0; + svuint32_t _sum = svdup_n_u32(0); + for (; col < widthA; col += A) + UpdateStatistic(src + col, body, _1, _min, _max, _sum); + if (widthA < width) + UpdateStatistic(src + col, tail, _1, _min, _max, _sum); + sum += svaddv_u32(svptrue_b32(), _sum); + src += stride; + } + + *min = svminv_u8(svptrue_b32(), _min); + *max = svmaxv_u8(svptrue_b32(), _max); + *average = (uint8_t)((sum + width*height / 2) / (width*height)); + } + } +#endif +} diff --git a/src/Test/TestStatistic.cpp b/src/Test/TestStatistic.cpp index db00029a42..6426752a11 100644 --- a/src/Test/TestStatistic.cpp +++ b/src/Test/TestStatistic.cpp @@ -113,6 +113,11 @@ namespace Test result = result && GetStatisticAutoTest(FUNC1(Simd::Neon::GetStatistic), FUNC1(SimdGetStatistic)); #endif +#ifdef SIMD_SVE_ENABLE + if (Simd::Sve::Enable && TestSve(options)) + result = result && GetStatisticAutoTest(FUNC1(Simd::Sve::GetStatistic), FUNC1(SimdGetStatistic)); +#endif + #ifdef SIMD_HVX_ENABLE if (Simd::Hvx::Enable && TestHvx(options) && W >= 
Simd::Hvx::A) { From 7aca8cee313160b681d9c07c464e34e9b3fdba8c Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 26 May 2026 10:32:27 +0300 Subject: [PATCH 29/32] *fix error in GetStatisticAutoTest. --- docs/2026.html | 6 ++++++ src/Test/TestCompare.h | 9 ++++++++- src/Test/TestStatistic.cpp | 6 +++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/2026.html b/docs/2026.html index 633d86ff9c..38614265fe 100644 --- a/docs/2026.html +++ b/docs/2026.html @@ -64,6 +64,12 @@
    Bug fixing
  • Error in ImagePngLoader::ReadTransparency.
  • +

    Test framework

    +
    Bug fixing
    +
      +
    • Error in test GetStatisticAutoTest.
    • +
    +

    Documentation

    Improving