From e845e50ad3747f18ad680007b9cd64a05f3fce0c Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 16:48:33 +0300
Subject: [PATCH 01/32] +add SVE optimizations of function AbsDifferenceSum.

---
 docs/2026.html                        |  9 ++++
 prj/vs2022/Sve1.vcxproj               |  1 +
 prj/vs2022/Sve1.vcxproj.filters       |  6 +++
 src/Simd/SimdLib.cpp                  |  5 ++
 src/Simd/SimdSve1.h                   |  2 +
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 68 +++++++++++++++++++++++++++
 src/Test/TestDifferenceSum.cpp        |  6 ++-
 7 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 src/Simd/SimdSve1AbsDifferenceSum.cpp
diff --git a/docs/2026.html b/docs/2026.html
index 32d449f5b7..7a10dd1896 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -35,6 +35,15 @@ <h1>Simd Library Release Notes (2026).</h1>
  <a href="2013.html">2013</a>
 </center>
 
+<a href="#HOME">Home</a>
+<hr/>
+<h3 id="R162">June X, 2026 (version 7.1.162)</h3> 
+<h4>Algorithms</h4>
+<h5>New features</h5>
+<ul>
+ <li>SVE optimizations of function AbsDifferenceSum.</li>
+</ul>
+
 <a href="#HOME">Home</a>
 <hr/>
 <h3 id="R161">May 4, 2026 (version 7.1.161)</h3> 
diff --git a/prj/vs2022/Sve1.vcxproj b/prj/vs2022/Sve1.vcxproj
index 75c0879b23..93c5d19a12 100644
--- a/prj/vs2022/Sve1.vcxproj
+++ b/prj/vs2022/Sve1.vcxproj
@@ -22,6 +22,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClCompile Include="..\..\src\Simd\SimdSve1AbsDifference.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdSve1AbsDifferenceSum.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1AbsGradientSaturatedSum.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1BgrToRgb.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1Cpu.cpp" />
diff --git a/prj/vs2022/Sve1.vcxproj.filters b/prj/vs2022/Sve1.vcxproj.filters
index 3ecc6a5937..3356e32b25 100644
--- a/prj/vs2022/Sve1.vcxproj.filters
+++ b/prj/vs2022/Sve1.vcxproj.filters
@@ -19,6 +19,9 @@
     <Filter Include="Sve1\Convert">
       <UniqueIdentifier>{9b881931-b5e2-4e02-85e0-c84c24f7eacb}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Sve1\Statistics">
+      <UniqueIdentifier>{b123ae36-270f-4b5c-8b87-372f66f78ba6}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\Simd\SimdSve1.h">
@@ -47,5 +50,8 @@
     <ClCompile Include="..\..\src\Simd\SimdSve1BgrToRgb.cpp">
       <Filter>Sve1\Convert</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdSve1AbsDifferenceSum.cpp">
+      <Filter>Sve1\Statistics</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 770c4525bf..9922a57a8c 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -333,6 +333,11 @@ SIMD_API void SimdAbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8
         Sse41::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
         Neon::AbsDifferenceSum(a, aStride, b, bStride, width, height, sum);
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index 165dc7b2ef..cd021477c5 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -35,6 +35,8 @@ namespace Simd
     {
         void AbsDifference(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, uint8_t* c, size_t cStride, size_t width, size_t height);
 
+        void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum);
+
         void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride);
 
         void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
new file mode 100644
index 0000000000..4d7c0daa6a
--- /dev/null
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -0,0 +1,68 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2022 Yermalayeu Ihar,
+*               2022-2022 Fabien Spindler,
+*               2022-2022 Souriya Trinh.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdSve1.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SVE_ENABLE
+    namespace Sve
+    {
+        void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum)
+        {
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svwhilelt_b8(size_t(0), A);
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+            //svuint8_t _a
+            //svuint64_t _sum = svdup_n_u64(0);
+            uint64_t _sum = 0;
+            for (size_t row = 0; row < height; ++row)
+            {
+                //svuint32_t rowSum = svdup_n_u32(0);
+                size_t col = 0;
+                for (; col < widthA; col += A)
+                {
+                    svuint8_t _a = svld1_u8(body, a + col);
+                    svuint8_t _b = svld1_u8(body, b + col);
+                    svuint8_t abd = svabd_x(body, _a, _b);
+                    _sum += svaddv_u8(body, abd);
+                }
+                if (widthA < width)
+                {
+                    svuint8_t _a = svld1_u8(tail, a + col);
+                    svuint8_t _b = svld1_u8(tail, b + col);
+                    svuint8_t abd = svabd_x(tail, _a, _b);
+                    _sum += svaddv_u8(tail, abd);
+                }
+                a += aStride;
+                b += bStride;
+            }
+            *sum = _sum;
+        }
+    }
+#endif
+}
diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp
index 130955ac0e..10cd0257d7 100644
--- a/src/Test/TestDifferenceSum.cpp
+++ b/src/Test/TestDifferenceSum.cpp
@@ -215,7 +215,6 @@ namespace Test
         if (Simd::Neon::Enable && TestNeon(options))
             result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Neon::SquaredDifferenceSum), FUNC_S(SimdSquaredDifferenceSum), 1);
 #endif
-
         return result;
     }
 
@@ -276,6 +275,11 @@ namespace Test
             result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Neon::AbsDifferenceSum), FUNC_S(SimdAbsDifferenceSum), 1);
 #endif
 
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Sve::AbsDifferenceSum), FUNC_S(SimdAbsDifferenceSum), 1);
+#endif
+
 #ifdef SIMD_HVX_ENABLE
         if (Simd::Hvx::Enable && TestHvx(options) && W >= Simd::Hvx::A)
             result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Hvx::AbsDifferenceSum), FUNC_S(SimdAbsDifferenceSum), 1);

From 99ab1443d53848de43a04ccc09e9c5c66d246431 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 17:02:09 +0300
Subject: [PATCH 02/32] +add SVE optimizations of function AbsDifferenceSum
 (version 2).

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 4d7c0daa6a..39afea2897 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -37,31 +37,30 @@ namespace Simd
             size_t widthA = AlignLo(width, A);
             const svbool_t body = svwhilelt_b8(size_t(0), A);
             const svbool_t tail = svwhilelt_b8(widthA, width);
-            //svuint8_t _a
-            //svuint64_t _sum = svdup_n_u64(0);
-            uint64_t _sum = 0;
+            svuint8_t _1 = svdup_n_u8(1);
+            *sum = 0;
             for (size_t row = 0; row < height; ++row)
             {
-                //svuint32_t rowSum = svdup_n_u32(0);
                 size_t col = 0;
+                svuint32_t _sum = svdup_n_u32(0);
                 for (; col < widthA; col += A)
                 {
                     svuint8_t _a = svld1_u8(body, a + col);
                     svuint8_t _b = svld1_u8(body, b + col);
                     svuint8_t abd = svabd_x(body, _a, _b);
-                    _sum += svaddv_u8(body, abd);
+                    _sum = svdot_u32(_sum, abd, _1);
                 }
                 if (widthA < width)
                 {
                     svuint8_t _a = svld1_u8(tail, a + col);
                     svuint8_t _b = svld1_u8(tail, b + col);
                     svuint8_t abd = svabd_x(tail, _a, _b);
-                    _sum += svaddv_u8(tail, abd);
+                    _sum = svdot_u32(_sum, abd, _1);
                 }
+                *sum += svaddv_u32(svptrue_b32(), _sum);
                 a += aStride;
                 b += bStride;
             }
-            *sum = _sum;
         }
     }
 #endif

From 5cb414677394fd4f8bd74bc7e2005f4394d13897 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 17:39:43 +0300
Subject: [PATCH 03/32] +add SVE optimizations of function
 AbsDifferenceSumMasked.

---
 docs/2026.html                        |  1 +
 src/Simd/SimdLib.cpp                  |  5 ++++
 src/Simd/SimdNeonAbsDifferenceSum.cpp |  8 +++++-
 src/Simd/SimdSve1.h                   |  3 ++
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 40 +++++++++++++++++++++++++++
 src/Test/TestDifferenceSum.cpp        |  5 ++++
 6 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/docs/2026.html b/docs/2026.html
index 7a10dd1896..5f8f4d4d71 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -42,6 +42,7 @@ <h4>Algorithms</h4>
 <h5>New features</h5>
 <ul>
  <li>SVE optimizations of function AbsDifferenceSum.</li>
+ <li>SVE optimizations of function AbsDifferenceSumMasked.</li>
 </ul>
 
 <a href="#HOME">Home</a>
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 9922a57a8c..e304def840 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -370,6 +370,11 @@ SIMD_API void SimdAbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const
         Sse41::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
         Neon::AbsDifferenceSumMasked(a, aStride, b, bStride, mask, maskStride, index, width, height, sum);
diff --git a/src/Simd/SimdNeonAbsDifferenceSum.cpp b/src/Simd/SimdNeonAbsDifferenceSum.cpp
index c3b97857db..360a2aa4f4 100644
--- a/src/Simd/SimdNeonAbsDifferenceSum.cpp
+++ b/src/Simd/SimdNeonAbsDifferenceSum.cpp
@@ -75,6 +75,8 @@ namespace Simd
                 AbsDifferenceSum<false>(a, aStride, b, bStride, width, height, sum);
         }
 
+        //--------------------------------------------------------------------------------------------------
+
         template <bool align> void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
             const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
         {
@@ -131,6 +133,8 @@ namespace Simd
                 AbsDifferenceSumMasked<false>(a, aStride, b, bStride, mask, maskStride, index, width, height, sum);
         }
 
+        //--------------------------------------------------------------------------------------------------
+
         template <bool align> void AbsDifferenceSums3(uint8x16_t current, const uint8_t * background, uint16x8_t sums[3])
         {
             sums[0] = vaddq_u16(sums[0], vpaddlq_u8(vabdq_u8(current, Load<align>(background - 1))));
@@ -228,6 +232,8 @@ namespace Simd
                 AbsDifferenceSums3x3<false>(current, currentStride, background, backgroundStride, width, height, sums);
         }
 
+        //--------------------------------------------------------------------------------------------------
+
         template <bool align> void AbsDifferenceSums3Masked16(uint8x16_t current, const uint8_t * background, uint8x16_t mask, uint16x8_t sums[3])
         {
             sums[0] = vaddq_u16(sums[0], vpaddlq_u8(vabdq_u8(current, vandq_u8(mask, Load<align>(background - 1)))));
@@ -318,5 +324,5 @@ namespace Simd
                 AbsDifferenceSums3x3Masked<false>(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums);
         }
     }
-#endif// SIMD_NEON_ENABLE
+#endif
 }
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index cd021477c5..51dff21046 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -37,6 +37,9 @@ namespace Simd
 
         void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum);
 
+        void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride,
+            const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum);
+
         void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride);
 
         void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 39afea2897..e27146e917 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -62,6 +62,46 @@ namespace Simd
                 b += bStride;
             }
         }
+
+        //--------------------------------------------------------------------------------------------------
+
+        void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride,
+            const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum)
+        {
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svwhilelt_b8(size_t(0), A);
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+            svuint8_t _i = svdup_n_u8(index), _1 = svdup_n_u8(1);
+            *sum = 0;
+            for (size_t row = 0; row < height; ++row)
+            {
+                size_t col = 0;
+                svuint32_t _sum = svdup_n_u32(0);
+                for (; col < widthA; col += A)
+                {
+                    svuint8_t _a = svld1_u8(body, a + col);
+                    svuint8_t _b = svld1_u8(body, b + col);
+                    svuint8_t _m = svld1_u8(body, mask + col);
+                    svbool_t _mask = svcmpeq_u8(body, _m, _i);
+                    svuint8_t abd = svabd_x(_mask, _a, _b);
+                    _sum = svdot_u32(_sum, abd, _1);
+                }
+                if (widthA < width)
+                {
+                    svuint8_t _a = svld1_u8(tail, a + col);
+                    svuint8_t _b = svld1_u8(tail, b + col);
+                    svuint8_t _m = svld1_u8(tail, mask + col);
+                    svbool_t _mask = svcmpeq_u8(tail, _m, _i);
+                    svuint8_t abd = svabd_x(_mask, _a, _b);
+                    _sum = svdot_u32(_sum, abd, _1);
+                }
+                *sum += svaddv_u32(svptrue_b32(), _sum);
+                a += aStride;
+                b += bStride;
+                mask += maskStride;
+            }
+        }
     }
 #endif
 }
diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp
index 10cd0257d7..003fdcc6f0 100644
--- a/src/Test/TestDifferenceSum.cpp
+++ b/src/Test/TestDifferenceSum.cpp
@@ -315,6 +315,11 @@ namespace Test
             result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Neon::AbsDifferenceSumMasked), FUNC_M(SimdAbsDifferenceSumMasked), 1);
 #endif 
 
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Sve::AbsDifferenceSumMasked), FUNC_M(SimdAbsDifferenceSumMasked), 1);
+#endif 
+
         return result;
     }
 

From d763831fcd811d730583cbaa04044e6b36a3b805 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 17:50:39 +0300
Subject: [PATCH 04/32] *fix bug in SVE optimizations of function
 AbsDifferenceSumMasked.

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index e27146e917..f1b6201c62 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -84,7 +84,7 @@ namespace Simd
                     svuint8_t _b = svld1_u8(body, b + col);
                     svuint8_t _m = svld1_u8(body, mask + col);
                     svbool_t _mask = svcmpeq_u8(body, _m, _i);
-                    svuint8_t abd = svabd_x(_mask, _a, _b);
+                    svuint8_t abd = svabd_z(_mask, _a, _b);
                     _sum = svdot_u32(_sum, abd, _1);
                 }
                 if (widthA < width)
@@ -93,7 +93,7 @@ namespace Simd
                     svuint8_t _b = svld1_u8(tail, b + col);
                     svuint8_t _m = svld1_u8(tail, mask + col);
                     svbool_t _mask = svcmpeq_u8(tail, _m, _i);
-                    svuint8_t abd = svabd_x(_mask, _a, _b);
+                    svuint8_t abd = svabd_z(_mask, _a, _b);
                     _sum = svdot_u32(_sum, abd, _1);
                 }
                 *sum += svaddv_u32(svptrue_b32(), _sum);

From edb89d9cd31b5740470cf251f6ed2c786e6073f4 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 19:56:44 +0300
Subject: [PATCH 05/32] *refactoring of SVE optimizations of functions
 AbsDifferenceSum and AbsDifferenceSumMasked.

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 52 ++++++++++++---------------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index f1b6201c62..b2c8fcc6f5 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -31,6 +31,14 @@ namespace Simd
 #ifdef SIMD_SVE_ENABLE
     namespace Sve
     {
+        SIMD_INLINE void AbsDifferenceSum(const uint8_t* a, const uint8_t* b, const svuint8_t& _1, const svbool_t & mask, svuint32_t & sum)
+        {
+            svuint8_t _a = svld1_u8(mask, a);
+            svuint8_t _b = svld1_u8(mask, b);
+            svuint8_t abd = svabd_x(mask, _a, _b);
+            sum = svdot_u32(sum, abd, _1);
+        }
+
         void AbsDifferenceSum(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint64_t* sum)
         {
             size_t A = svlen(svuint8_t());
@@ -44,19 +52,9 @@ namespace Simd
                 size_t col = 0;
                 svuint32_t _sum = svdup_n_u32(0);
                 for (; col < widthA; col += A)
-                {
-                    svuint8_t _a = svld1_u8(body, a + col);
-                    svuint8_t _b = svld1_u8(body, b + col);
-                    svuint8_t abd = svabd_x(body, _a, _b);
-                    _sum = svdot_u32(_sum, abd, _1);
-                }
+                    AbsDifferenceSum(a + col, b + col, _1, body, _sum);
                 if (widthA < width)
-                {
-                    svuint8_t _a = svld1_u8(tail, a + col);
-                    svuint8_t _b = svld1_u8(tail, b + col);
-                    svuint8_t abd = svabd_x(tail, _a, _b);
-                    _sum = svdot_u32(_sum, abd, _1);
-                }
+                    AbsDifferenceSum(a + col, b + col, _1, tail, _sum);
                 *sum += svaddv_u32(svptrue_b32(), _sum);
                 a += aStride;
                 b += bStride;
@@ -65,6 +63,16 @@ namespace Simd
 
         //--------------------------------------------------------------------------------------------------
 
+        SIMD_INLINE void AbsDifferenceSumMasked(const uint8_t* a, const uint8_t* b, const uint8_t* m, const svuint8_t& _1, const svuint8_t& index, const svbool_t& mask, svuint32_t& sum)
+        {
+            svuint8_t _a = svld1_u8(mask, a);
+            svuint8_t _b = svld1_u8(mask, b);
+            svuint8_t _m = svld1_u8(mask, m);
+            svbool_t _mask = svcmpeq_u8(mask, _m, index);
+            svuint8_t abd = svabd_z(_mask, _a, _b);
+            sum = svdot_u32(sum, abd, _1);
+        }
+
         void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride,
             const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum)
         {
@@ -72,30 +80,16 @@ namespace Simd
             size_t widthA = AlignLo(width, A);
             const svbool_t body = svwhilelt_b8(size_t(0), A);
             const svbool_t tail = svwhilelt_b8(widthA, width);
-            svuint8_t _i = svdup_n_u8(index), _1 = svdup_n_u8(1);
+            svuint8_t _index = svdup_n_u8(index), _1 = svdup_n_u8(1);
             *sum = 0;
             for (size_t row = 0; row < height; ++row)
             {
                 size_t col = 0;
                 svuint32_t _sum = svdup_n_u32(0);
                 for (; col < widthA; col += A)
-                {
-                    svuint8_t _a = svld1_u8(body, a + col);
-                    svuint8_t _b = svld1_u8(body, b + col);
-                    svuint8_t _m = svld1_u8(body, mask + col);
-                    svbool_t _mask = svcmpeq_u8(body, _m, _i);
-                    svuint8_t abd = svabd_z(_mask, _a, _b);
-                    _sum = svdot_u32(_sum, abd, _1);
-                }
+                    AbsDifferenceSumMasked(a + col, b + col, mask + col, _1, _index, body, _sum);
                 if (widthA < width)
-                {
-                    svuint8_t _a = svld1_u8(tail, a + col);
-                    svuint8_t _b = svld1_u8(tail, b + col);
-                    svuint8_t _m = svld1_u8(tail, mask + col);
-                    svbool_t _mask = svcmpeq_u8(tail, _m, _i);
-                    svuint8_t abd = svabd_z(_mask, _a, _b);
-                    _sum = svdot_u32(_sum, abd, _1);
-                }
+                    AbsDifferenceSumMasked(a + col, b + col, mask + col, _1, _index, tail, _sum);
                 *sum += svaddv_u32(svptrue_b32(), _sum);
                 a += aStride;
                 b += bStride;

From 720f4ee4eb7bb865a68e047d8d9a4268f28e562e Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 20:55:46 +0300
Subject: [PATCH 06/32] +add SVE optimizations of function
 AbsDifferenceSums3x3.

---
 docs/2026.html                        |  1 +
 src/Simd/SimdLib.cpp                  |  5 +++
 src/Simd/SimdSve1.h                   |  3 ++
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 60 +++++++++++++++++++++++++++
 src/Test/TestDifferenceSum.cpp        |  5 +++
 5 files changed, 74 insertions(+)

diff --git a/docs/2026.html b/docs/2026.html
index 5f8f4d4d71..c09e0834de 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -43,6 +43,7 @@ <h5>New features</h5>
 <ul>
  <li>SVE optimizations of function AbsDifferenceSum.</li>
  <li>SVE optimizations of function AbsDifferenceSumMasked.</li>
+ <li>SVE optimizations of function AbsDifferenceSums3x3.</li>
 </ul>
 
 <a href="#HOME">Home</a>
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index e304def840..7db27e4646 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -402,6 +402,11 @@ SIMD_API void SimdAbsDifferenceSums3x3(const uint8_t *current, size_t currentStr
         Sse41::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A + 2)
         Neon::AbsDifferenceSums3x3(current, currentStride, background, backgroundStride, width, height, sums);
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index 51dff21046..ad8f3f3fc9 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -40,6 +40,9 @@ namespace Simd
         void AbsDifferenceSumMasked(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride,
             const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sum);
 
+        void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride,
+            size_t width, size_t height, uint64_t* sums);
+
         void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride);
 
         void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index b2c8fcc6f5..1a24e8c683 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -96,6 +96,66 @@ namespace Simd
                 mask += maskStride;
             }
         }
+
+        //--------------------------------------------------------------------------------------------------
+
+        SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t sums)
+        {
+            svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1));
+            svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1));
+            svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1));
+        }
+
+        SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, 
+            const svbool_t& mask, svuint32x3_t &sums0, svuint32x3_t &sums3, svuint32x3_t& sums6)
+        {
+            svuint8_t _current = svld1_u8(mask, current);
+            AbsDifferenceSums3(_current, background - stride, _1, mask, sums0);
+            AbsDifferenceSums3(_current, background, _1, mask, sums3);
+            AbsDifferenceSums3(_current, background + stride, _1, mask, sums6);
+        }
+
+        SIMD_INLINE void AddRowSums3(svuint32x3_t src, uint64_t* dst)
+        {
+            dst[0] += svaddv_u32(svptrue_b32(), svget3(src, 0));
+            dst[1] += svaddv_u32(svptrue_b32(), svget3(src, 1));
+            dst[2] += svaddv_u32(svptrue_b32(), svget3(src, 2));
+        }
+
+        void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums)
+        {
+            assert(height > 2 && width > 2);
+
+            width -= 2;
+            height -= 2;
+            current += 1 + currentStride;
+            background += 1 + backgroundStride;
+
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svwhilelt_b8(size_t(0), A);
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+            svuint8_t _1 = svdup_n_u8(1);
+
+            for (size_t i = 0; i < 9; ++i)
+                sums[i] = 0;
+            for (size_t row = 0; row < height; ++row)
+            {
+                svuint32x3_t sums0 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0));
+                svuint32x3_t sums3 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0));
+                svuint32x3_t sums6 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0));
+                size_t col = 0;
+                for (; col < widthA; col += A)
+                    AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, body, sums0, sums3, sums6);
+                if (widthA < width)
+                    AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, tail, sums0, sums3, sums6);
+                AddRowSums3(sums0, sums + 0);
+                AddRowSums3(sums3, sums + 3);
+                AddRowSums3(sums6, sums + 6);
+                current += currentStride;
+                background += backgroundStride;
+            }
+        }
     }
 #endif
 }
diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp
index 003fdcc6f0..99ad5c3272 100644
--- a/src/Test/TestDifferenceSum.cpp
+++ b/src/Test/TestDifferenceSum.cpp
@@ -350,6 +350,11 @@ namespace Test
             result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Neon::AbsDifferenceSums3x3), FUNC_S(SimdAbsDifferenceSums3x3), 9);
 #endif
 
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && DifferenceSumsAutoTest(FUNC_S(Simd::Sve::AbsDifferenceSums3x3), FUNC_S(SimdAbsDifferenceSums3x3), 9);
+#endif
+
         return result;
     }
 

From fbceae9c926879998bad5fc9f0fca7255cba741a Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 4 May 2026 21:01:50 +0300
Subject: [PATCH 07/32] *fix bug in Sve::AbsDifferenceSums3x3.

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 1a24e8c683..5a7f4728d8 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -99,11 +99,11 @@ namespace Simd
 
         //--------------------------------------------------------------------------------------------------
 
-        SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t sums)
+        SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t & sums)
         {
-            svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1));
-            svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1));
-            svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1));
+            sums = svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1));
+            sums = svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1));
+            sums = svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1));
         }
 
         SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, 

From 1293ef7dfbd5668fb5e80bf7d710cd85bdfb2920 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 09:08:18 +0300
Subject: [PATCH 08/32] *fix bug in Sve::AbsDifferenceSums3x3 (part 2).

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 51 ++++++++++++++++-----------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 5a7f4728d8..8545b979ff 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -99,27 +99,35 @@ namespace Simd
 
         //--------------------------------------------------------------------------------------------------
 
-        SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, svuint32x3_t & sums)
+        SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, 
+            svuint32_t &sum0, svuint32_t& sum1, svuint32_t& sum2)
         {
-            sums = svset3(sums, 0, svdot_u32(svget3(sums, 0), svabd_x(mask, current, svld1_u8(mask, background - 1)), _1));
-            sums = svset3(sums, 1, svdot_u32(svget3(sums, 1), svabd_x(mask, current, svld1_u8(mask, background)), _1));
-            sums = svset3(sums, 2, svdot_u32(svget3(sums, 2), svabd_x(mask, current, svld1_u8(mask, background + 1)), _1));
+            sum0 = svdot_u32(sum0, svabd_x(mask, current, svld1_u8(mask, background - 1)), _1);
+            sum1 = svdot_u32(sum1, svabd_x(mask, current, svld1_u8(mask, background)), _1);
+            sum2 = svdot_u32(sum2, svabd_x(mask, current, svld1_u8(mask, background + 1)), _1);
         }
 
-        SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, 
-            const svbool_t& mask, svuint32x3_t &sums0, svuint32x3_t &sums3, svuint32x3_t& sums6)
+        SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, const svbool_t& mask, 
+            svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2, svuint32_t& sum3, svuint32_t& sum4, svuint32_t& sum5, svuint32_t& sum6, svuint32_t& sum7, svuint32_t& sum8)
         {
             svuint8_t _current = svld1_u8(mask, current);
-            AbsDifferenceSums3(_current, background - stride, _1, mask, sums0);
-            AbsDifferenceSums3(_current, background, _1, mask, sums3);
-            AbsDifferenceSums3(_current, background + stride, _1, mask, sums6);
+            AbsDifferenceSums3(_current, background - stride, _1, mask, sum0, sum1, sum2);
+            AbsDifferenceSums3(_current, background, _1, mask, sum3, sum4, sum5);
+            AbsDifferenceSums3(_current, background + stride, _1, mask, sum6, sum7, sum8);
         }
 
-        SIMD_INLINE void AddRowSums3(svuint32x3_t src, uint64_t* dst)
+        SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2)
         {
-            dst[0] += svaddv_u32(svptrue_b32(), svget3(src, 0));
-            dst[1] += svaddv_u32(svptrue_b32(), svget3(src, 1));
-            dst[2] += svaddv_u32(svptrue_b32(), svget3(src, 2));
+            sum0 = svdup_n_u32(0);
+            sum1 = svdup_n_u32(0);
+            sum2 = svdup_n_u32(0);
+        }
+
+        SIMD_INLINE void AddSums(const svuint32_t& sum0, const svuint32_t& sum1, const svuint32_t& sum2, uint64_t* sums)
+        {
+            sums[0] += svaddv_u32(svptrue_b32(), sum0);
+            sums[1] += svaddv_u32(svptrue_b32(), sum1);
+            sums[2] += svaddv_u32(svptrue_b32(), sum2);
         }
 
         void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums)
@@ -139,19 +147,20 @@ namespace Simd
 
             for (size_t i = 0; i < 9; ++i)
                 sums[i] = 0;
+            svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
             for (size_t row = 0; row < height; ++row)
             {
-                svuint32x3_t sums0 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0));
-                svuint32x3_t sums3 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0));
-                svuint32x3_t sums6 = svcreate3_u32(svdup_n_u32(0), svdup_n_u32(0), svdup_n_u32(0));
+                ClearSums(s0, s1, s3);
+                ClearSums(s3, s4, s5);
+                ClearSums(s6, s7, s8);
                 size_t col = 0;
                 for (; col < widthA; col += A)
-                    AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, body, sums0, sums3, sums6);
+                    AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, body, s0, s1, s2, s3, s4, s5, s6, s7, s8);
                 if (widthA < width)
-                    AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, tail, sums0, sums3, sums6);
-                AddRowSums3(sums0, sums + 0);
-                AddRowSums3(sums3, sums + 3);
-                AddRowSums3(sums6, sums + 6);
+                    AbsDifferenceSums3x3(current + col, background + col, backgroundStride, _1, tail, s0, s1, s2, s3, s4, s5, s6, s7, s8);
+                AddSums(s0, s1, s2, sums + 0);
+                AddSums(s3, s4, s5, sums + 3);
+                AddSums(s6, s7, s8, sums + 6);
                 current += currentStride;
                 background += backgroundStride;
             }

From 50a2f75592d68768b23983b2db4cd0a6a67f112e Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 09:13:41 +0300
Subject: [PATCH 09/32] *fix bug in Sve::AbsDifferenceSums3x3 (part 3).

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 8545b979ff..6d55d57823 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -108,12 +108,12 @@ namespace Simd
         }
 
         SIMD_INLINE void AbsDifferenceSums3x3(const uint8_t* current, const uint8_t* background, size_t stride, const svuint8_t& _1, const svbool_t& mask, 
-            svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2, svuint32_t& sum3, svuint32_t& sum4, svuint32_t& sum5, svuint32_t& sum6, svuint32_t& sum7, svuint32_t& sum8)
+            svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5, svuint32_t& s6, svuint32_t& s7, svuint32_t& s8)
         {
             svuint8_t _current = svld1_u8(mask, current);
-            AbsDifferenceSums3(_current, background - stride, _1, mask, sum0, sum1, sum2);
-            AbsDifferenceSums3(_current, background, _1, mask, sum3, sum4, sum5);
-            AbsDifferenceSums3(_current, background + stride, _1, mask, sum6, sum7, sum8);
+            AbsDifferenceSums3(_current, background - stride, _1, mask, s0, s1, s2);
+            AbsDifferenceSums3(_current, background, _1, mask, s3, s4, s5);
+            AbsDifferenceSums3(_current, background + stride, _1, mask, s6, s7, s8);
         }
 
         SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2)
@@ -150,7 +150,7 @@ namespace Simd
             svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
             for (size_t row = 0; row < height; ++row)
             {
-                ClearSums(s0, s1, s3);
+                ClearSums(s0, s1, s2);
                 ClearSums(s3, s4, s5);
                 ClearSums(s6, s7, s8);
                 size_t col = 0;

From 9a91bcfefa3ac9f1a550fdb62965e36ae8147bfd Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 10:02:08 +0300
Subject: [PATCH 10/32] +add SVE optimizations of function
 AbsDifferenceSums3x3Masked.

---
 docs/2026.html                        |  1 +
 src/Simd/SimdLib.cpp                  |  5 +++
 src/Simd/SimdSve1.h                   |  3 ++
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 60 +++++++++++++++++++++++++++
 src/Test/TestDifferenceSum.cpp        |  5 +++
 5 files changed, 74 insertions(+)

diff --git a/docs/2026.html b/docs/2026.html
index c09e0834de..d35a59563b 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -44,6 +44,7 @@ <h5>New features</h5>
  <li>SVE optimizations of function AbsDifferenceSum.</li>
  <li>SVE optimizations of function AbsDifferenceSumMasked.</li>
  <li>SVE optimizations of function AbsDifferenceSums3x3.</li>
+ <li>SVE optimizations of function AbsDifferenceSums3x3Masked.</li>
 </ul>
 
 <a href="#HOME">Home</a>
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 7db27e4646..6afbc92956 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -434,6 +434,11 @@ SIMD_API void SimdAbsDifferenceSums3x3Masked(const uint8_t *current, size_t curr
         Sse41::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A + 2)
         Neon::AbsDifferenceSums3x3Masked(current, currentStride, background, backgroundStride, mask, maskStride, index, width, height, sums);
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index ad8f3f3fc9..293d6bf276 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -43,6 +43,9 @@ namespace Simd
         void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride,
             size_t width, size_t height, uint64_t* sums);
 
+        void AbsDifferenceSums3x3Masked(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride,
+            const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sums);
+
         void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride);
 
         void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 6d55d57823..6efa0d0a94 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -165,6 +165,66 @@ namespace Simd
                 background += backgroundStride;
             }
         }
+
+        //--------------------------------------------------------------------------------------------------
+
+        SIMD_INLINE void AbsDifferenceSums3Masked(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask,
+            svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2)
+        {
+            sum0 = svdot_u32(sum0, svabd_z(mask, current, svld1_u8(mask, background - 1)), _1);
+            sum1 = svdot_u32(sum1, svabd_z(mask, current, svld1_u8(mask, background)), _1);
+            sum2 = svdot_u32(sum2, svabd_z(mask, current, svld1_u8(mask, background + 1)), _1);
+        }
+
+        SIMD_INLINE void AbsDifferenceSums3x3Masked(const uint8_t* c, const uint8_t* b, size_t stride, const uint8_t* m, const svuint8_t& _1, const svuint8_t& i, const svbool_t& mask,
+            svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5, svuint32_t& s6, svuint32_t& s7, svuint32_t& s8)
+        {
+            svuint8_t _c = svld1_u8(mask, c);
+            svuint8_t _m = svld1_u8(mask, m);
+            svbool_t _mask = svcmpeq_u8(mask, _m, i);
+            AbsDifferenceSums3Masked(_c, b - stride, _1, _mask, s0, s1, s2);
+            AbsDifferenceSums3Masked(_c, b, _1, _mask, s3, s4, s5);
+            AbsDifferenceSums3Masked(_c, b + stride, _1, _mask, s6, s7, s8);
+        }
+
+        void AbsDifferenceSums3x3Masked(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride,
+            const uint8_t* mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t* sums)
+        {
+            assert(height > 2 && width > 2);
+
+            width -= 2;
+            height -= 2;
+            current += 1 + currentStride;
+            background += 1 + backgroundStride;
+            mask += 1 + maskStride;
+
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svwhilelt_b8(size_t(0), A);
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+            svuint8_t _index = svdup_n_u8(index), _1 = svdup_n_u8(1);
+
+            for (size_t i = 0; i < 9; ++i)
+                sums[i] = 0;
+            svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+            for (size_t row = 0; row < height; ++row)
+            {
+                ClearSums(s0, s1, s2);
+                ClearSums(s3, s4, s5);
+                ClearSums(s6, s7, s8);
+                size_t col = 0;
+                for (; col < widthA; col += A)
+                    AbsDifferenceSums3x3Masked(current + col, background + col, backgroundStride, mask + col, _1, _index, body, s0, s1, s2, s3, s4, s5, s6, s7, s8);
+                if (widthA < width)
+                    AbsDifferenceSums3x3Masked(current + col, background + col, backgroundStride, mask + col, _1, _index, tail, s0, s1, s2, s3, s4, s5, s6, s7, s8);
+                AddSums(s0, s1, s2, sums + 0);
+                AddSums(s3, s4, s5, sums + 3);
+                AddSums(s6, s7, s8, sums + 6);
+                current += currentStride;
+                background += backgroundStride;
+                mask += maskStride;
+            }
+        }
     }
 #endif
 }
diff --git a/src/Test/TestDifferenceSum.cpp b/src/Test/TestDifferenceSum.cpp
index 99ad5c3272..983efaa2bd 100644
--- a/src/Test/TestDifferenceSum.cpp
+++ b/src/Test/TestDifferenceSum.cpp
@@ -385,6 +385,11 @@ namespace Test
             result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Neon::AbsDifferenceSums3x3Masked), FUNC_M(SimdAbsDifferenceSums3x3Masked), 9);
 #endif 
 
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && DifferenceSumsMaskedAutoTest(FUNC_M(Simd::Sve::AbsDifferenceSums3x3Masked), FUNC_M(SimdAbsDifferenceSums3x3Masked), 9);
+#endif 
+
         return result;
     }
 

From e11754104a3b0b6abc3fa7b9eb1aedfba8f7f5c8 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 10:27:47 +0300
Subject: [PATCH 11/32] *improve SVE optimizations of function
 AbsDifferenceSums3x3.

---
 src/Simd/SimdSve1AbsDifferenceSum.cpp | 60 +++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/Simd/SimdSve1AbsDifferenceSum.cpp b/src/Simd/SimdSve1AbsDifferenceSum.cpp
index 6efa0d0a94..d7c425b876 100644
--- a/src/Simd/SimdSve1AbsDifferenceSum.cpp
+++ b/src/Simd/SimdSve1AbsDifferenceSum.cpp
@@ -99,6 +99,20 @@ namespace Simd
 
         //--------------------------------------------------------------------------------------------------
 
+        SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2)
+        {
+            sum0 = svdup_n_u32(0);
+            sum1 = svdup_n_u32(0);
+            sum2 = svdup_n_u32(0);
+        }
+
+        SIMD_INLINE void AddSums(const svuint32_t& sum0, const svuint32_t& sum1, const svuint32_t& sum2, uint64_t* sums)
+        {
+            sums[0] += svaddv_u32(svptrue_b32(), sum0);
+            sums[1] += svaddv_u32(svptrue_b32(), sum1);
+            sums[2] += svaddv_u32(svptrue_b32(), sum2);
+        }
+
         SIMD_INLINE void AbsDifferenceSums3(const svuint8_t& current, const uint8_t* background, const svuint8_t& _1, const svbool_t& mask, 
             svuint32_t &sum0, svuint32_t& sum1, svuint32_t& sum2)
         {
@@ -116,18 +130,28 @@ namespace Simd
             AbsDifferenceSums3(_current, background + stride, _1, mask, s6, s7, s8);
         }
 
-        SIMD_INLINE void ClearSums(svuint32_t& sum0, svuint32_t& sum1, svuint32_t& sum2)
+        SIMD_INLINE void AbsDifferenceSums3x2(const svuint8_t& c0, const svuint8_t& c1, const uint8_t* b, const svuint8_t& _1, const svbool_t& mask,
+            svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5)
         {
-            sum0 = svdup_n_u32(0);
-            sum1 = svdup_n_u32(0);
-            sum2 = svdup_n_u32(0);
+            svuint8_t b0 = svld1_u8(mask, b - 1);
+            s0 = svdot_u32(s0, svabd_x(mask, c0, b0), _1);
+            s3 = svdot_u32(s3, svabd_x(mask, c1, b0), _1);
+            svuint8_t b1 = svld1_u8(mask, b);
+            s1 = svdot_u32(s1, svabd_x(mask, c0, b1), _1);
+            s4 = svdot_u32(s4, svabd_x(mask, c1, b1), _1);
+            svuint8_t b2 = svld1_u8(mask, b + 1);
+            s2 = svdot_u32(s2, svabd_x(mask, c0, b2), _1);
+            s5 = svdot_u32(s5, svabd_x(mask, c1, b2), _1);
         }
 
-        SIMD_INLINE void AddSums(const svuint32_t& sum0, const svuint32_t& sum1, const svuint32_t& sum2, uint64_t* sums)
+        SIMD_INLINE void AbsDifferenceSums3x3x2(const uint8_t* c, size_t cStride, const uint8_t* b, size_t bStride, const svuint8_t& _1, const svbool_t& mask,
+            svuint32_t& s0, svuint32_t& s1, svuint32_t& s2, svuint32_t& s3, svuint32_t& s4, svuint32_t& s5, svuint32_t& s6, svuint32_t& s7, svuint32_t& s8)
         {
-            sums[0] += svaddv_u32(svptrue_b32(), sum0);
-            sums[1] += svaddv_u32(svptrue_b32(), sum1);
-            sums[2] += svaddv_u32(svptrue_b32(), sum2);
+            svuint8_t c0 = svld1_u8(mask, c), c1 = svld1_u8(mask, c + cStride);
+            AbsDifferenceSums3(c0, b - bStride, _1, mask, s0, s1, s2);
+            AbsDifferenceSums3x2(c1, c0, b, _1, mask, s0, s1, s2, s3, s4, s5);
+            AbsDifferenceSums3x2(c1, c0, b + bStride, _1, mask, s3, s4, s5, s6, s7, s8);
+            AbsDifferenceSums3(c1, b + 2 * bStride, _1, mask, s6, s7, s8);
         }
 
         void AbsDifferenceSums3x3(const uint8_t* current, size_t currentStride, const uint8_t* background, size_t backgroundStride, size_t width, size_t height, uint64_t* sums)
@@ -141,6 +165,7 @@ namespace Simd
 
             size_t A = svlen(svuint8_t());
             size_t widthA = AlignLo(width, A);
+            size_t height2 = AlignLo(height, 2);
             const svbool_t body = svwhilelt_b8(size_t(0), A);
             const svbool_t tail = svwhilelt_b8(widthA, width);
             svuint8_t _1 = svdup_n_u8(1);
@@ -148,7 +173,24 @@ namespace Simd
             for (size_t i = 0; i < 9; ++i)
                 sums[i] = 0;
             svuint32_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-            for (size_t row = 0; row < height; ++row)
+            size_t row = 0;
+            for (; row < height2; row += 2)
+            {
+                ClearSums(s0, s1, s2);
+                ClearSums(s3, s4, s5);
+                ClearSums(s6, s7, s8);
+                size_t col = 0;
+                for (; col < widthA; col += A)
+                    AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, _1, body, s0, s1, s2, s3, s4, s5, s6, s7, s8);
+                if (widthA < width)
+                    AbsDifferenceSums3x3x2(current + col, currentStride, background + col, backgroundStride, _1, tail, s0, s1, s2, s3, s4, s5, s6, s7, s8);
+                AddSums(s0, s1, s2, sums + 0);
+                AddSums(s3, s4, s5, sums + 3);
+                AddSums(s6, s7, s8, sums + 6);
+                current += 2 * currentStride;
+                background += 2 * backgroundStride;
+            }
+            for (; row < height; ++row)
             {
                 ClearSums(s0, s1, s2);
                 ClearSums(s3, s4, s5);

From b7ac1869b9285d7b7370a9eca26b3b8dda636c34 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 12:27:19 +0300
Subject: [PATCH 12/32] +add SVE optimizations of function
 BackgroundGrowRangeSlow.

---
 docs/2026.html                  |  1 +
 prj/vs2022/Sve1.vcxproj         |  1 +
 prj/vs2022/Sve1.vcxproj.filters |  6 +++
 src/Simd/SimdLib.cpp            |  5 +++
 src/Simd/SimdSve1.h             |  3 ++
 src/Simd/SimdSve1Background.cpp | 66 +++++++++++++++++++++++++++++++++
 src/Test/TestBackground.cpp     |  5 +++
 7 files changed, 87 insertions(+)
 create mode 100644 src/Simd/SimdSve1Background.cpp

diff --git a/docs/2026.html b/docs/2026.html
index d35a59563b..5f125b2243 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -45,6 +45,7 @@ <h5>New features</h5>
  <li>SVE optimizations of function AbsDifferenceSumMasked.</li>
  <li>SVE optimizations of function AbsDifferenceSums3x3.</li>
  <li>SVE optimizations of function AbsDifferenceSums3x3Masked.</li>
+ <li>SVE optimizations of function BackgroundGrowRangeSlow.</li>
 </ul>
 
 <a href="#HOME">Home</a>
diff --git a/prj/vs2022/Sve1.vcxproj b/prj/vs2022/Sve1.vcxproj
index 93c5d19a12..a960799e61 100644
--- a/prj/vs2022/Sve1.vcxproj
+++ b/prj/vs2022/Sve1.vcxproj
@@ -24,6 +24,7 @@
     <ClCompile Include="..\..\src\Simd\SimdSve1AbsDifference.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1AbsDifferenceSum.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1AbsGradientSaturatedSum.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdSve1Background.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1BgrToRgb.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1Cpu.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1Deinterleave.cpp" />
diff --git a/prj/vs2022/Sve1.vcxproj.filters b/prj/vs2022/Sve1.vcxproj.filters
index 3356e32b25..87fc1e9732 100644
--- a/prj/vs2022/Sve1.vcxproj.filters
+++ b/prj/vs2022/Sve1.vcxproj.filters
@@ -22,6 +22,9 @@
     <Filter Include="Sve1\Statistics">
       <UniqueIdentifier>{b123ae36-270f-4b5c-8b87-372f66f78ba6}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Sve1\Motion">
+      <UniqueIdentifier>{c617af02-4208-4080-9349-d24bcf67c558}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\Simd\SimdSve1.h">
@@ -53,5 +56,8 @@
     <ClCompile Include="..\..\src\Simd\SimdSve1AbsDifferenceSum.cpp">
       <Filter>Sve1\Statistics</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdSve1Background.cpp">
+      <Filter>Sve1\Motion</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 6afbc92956..d7e8c70752 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -722,6 +722,11 @@ SIMD_API void SimdBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStr
         Sse41::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
         Neon::BackgroundGrowRangeSlow(value, valueStride, width, height, lo, loStride, hi, hiStride);
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index 293d6bf276..a1133666de 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -48,6 +48,9 @@ namespace Simd
 
         void AbsGradientSaturatedSum(const uint8_t* src, size_t srcStride, size_t width, size_t height, uint8_t* dst, size_t dstStride);
 
+        void BackgroundGrowRangeSlow(const uint8_t* value, size_t valueStride, size_t width, size_t height,
+            uint8_t* lo, size_t loStride, uint8_t* hi, size_t hiStride);
+
         void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
 
         void DeinterleaveUv(const uint8_t* uv, size_t uvStride, size_t width, size_t height, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);
diff --git a/src/Simd/SimdSve1Background.cpp b/src/Simd/SimdSve1Background.cpp
new file mode 100644
index 0000000000..fcb76d4d38
--- /dev/null
+++ b/src/Simd/SimdSve1Background.cpp
@@ -0,0 +1,66 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2017 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SVE_ENABLE    
+    namespace Sve
+    {
+        SIMD_INLINE void BackgroundGrowRangeSlow(const uint8_t * value, uint8_t * lo, uint8_t * hi, const svuint8_t& _1, const svbool_t & mask)
+        {
+            svuint8_t _value = svld1_u8(mask, value);
+            svuint8_t _lo = svld1_u8(mask, lo);
+            svuint8_t _hi = svld1_u8(mask, hi);
+
+            svbool_t inc = svcmpgt_u8(mask, _value, _hi);
+            svbool_t dec = svcmpgt_u8(mask, _value, _lo);
+
+            svst1_u8(mask, lo, svqsub_u8(_lo, svand_u8_z(dec, _1, _1)));
+            svst1_u8(mask, hi, svqadd_u8(_hi, svand_u8_z(inc, _1, _1)));
+        }
+
+        void BackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * lo, size_t loStride, uint8_t * hi, size_t hiStride)
+        {
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svwhilelt_b8(size_t(0), A);
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+            svuint8_t _1 = svdup_n_u8(1);
+            for (size_t row = 0; row < height; ++row)
+            {
+                size_t col = 0;
+                for (; col < widthA; col += A)
+                    BackgroundGrowRangeSlow(value + col, lo + col, hi + col, _1, body);
+                if (widthA < width)
+                    BackgroundGrowRangeSlow(value + col, lo + col, hi + col, _1, tail);
+                value += valueStride;
+                lo += loStride;
+                hi += hiStride;
+            }
+        }
+    }
+#endif
+}
diff --git a/src/Test/TestBackground.cpp b/src/Test/TestBackground.cpp
index fe509fa3db..51d9351f56 100644
--- a/src/Test/TestBackground.cpp
+++ b/src/Test/TestBackground.cpp
@@ -464,6 +464,11 @@ namespace Test
             result = result && BackgroundChangeRangeAutoTest(FUNC1(Simd::Neon::BackgroundGrowRangeSlow), FUNC1(SimdBackgroundGrowRangeSlow));
 #endif 
 
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && BackgroundChangeRangeAutoTest(FUNC1(Simd::Sve::BackgroundGrowRangeSlow), FUNC1(SimdBackgroundGrowRangeSlow));
+#endif 
+
         return result;
     }
 

From 89bebe810e2f533e20faf0902a1ae317b70bd93d Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 12:35:22 +0300
Subject: [PATCH 13/32] *fix bug in SVE optimizations of function
 BackgroundGrowRangeSlow.

---
 src/Simd/SimdSve1Background.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Simd/SimdSve1Background.cpp b/src/Simd/SimdSve1Background.cpp
index fcb76d4d38..5ffafb7ff0 100644
--- a/src/Simd/SimdSve1Background.cpp
+++ b/src/Simd/SimdSve1Background.cpp
@@ -36,7 +36,7 @@ namespace Simd
             svuint8_t _hi = svld1_u8(mask, hi);
 
             svbool_t inc = svcmpgt_u8(mask, _value, _hi);
-            svbool_t dec = svcmpgt_u8(mask, _value, _lo);
+            svbool_t dec = svcmplt_u8(mask, _value, _lo);
 
             svst1_u8(mask, lo, svqsub_u8(_lo, svand_u8_z(dec, _1, _1)));
             svst1_u8(mask, hi, svqadd_u8(_hi, svand_u8_z(inc, _1, _1)));

From 0452f8838ce6f7be222df237ea6cd2f8dbacec15 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 12:48:22 +0300
Subject: [PATCH 14/32] *fix bug: Error in function SimdAlignment for SVE
 (ARM).

---
 docs/2026.html           | 4 ++++
 src/Simd/SimdAlignment.h | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/docs/2026.html b/docs/2026.html
index 5f125b2243..14493f65b6 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -47,6 +47,10 @@ <h5>New features</h5>
  <li>SVE optimizations of function AbsDifferenceSums3x3Masked.</li>
  <li>SVE optimizations of function BackgroundGrowRangeSlow.</li>
 </ul>
+<h5>Bug fixing</h5>
+<ul>
+ <li>Error in function SimdAlignment for SVE (ARM).</li>
+</ul>
 
 <a href="#HOME">Home</a>
 <hr/>
diff --git a/src/Simd/SimdAlignment.h b/src/Simd/SimdAlignment.h
index 837884b1f5..738c230552 100644
--- a/src/Simd/SimdAlignment.h
+++ b/src/Simd/SimdAlignment.h
@@ -51,6 +51,11 @@ namespace Simd
             return sizeof(__m128i);
         else
 #endif
+#ifdef SIMD_SVE_ENABLE
+        if (Sve::Enable)
+            return Sve::SveSize;
+        else
+#endif
 #ifdef SIMD_NEON_ENABLE
         if (Neon::Enable)
             return sizeof(uint8x16_t);

From 56d807e60314e807ce6f3f4a92a7d97a855d1da2 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 13:10:58 +0300
Subject: [PATCH 15/32] *update help.

---
 README.md                                     |  3 +-
 docs/2026.html                                | 13 ++++
 docs/help/group__c__types.html                |  2 +-
 docs/help/group__descrint.html                | 22 +++---
 docs/help/group__drawing.html                 | 10 +--
 docs/help/group__gaussian__filter.html        |  4 +-
 docs/help/group__image__io.html               | 10 +--
 docs/help/group__info.html                    | 42 ++++++++--
 docs/help/group__memory.html                  | 78 +++++++++++++------
 docs/help/group__object__detection.html       |  6 +-
 .../group__recursive__bilateral__filter.html  |  4 +-
 docs/help/group__resizing.html                |  4 +-
 docs/help/group__shifting.html                |  6 +-
 docs/help/group__synet__add.html              |  4 +-
 .../help/group__synet__convolution__bf16.html | 10 +--
 .../help/group__synet__convolution__fp32.html | 12 +--
 .../help/group__synet__convolution__int8.html | 12 +--
 .../group__synet__deconvolution__bf16.html    | 10 +--
 .../group__synet__deconvolution__fp32.html    | 12 +--
 docs/help/group__synet__gather__elements.html |  8 +-
 docs/help/group__synet__grid__sample.html     |  6 +-
 docs/help/group__synet__inner__product.html   | 10 +--
 .../group__synet__inner__product__bf16.html   | 10 +--
 ...oup__synet__merged__convolution__bf16.html | 12 +--
 ...oup__synet__merged__convolution__fp32.html | 12 +--
 ...oup__synet__merged__convolution__int8.html | 12 +--
 docs/help/group__synet__permute.html          |  6 +-
 docs/help/group__synet__quantized__add.html   |  4 +-
 .../group__synet__quantized__convolution.html | 12 +--
 ...oup__synet__quantized__inner__product.html | 12 +--
 ...synet__quantized__merged__convolution.html | 12 +--
 docs/help/group__synet__scale.html            | 12 +--
 docs/help/group__warp__affine.html            |  4 +-
 docs/help/index.html                          |  3 +-
 docs/help/struct_simd_1_1_view.html           |  2 +-
 docs/index.html                               |  2 +-
 prj/txt/DoxygenOverview.txt                   |  3 +-
 src/Simd/SimdLib.h                            |  3 +-
 38 files changed, 242 insertions(+), 167 deletions(-)

diff --git a/README.md b/README.md
index 8ed8dc5505..d3892b2c73 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ object detection and classification, neural network.
 
 The algorithms are optimized with using of different SIMD CPU extensions. 
 In particular the library supports following CPU extensions: 
-SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM, HVX for Hexagon.
+SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon.
 
 The Simd Library has C API and also contains useful C++ classes and functions to facilitate access to C API. 
 The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, 
@@ -98,6 +98,7 @@ There are addition build parameters:
 * `SIMD_AVX512` - Enable of AVX-512 (AVX-512F, AVX-512CD, AVX-512VL, AVX-512DQ, AVX-512BW) CPU extensions. It is switched on by default.
 * `SIMD_AVX512VNNI` - Enable of AVX-512-VNNI CPU extensions. It is switched on by default.
 * `SIMD_AMXBF16` - Enable of AMX-BF16, AMX-INT8 and AVX-512-BF16 CPU extensions. It is switched off by default.
+* `SIMD_SVE` - Enable of SVE CPU extension. It is switched off by default.
 * `SIMD_TEST` - Build test framework. It is switched on by default.
 * `SIMD_INFO` - Print build information. It is switched on by default.
 * `SIMD_PERF` - Enable of internal performance statistic. It is switched off by default.
diff --git a/docs/2026.html b/docs/2026.html
index 14493f65b6..76f760826e 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -52,6 +52,19 @@ <h5>Bug fixing</h5>
  <li>Error in function SimdAlignment for SVE (ARM).</li>
 </ul>
 
+<h4>Documentation</h4>
+<h5>Improving</h5>
+<ul>
+ <li>Description of function SimdVersion.</li>
+ <li>Description of function SimdCpuDesc.</li>
+ <li>Description of function SimdCpuInfo.</li>
+ <li>Description of function SimdAllocate.</li>
+ <li>Description of function SimdFree.</li>
+ <li>Description of function SimdAlign.</li>
+ <li>Description of function SimdAlignment.</li>
+ <li>Description of function SimdRelease.</li>
+</ul>
+
 <a href="#HOME">Home</a>
 <hr/>
 <h3 id="R161">May 4, 2026 (version 7.1.161)</h3> 
diff --git a/docs/help/group__c__types.html b/docs/help/group__c__types.html
index c41e21addc..036bf3d69d 100644
--- a/docs/help/group__c__types.html
+++ b/docs/help/group__c__types.html
@@ -325,7 +325,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga111a9cea2240175930bb547
         </tr>
       </table>
 </div><div class="memdoc">
-<p >Describes type of description which can return function <a class="el" href="group__info.html#gad14c64b5882fb15ce8bc683bec8c0da1" title="Gets description of CPU and Simd Library.">SimdCpuDesc</a>. </p>
+<p >Describes type of description which can return function <a class="el" href="group__info.html#gad14c64b5882fb15ce8bc683bec8c0da1" title="Gets a text description of the CPU.">SimdCpuDesc</a>. </p>
 <table class="fieldtable">
 <tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><a id="gga111a9cea2240175930bb547eb01811eba568fe08c74a07ad02c864add946d601a" name="gga111a9cea2240175930bb547eb01811eba568fe08c74a07ad02c864add946d601a"></a>SimdCpuDescModel&#160;</td><td class="fielddoc"><p >A CPU model name. </p>
 </td></tr>
diff --git a/docs/help/group__descrint.html b/docs/help/group__descrint.html
index 815597d487..b1c61b1f65 100644
--- a/docs/help/group__descrint.html
+++ b/docs/help/group__descrint.html
@@ -121,7 +121,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga9a34fef8e808d9d7c0c62f3
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to Integer Descriptor Engine context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>, <a class="el" href="group__descrint.html#ga84e3a3f6817ef40bec588eaae974d82e" title="Gets length of original (32-bit or 16-bit) float descriptor.">SimdDescrIntDecodedSize</a>, <a class="el" href="group__descrint.html#gab1f0da1cfdba134cc6d9ef9150ff175c" title="Encodes 32-bit float descriptor to integer form.">SimdDescrIntEncode32f</a>, <a class="el" href="group__descrint.html#gafd383eee16784e07693dacabb7570d22" title="Encodes 16-bit float descriptor to integer form.">SimdDescrIntEncode16f</a>, <a class="el" href="group__descrint.html#ga519ddfec7344cc61b54207d2f82d04a2" title="Decodes integer descriptor to original 32-bit float form.">SimdDescrIntDecode32f</a>, <a class="el" href="group__descrint.html#gabb9388eb4a8706bff5aa22464d081550" title="Decodes integer descriptor to original 16-bit float form.">SimdDescrIntDecode16f</a>, <a class="el" href="group__descrint.html#gad19afb4821f4e4ecf07c06730ae5847c" title="Calculates cosine distance of two integer descriptors.">SimdDescrIntCosineDistance</a>, <a class="el" href="group__descrint.html#gaa0644c0c0a77aef8d1c0bedf17066ecf" title="Calculates mutual cosine distance of two arrays of integer descriptor arrays.">SimdDescrIntCosineDistancesMxNa</a>, <a class="el" href="group__descrint.html#gab7ee06fa831e04c9eff9bf39e833f323" title="Calculates mutual cosine distance of two arrays of integer descriptors.">SimdDescrIntCosineDistancesMxNp</a>, <a class="el" href="group__descrint.html#ga597d136b464cfbdd65791f5343ca3a6f" title="Calculates vector norm for integer descriptor.">SimdDescrIntVectorNorm</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to Integer Descriptor Engine context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>, <a class="el" href="group__descrint.html#ga84e3a3f6817ef40bec588eaae974d82e" title="Gets length of original (32-bit or 16-bit) float descriptor.">SimdDescrIntDecodedSize</a>, <a class="el" href="group__descrint.html#gab1f0da1cfdba134cc6d9ef9150ff175c" title="Encodes 32-bit float descriptor to integer form.">SimdDescrIntEncode32f</a>, <a class="el" href="group__descrint.html#gafd383eee16784e07693dacabb7570d22" title="Encodes 16-bit float descriptor to integer form.">SimdDescrIntEncode16f</a>, <a class="el" href="group__descrint.html#ga519ddfec7344cc61b54207d2f82d04a2" title="Decodes integer descriptor to original 32-bit float form.">SimdDescrIntDecode32f</a>, <a class="el" href="group__descrint.html#gabb9388eb4a8706bff5aa22464d081550" title="Decodes integer descriptor to original 16-bit float form.">SimdDescrIntDecode16f</a>, <a class="el" href="group__descrint.html#gad19afb4821f4e4ecf07c06730ae5847c" title="Calculates cosine distance of two integer descriptors.">SimdDescrIntCosineDistance</a>, <a class="el" href="group__descrint.html#gaa0644c0c0a77aef8d1c0bedf17066ecf" title="Calculates mutual cosine distance of two arrays of integer descriptor arrays.">SimdDescrIntCosineDistancesMxNa</a>, <a class="el" href="group__descrint.html#gab7ee06fa831e04c9eff9bf39e833f323" title="Calculates mutual cosine distance of two arrays of integer descriptors.">SimdDescrIntCosineDistancesMxNp</a>, <a class="el" href="group__descrint.html#ga597d136b464cfbdd65791f5343ca3a6f" title="Calculates vector norm for integer descriptor.">SimdDescrIntVectorNorm</a>. </dd></dl>
 
 </div>
 </div>
@@ -144,7 +144,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf0b9048006ffc1d6663de8c
 <p>Gets size in bytes of encoded integer descriptor. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -171,7 +171,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga84e3a3f6817ef40bec588ea
 <p>Gets length of original (32-bit or 16-bit) float descriptor. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -214,7 +214,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gab1f0da1cfdba134cc6d9ef9
 <p>Encodes 32-bit float descriptor to integer form. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to original 32-bit float descriptor. Its length can be determined by function <a class="el" href="group__descrint.html#ga84e3a3f6817ef40bec588eaae974d82e" title="Gets length of original (32-bit or 16-bit) float descriptor.">SimdDescrIntDecodedSize</a>. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to encoded integer descriptor. Its size in bytes can be determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>. </td></tr>
   </table>
@@ -258,7 +258,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gafd383eee16784e07693daca
 <p>Encodes 16-bit float descriptor to integer form. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to original 16-bit float descriptor. Its length can be determined by function <a class="el" href="group__descrint.html#ga84e3a3f6817ef40bec588eaae974d82e" title="Gets length of original (32-bit or 16-bit) float descriptor.">SimdDescrIntDecodedSize</a>. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to encoded integer descriptor. Its size in bytes can be determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>. </td></tr>
   </table>
@@ -302,7 +302,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga519ddfec7344cc61b54207d
 <p>Decodes integer descriptor to original 32-bit float form. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to encoded integer descriptor. Its size in bytes can be determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output 32-bit float descriptor. Its length can be determined by function <a class="el" href="group__descrint.html#ga84e3a3f6817ef40bec588eaae974d82e" title="Gets length of original (32-bit or 16-bit) float descriptor.">SimdDescrIntDecodedSize</a>. </td></tr>
   </table>
@@ -346,7 +346,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gabb9388eb4a8706bff5aa224
 <p>Decodes integer descriptor to original 16-bit float form. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to encoded integer descriptor. Its size in bytes can be determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output 16-bit float descriptor. Its length can be determined by function <a class="el" href="group__descrint.html#ga84e3a3f6817ef40bec588eaae974d82e" title="Gets length of original (32-bit or 16-bit) float descriptor.">SimdDescrIntDecodedSize</a>. </td></tr>
   </table>
@@ -397,7 +397,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad19afb4821f4e4ecf07c067
 <dl class="section note"><dt>Note</dt><dd>Integer descriptor can be received with using of functions <a class="el" href="group__descrint.html#gab1f0da1cfdba134cc6d9ef9150ff175c" title="Encodes 32-bit float descriptor to integer form.">SimdDescrIntEncode32f</a> or <a class="el" href="group__descrint.html#gafd383eee16784e07693dacabb7570d22" title="Encodes 16-bit float descriptor to integer form.">SimdDescrIntEncode16f</a>. Its size in bytes is determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a pointer to the first integer descriptor. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">b</td><td>- a pointer to the second integer descriptor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">distance</td><td>- a pointer to 32-bit float with cosine distance. </td></tr>
@@ -461,7 +461,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa0644c0c0a77aef8d1c0bed
 <dl class="section note"><dt>Note</dt><dd>Integer descriptor can be received with using of functions <a class="el" href="group__descrint.html#gab1f0da1cfdba134cc6d9ef9150ff175c" title="Encodes 32-bit float descriptor to integer form.">SimdDescrIntEncode32f</a> or <a class="el" href="group__descrint.html#gafd383eee16784e07693dacabb7570d22" title="Encodes 16-bit float descriptor to integer form.">SimdDescrIntEncode16f</a>. Its size in bytes is determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">M</td><td>- a number of A arrays. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">N</td><td>- a number of B arrays. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">A</td><td>- a pointer to the first array with pointers to integer descriptors. </td></tr>
@@ -527,7 +527,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gab7ee06fa831e04c9eff9bf3
 <dl class="section note"><dt>Note</dt><dd>Integer descriptor can be received with using of functions <a class="el" href="group__descrint.html#gab1f0da1cfdba134cc6d9ef9150ff175c" title="Encodes 32-bit float descriptor to integer form.">SimdDescrIntEncode32f</a> or <a class="el" href="group__descrint.html#gafd383eee16784e07693dacabb7570d22" title="Encodes 16-bit float descriptor to integer form.">SimdDescrIntEncode16f</a>. Its size in bytes is determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">M</td><td>- a number of A arrays. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">N</td><td>- a number of B arrays. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">A</td><td>- a pointer to the first array with integer descriptors. </td></tr>
@@ -575,7 +575,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga597d136b464cfbdd65791f5
 <dl class="section note"><dt>Note</dt><dd>Integer descriptor can be received with using of functions <a class="el" href="group__descrint.html#gab1f0da1cfdba134cc6d9ef9150ff175c" title="Encodes 32-bit float descriptor to integer form.">SimdDescrIntEncode32f</a> or <a class="el" href="group__descrint.html#gafd383eee16784e07693dacabb7570d22" title="Encodes 16-bit float descriptor to integer form.">SimdDescrIntEncode16f</a>. Its size in bytes is determined by function <a class="el" href="group__descrint.html#gaf0b9048006ffc1d6663de8c707bbaad7" title="Gets size in bytes of encoded integer descriptor.">SimdDescrIntEncodedSize</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Integer Descriptor Engine context. It must be created by function <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a pointer to integer descriptor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">norm</td><td>- a pointer to result 32-bit float norm. </td></tr>
   </table>
diff --git a/docs/help/group__drawing.html b/docs/help/group__drawing.html
index 2e24e725fd..d881a6ea7d 100644
--- a/docs/help/group__drawing.html
+++ b/docs/help/group__drawing.html
@@ -977,7 +977,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae6922d862b2d1b120d53ff3
 </div><div class="memdoc">
 
 <p>Creates font context. </p>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to font context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__drawing.html#ga59e83705ad52c565eff3f86b0c74fc35" title="Sets font height.">SimdFontResize</a>, <a class="el" href="group__drawing.html#gabc9eefe0ef3c652aec698a9a52b10b80" title="Gets current font height.">SimdFontHeight</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to font context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__drawing.html#ga59e83705ad52c565eff3f86b0c74fc35" title="Sets font height.">SimdFontResize</a>, <a class="el" href="group__drawing.html#gabc9eefe0ef3c652aec698a9a52b10b80" title="Gets current font height.">SimdFontHeight</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -1010,7 +1010,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga59e83705ad52c565eff3f86
 <p>Sets font height. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">height</td><td>- a new height of font. </td></tr>
   </table>
   </dd>
@@ -1038,7 +1038,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gabc9eefe0ef3c652aec698a9
 <p>Gets current font height. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -1087,7 +1087,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga98c1a82133291d9803eca43
 <p>Measures size of region which need to draw current text with using of given font. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">text</td><td>- a pointer to text. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">width</td><td>- a measured width of region need to draw this text. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">height</td><td>- a measured height of region need to draw this text. </td></tr>
@@ -1174,7 +1174,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6e944a648a9cae7903642dd
 <p>Draws a text on canvas at current position with using of given font and color. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a font context. It must be created by function <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">canvas</td><td>- a pointer to pixels data of canvas image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">stride</td><td>- a row size of canvas image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">width</td><td>- a width of canvas image. </td></tr>
diff --git a/docs/help/group__gaussian__filter.html b/docs/help/group__gaussian__filter.html
index 5daa7769ed..b477c7b11c 100644
--- a/docs/help/group__gaussian__filter.html
+++ b/docs/help/group__gaussian__filter.html
@@ -208,7 +208,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga615ed4d1996638c45b1a0af
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to filter context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__gaussian__filter.html#ga3d8cfda122c2fa1a340e90f8226051b6" title="Performs image Gaussian blurring.">SimdGaussianBlurRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to filter context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__gaussian__filter.html#ga3d8cfda122c2fa1a340e90f8226051b6" title="Performs image Gaussian blurring.">SimdGaussianBlurRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -270,7 +270,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3d8cfda122c2fa1a340e90f
 dst[dx, dy] = sum;
 </pre><dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">filter</td><td>- a filter context. It must be created by function <a class="el" href="group__gaussian__filter.html#ga615ed4d1996638c45b1a0aff4d3ee50a" title="Creates Gaussian blur filter context.">SimdGaussianBlurInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">filter</td><td>- a filter context. It must be created by function <a class="el" href="group__gaussian__filter.html#ga615ed4d1996638c45b1a0aff4d3ee50a" title="Creates Gaussian blur filter context.">SimdGaussianBlurInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to pixels data of the original input image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">srcStride</td><td>- a row size (in bytes) of the input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to pixels data of the filtered output image. </td></tr>
diff --git a/docs/help/group__image__io.html b/docs/help/group__image__io.html
index dd6f8c003d..1d79c01973 100644
--- a/docs/help/group__image__io.html
+++ b/docs/help/group__image__io.html
@@ -148,7 +148,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga1445d4a63bdc08366f14fac
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to memory buffer with output image file. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>. On error it returns NULL. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to memory buffer with output image file. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a>. On error it returns NULL. </dd></dl>
 
 </div>
 </div>
@@ -316,7 +316,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga2fcf7cdf5452267607ef745
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to memory buffer with output image file. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>. On error it returns NULL. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to memory buffer with output image file. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a>. On error it returns NULL. </dd></dl>
 
 </div>
 </div>
@@ -417,7 +417,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac0ee5c58a3d4de7540d52e7
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to memory buffer with output image file. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>. On error it returns NULL. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to memory buffer with output image file. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a>. On error it returns NULL. </dd></dl>
 
 </div>
 </div>
@@ -484,7 +484,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga7453ef35019d11327346364
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to pixels data of output image. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>. On error it returns NULL. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to pixels data of output image. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a>. On error it returns NULL. </dd></dl>
 
 </div>
 </div>
@@ -544,7 +544,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga19f18f90a7cdad46890ed2e
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to pixels data of output image. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>. On error it returns NULL. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to pixels data of output image. It has to be deleted after use by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a>. On error it returns NULL. </dd></dl>
 
 </div>
 </div>
diff --git a/docs/help/group__info.html b/docs/help/group__info.html
index 64ee6cb630..3e213b7e9f 100644
--- a/docs/help/group__info.html
+++ b/docs/help/group__info.html
@@ -54,7 +54,7 @@ <h1>Simd Library Documentation.</h1>
 <tr class="memdesc:gab492c1cf423815199787e5262e46573e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets version of Simd Library.  <a href="group__info.html#gab492c1cf423815199787e5262e46573e">More...</a><br /></td></tr>
 <tr class="separator:gab492c1cf423815199787e5262e46573e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gad14c64b5882fb15ce8bc683bec8c0da1"><td class="memItemLeft" align="right" valign="top">SIMD_API const char *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__info.html#gad14c64b5882fb15ce8bc683bec8c0da1">SimdCpuDesc</a> (<a class="el" href="group__c__types.html#ga111a9cea2240175930bb547eb01811eb">SimdCpuDescType</a> type)</td></tr>
-<tr class="memdesc:gad14c64b5882fb15ce8bc683bec8c0da1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets description of CPU and Simd Library.  <a href="group__info.html#gad14c64b5882fb15ce8bc683bec8c0da1">More...</a><br /></td></tr>
+<tr class="memdesc:gad14c64b5882fb15ce8bc683bec8c0da1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets a text description of the CPU.  <a href="group__info.html#gad14c64b5882fb15ce8bc683bec8c0da1">More...</a><br /></td></tr>
 <tr class="separator:gad14c64b5882fb15ce8bc683bec8c0da1"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gae2699c71293d16a769d1b18d33efdcaa"><td class="memItemLeft" align="right" valign="top">SIMD_API uint64_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__info.html#gae2699c71293d16a769d1b18d33efdcaa">SimdCpuInfo</a> (<a class="el" href="group__c__types.html#ga1fae7864d7881bf1ec39a5186cabbb74">SimdCpuInfoType</a> type)</td></tr>
 <tr class="memdesc:gae2699c71293d16a769d1b18d33efdcaa"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets information about CPU and Simd Library.  <a href="group__info.html#gae2699c71293d16a769d1b18d33efdcaa">More...</a><br /></td></tr>
@@ -86,7 +86,18 @@ <h2 class="memtitle"><span class="permalink"><a href="#gab492c1cf423815199787e52
 </div><div class="memdoc">
 
 <p>Gets version of Simd Library. </p>
-<dl class="section return"><dt>Returns</dt><dd>string with version of Simd Library (major version number, minor version number, release number, number of SVN's commits). </dd></dl>
+<p >Returns a pointer to a null-terminated, statically allocated string that encodes the library version. The format of the string is: </p><pre class="fragment">major.minor.release[.branch-sha]
+</pre><p> where <b>major</b>, <b>minor</b> and <b>release</b> are numeric components taken from the library's version file, and the optional <b>branch</b> and <b>sha</b> suffix identify the Git branch name and short commit hash at build time (e.g. <code>"7.1.161.main-a1b2c3d"</code>). When version information is not available at build time the function returns <code>"unknown"</code>.</p>
+<p >The returned pointer is valid for the lifetime of the process and must not be freed.</p>
+<p >Using example: </p><pre class="fragment">#include "Simd/SimdLib.h"
+#include &lt;iostream&gt;
+
+int main()
+{
+    std::cout &lt;&lt; "Simd Library version: " &lt;&lt; SimdVersion() &lt;&lt; std::endl;
+    return 0;
+}
+</pre><dl class="section return"><dt>Returns</dt><dd>a pointer to a static null-terminated string with the version of Simd Library. </dd></dl>
 
 </div>
 </div>
@@ -106,23 +117,27 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad14c64b5882fb15ce8bc683
       </table>
 </div><div class="memdoc">
 
-<p>Gets description of CPU and Simd Library. </p>
-<dl class="section note"><dt>Note</dt><dd>See enumeration <a class="el" href="group__c__types.html#ga111a9cea2240175930bb547eb01811eb">SimdCpuDescType</a>.</dd></dl>
+<p>Gets a text description of the CPU. </p>
+<p >Returns a pointer to a null-terminated string whose content depends on the requested <a class="el" href="group__c__types.html#ga111a9cea2240175930bb547eb01811eb">SimdCpuDescType</a>:</p><ul>
+<li><a class="el" href="group__c__types.html#gga111a9cea2240175930bb547eb01811eba568fe08c74a07ad02c864add946d601a">SimdCpuDescModel</a> — the CPU brand/model name string (e.g. <code>"Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz"</code>). On x86 it is read from the CPUID brand-string leaves; on Linux/ARM it is obtained via <code>lscpu</code>. An empty string is returned on platforms where the model name is not available (Apple, Android).</li>
+</ul>
+<p >The returned pointer is valid for the lifetime of the process and must not be freed. For an unknown or unsupported <em>type</em> value the function returns <code>NULL</code>.</p>
+<dl class="section note"><dt>Note</dt><dd>See enumeration <a class="el" href="group__c__types.html#ga111a9cea2240175930bb547eb01811eb">SimdCpuDescType</a> for the full list of supported types.</dd></dl>
 <p>Using example: </p><pre class="fragment">#include "Simd/SimdLib.h"
 #include &lt;iostream&gt;
 
 int main()
 {
-    std::cout &lt;&lt; "CPU: " &lt;&lt; SimdCpuDesc(SimdCpuDescModel) &lt;&lt; std::endl;
+    std::cout &lt;&lt; "CPU model: " &lt;&lt; SimdCpuDesc(SimdCpuDescModel) &lt;&lt; std::endl;
     return 0;
 }
 </pre><dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">type</td><td>- a type of required description. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">type</td><td>- a type of required description. See <a class="el" href="group__c__types.html#ga111a9cea2240175930bb547eb01811eb">SimdCpuDescType</a>. </td></tr>
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a value which contains description of CPU and Simd Library. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to a static null-terminated string with the requested CPU description, or <code>NULL</code> if <em>type</em> is not supported. </dd></dl>
 
 </div>
 </div>
@@ -143,6 +158,13 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae2699c71293d16a769d1b18
 </div><div class="memdoc">
 
 <p>Gets information about CPU and Simd Library. </p>
+<p >Depending on the requested <a class="el" href="group__c__types.html#ga1fae7864d7881bf1ec39a5186cabbb74">SimdCpuInfoType</a>, the function returns one of the following kinds of values:</p><ul>
+<li>CPU topology: number of sockets, physical cores, or logical threads.</li>
+<li>Cache / RAM sizes in bytes (L1 data cache, L2 cache, L3 cache, physical RAM).</li>
+<li>SIMD extension availability: 1 if the extension is supported and enabled by the library, 0 otherwise. The extensions covered are SSE4.1 (and below), AVX2 (and FMA/AVX), AVX-512BW (and AVX-512F), AVX-512VNNI, AMX-BF16 (and AMX-INT8/AVX-512VBMI/AVX-512FP16), NEON, SVE, and HVX.</li>
+<li>SVE vector width in bytes (<a class="el" href="group__c__types.html#gga1fae7864d7881bf1ec39a5186cabbb74a873f13ad98aa92a129ee54606f153357">SimdCpuInfoSveSize</a>).</li>
+<li>Current CPU core frequency in Hz (<a class="el" href="group__c__types.html#gga1fae7864d7881bf1ec39a5186cabbb74ac784f409bccc42223efa7ae4d63d241f">SimdCpuInfoCurrentFrequency</a>); returns 0 if unavailable on the platform.</li>
+</ul>
 <dl class="section note"><dt>Note</dt><dd>See enumeration <a class="el" href="group__c__types.html#ga1fae7864d7881bf1ec39a5186cabbb74">SimdCpuInfoType</a>.</dd></dl>
 <p>Using example: </p><pre class="fragment">#include "Simd/SimdLib.h"
 #include &lt;iostream&gt;
@@ -162,6 +184,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae2699c71293d16a769d1b18
     std::cout &lt;&lt; "AVX-512VNNI: " &lt;&lt; (SimdCpuInfo(SimdCpuInfoAvx512vnni) ? "Yes" : "No") &lt;&lt; std::endl;
     std::cout &lt;&lt; "AMX-BF16: " &lt;&lt; (SimdCpuInfo(SimdCpuInfoAmxBf16) ? "Yes" : "No") &lt;&lt; std::endl;
     std::cout &lt;&lt; "ARM-NEON: " &lt;&lt; (SimdCpuInfo(SimdCpuInfoNeon) ? "Yes" : "No") &lt;&lt; std::endl;
+    std::cout &lt;&lt; "ARM-SVE: " &lt;&lt; (SimdCpuInfo(SimdCpuInfoSve) ? "Yes" : "No") &lt;&lt; std::endl;
+    std::cout &lt;&lt; "ARM-SVE size: " &lt;&lt; SimdCpuInfo(SimdCpuInfoSveSize) * 8 &lt;&lt; " bits" &lt;&lt; std::endl;
+    std::cout &lt;&lt; "HVX: " &lt;&lt; (SimdCpuInfo(SimdCpuInfoHvx) ? "Yes" : "No") &lt;&lt; std::endl;
+    std::cout &lt;&lt; "Current frequency: " &lt;&lt; SimdCpuInfo(SimdCpuInfoCurrentFrequency) / 1000000 &lt;&lt; " MHz" &lt;&lt; std::endl;
     return 0;
 }
 </pre><dl class="params"><dt>Parameters</dt><dd>
@@ -170,7 +196,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae2699c71293d16a769d1b18
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a value which contains information about CPU and Simd Library. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a value whose meaning depends on <em>type:</em> a count (topology), size in bytes (cache/RAM), 1 or 0 (SIMD availability), size in bytes (SVE vector width), or frequency in Hz (current CPU frequency). </dd></dl>
 
 </div>
 </div>
diff --git a/docs/help/group__memory.html b/docs/help/group__memory.html
index 3eac0f5f61..9e5c3568e6 100644
--- a/docs/help/group__memory.html
+++ b/docs/help/group__memory.html
@@ -51,19 +51,19 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:gaf7d22d64276a37f51fbbd9645d8ac08e"><td class="memItemLeft" align="right" valign="top">SIMD_API void *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e">SimdAllocate</a> (size_t size, size_t align)</td></tr>
-<tr class="memdesc:gaf7d22d64276a37f51fbbd9645d8ac08e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Allocates aligned memory block.  <a href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e">More...</a><br /></td></tr>
+<tr class="memdesc:gaf7d22d64276a37f51fbbd9645d8ac08e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Allocates an aligned memory block.  <a href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e">More...</a><br /></td></tr>
 <tr class="separator:gaf7d22d64276a37f51fbbd9645d8ac08e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga58c32d1470db0a1a698abd8280aeee52"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52">SimdFree</a> (void *ptr)</td></tr>
-<tr class="memdesc:ga58c32d1470db0a1a698abd8280aeee52"><td class="mdescLeft">&#160;</td><td class="mdescRight">Frees aligned memory block.  <a href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52">More...</a><br /></td></tr>
+<tr class="memdesc:ga58c32d1470db0a1a698abd8280aeee52"><td class="mdescLeft">&#160;</td><td class="mdescRight">Frees an aligned memory block previously allocated by <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a>.  <a href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52">More...</a><br /></td></tr>
 <tr class="separator:ga58c32d1470db0a1a698abd8280aeee52"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga25d97184e429b85ebde28c4a390004db"><td class="memItemLeft" align="right" valign="top">SIMD_API size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__memory.html#ga25d97184e429b85ebde28c4a390004db">SimdAlign</a> (size_t size, size_t align)</td></tr>
-<tr class="memdesc:ga25d97184e429b85ebde28c4a390004db"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets aligned size.  <a href="group__memory.html#ga25d97184e429b85ebde28c4a390004db">More...</a><br /></td></tr>
+<tr class="memdesc:ga25d97184e429b85ebde28c4a390004db"><td class="mdescLeft">&#160;</td><td class="mdescRight">Rounds a size value up to the nearest multiple of a given alignment.  <a href="group__memory.html#ga25d97184e429b85ebde28c4a390004db">More...</a><br /></td></tr>
 <tr class="separator:ga25d97184e429b85ebde28c4a390004db"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga08c80f8832ca1e3606340c8589da365f"><td class="memItemLeft" align="right" valign="top">SIMD_API size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__memory.html#ga08c80f8832ca1e3606340c8589da365f">SimdAlignment</a> (void)</td></tr>
-<tr class="memdesc:ga08c80f8832ca1e3606340c8589da365f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets alignment required for the most productive work of <a class="el" href="namespace_simd.html">Simd</a> Library.  <a href="group__memory.html#ga08c80f8832ca1e3606340c8589da365f">More...</a><br /></td></tr>
+<tr class="memdesc:ga08c80f8832ca1e3606340c8589da365f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns the optimal memory alignment for the current platform.  <a href="group__memory.html#ga08c80f8832ca1e3606340c8589da365f">More...</a><br /></td></tr>
 <tr class="separator:ga08c80f8832ca1e3606340c8589da365f"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga4e38dedd9f946265c9762858a71aa4cf"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf">SimdRelease</a> (void *context)</td></tr>
-<tr class="memdesc:ga4e38dedd9f946265c9762858a71aa4cf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Releases context created with using of <a class="el" href="namespace_simd.html">Simd</a> Library API.  <a href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf">More...</a><br /></td></tr>
+<tr class="memdesc:ga4e38dedd9f946265c9762858a71aa4cf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Destroys an opaque context object created by the <a class="el" href="namespace_simd.html">Simd</a> Library API.  <a href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf">More...</a><br /></td></tr>
 <tr class="separator:ga4e38dedd9f946265c9762858a71aa4cf"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaa843d0f43843b050ed2eadbe581b8405"><td class="memItemLeft" align="right" valign="top">SIMD_INLINE void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__memory.html#gaa843d0f43843b050ed2eadbe581b8405">LitterCpuCache</a> (size_t k=2)</td></tr>
 <tr class="memdesc:gaa843d0f43843b050ed2eadbe581b8405"><td class="mdescLeft">&#160;</td><td class="mdescRight">It creates a large buffer and fills it.  <a href="group__memory.html#gaa843d0f43843b050ed2eadbe581b8405">More...</a><br /></td></tr>
@@ -98,16 +98,31 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf7d22d64276a37f51fbbd96
       </table>
 </div><div class="memdoc">
 
-<p>Allocates aligned memory block. </p>
-<dl class="section note"><dt>Note</dt><dd>The memory allocated by this function is must be deleted by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>.</dd></dl>
-<dl class="params"><dt>Parameters</dt><dd>
+<p>Allocates an aligned memory block. </p>
+<p >Allocates a contiguous memory block of at least <em>size</em> bytes whose start address is a multiple of <em>align</em>. The alignment value must be a power of two and, on POSIX platforms (GCC), is rounded up to at least <code>sizeof(void*)</code> internally. The actual allocation is performed via the platform-appropriate aligned allocator: <code>_aligned_malloc</code> (MSVC), <code>__mingw_aligned_malloc</code> (MinGW), <code>posix_memalign</code> (GCC), or plain <code>malloc</code> on platforms that do not support aligned allocation.</p>
+<p >The block must be released with <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a> — passing it to the standard <code>free</code> or <code>delete</code> is undefined behaviour.</p>
+<p >Using example: </p><pre class="fragment">#include "Simd/SimdLib.h"
+
+int main()
+{
+    const size_t size  = 1024;
+    const size_t align = SimdAlignment();
+    uint8_t * data = (uint8_t *)SimdAllocate(size, align);
+    if (data)
+    {
+        // use data ...
+        SimdFree(data);
+    }
+    return 0;
+}
+</pre><dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">size</td><td>- a size of memory block. </td></tr>
-    <tr><td class="paramdir">[in]</td><td class="paramname">align</td><td>- a required alignment of memory block.</td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">size</td><td>- the number of bytes to allocate. Must be greater than zero. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">align</td><td>- the required alignment of the allocated block in bytes. Must be a power of two. Use <a class="el" href="group__memory.html#ga08c80f8832ca1e3606340c8589da365f" title="Returns the optimal memory alignment for the current platform.">SimdAlignment</a> to obtain the optimal alignment for the current platform. </td></tr>
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to allocated memory. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to the newly allocated aligned memory block, or <code>NULL</code> if the allocation fails. </dd></dl>
 
 </div>
 </div>
@@ -127,11 +142,13 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga58c32d1470db0a1a698abd8
       </table>
 </div><div class="memdoc">
 
-<p>Frees aligned memory block. </p>
-<dl class="section note"><dt>Note</dt><dd>This function frees a memory allocated by function <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates aligned memory block.">SimdAllocate</a>.</dd></dl>
+<p>Frees an aligned memory block previously allocated by <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a>. </p>
+<p >Releases the memory block pointed to by <em>ptr</em>, which must have been returned by a prior call to <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a>. Passing a pointer obtained from any other allocator (e.g. <code>malloc</code>, <code>new</code>, or <code>_aligned_malloc</code>) is undefined behaviour.</p>
+<p >Passing <code>NULL</code> is safe and has no effect, consistent with the behaviour of the standard <code>free</code> function.</p>
+<p >The underlying release call matches the allocator used by <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a> for the current platform: <code>_aligned_free</code> (MSVC), <code>__mingw_aligned_free</code> (MinGW), or <code>free</code> (GCC and others).</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">ptr</td><td>- a pointer to the memory to be deleted. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">ptr</td><td>- a pointer to the memory block to free. Must have been returned by <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a>, or <code>NULL</code> (in which case the call has no effect). </td></tr>
   </table>
   </dd>
 </dl>
@@ -164,15 +181,17 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga25d97184e429b85ebde28c4
       </table>
 </div><div class="memdoc">
 
-<p>Gets aligned size. </p>
+<p>Rounds a size value up to the nearest multiple of a given alignment. </p>
+<p >Returns the smallest value that is both a multiple of <em>align</em> and greater than or equal to <em>size</em>. If <em>size</em> is already a multiple of <em>align</em>, it is returned unchanged.</p>
+<p >The function uses the bitwise formula <code>(size + align - 1) &amp; ~(align - 1)</code>, which requires <em>align</em> to be a positive power of two.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">size</td><td>- an original size. </td></tr>
-    <tr><td class="paramdir">[in]</td><td class="paramname">align</td><td>- a required alignment.</td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">size</td><td>- the original size in bytes (or elements) to be aligned. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">align</td><td>- the required alignment in bytes. Must be a positive power of two. Use <a class="el" href="group__memory.html#ga08c80f8832ca1e3606340c8589da365f" title="Returns the optimal memory alignment for the current platform.">SimdAlignment</a> to obtain the optimal alignment for the current platform.</td></tr>
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>an aligned size. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>the smallest multiple of <em>align</em> that is greater than or equal to <em>size</em>. </dd></dl>
 
 </div>
 </div>
@@ -192,8 +211,18 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga08c80f8832ca1e3606340c8
       </table>
 </div><div class="memdoc">
 
-<p>Gets alignment required for the most productive work of <a class="el" href="namespace_simd.html">Simd</a> Library. </p>
-<dl class="section return"><dt>Returns</dt><dd>a required alignment. </dd></dl>
+<p>Returns the optimal memory alignment for the current platform. </p>
+<p >Returns the byte-width of the widest SIMD register available at runtime, which is the recommended alignment value to pass to <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a> and <a class="el" href="group__memory.html#ga25d97184e429b85ebde28c4a390004db" title="Rounds a size value up to the nearest multiple of a given alignment.">SimdAlign</a> in order to achieve best performance.</p>
+<p >The value is determined once at library initialization time by probing the active SIMD extensions and is constant for the lifetime of the process:</p><ul>
+<li><b>128</b> bytes — HVX (Qualcomm Hexagon)</li>
+<li><b>64</b> bytes — AVX-512 (x86, when either AVX-512BW or AVX-512VNNI is available)</li>
+<li><b>32</b> bytes — AVX2 (x86)</li>
+<li><b>16</b> bytes — SSE4.1 (x86) or NEON (ARM)</li>
+<li><b>sizeof(void*)</b> — scalar fallback (no SIMD extensions detected)</li>
+<li><b>SVE</b> vector size for current CPU in bytes — when SVE is available.</li>
+</ul>
+<p >The returned value is always a power of two and equals the value of the <code>SIMD_ALIGN</code> compile-time constant used internally by the library.</p>
+<dl class="section return"><dt>Returns</dt><dd>the optimal alignment in bytes for the current platform. </dd></dl>
 
 </div>
 </div>
@@ -213,11 +242,14 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga4e38dedd9f946265c976285
       </table>
 </div><div class="memdoc">
 
-<p>Releases context created with using of <a class="el" href="namespace_simd.html">Simd</a> Library API. </p>
-<dl class="section note"><dt>Note</dt><dd>This function releases a context created by functions <a class="el" href="group__object__detection.html#ga6eba3c14ffe78667df250d0d91e0d768" title="Loads a classifier cascade from file.">SimdDetectionLoadA</a> and <a class="el" href="group__object__detection.html#ga1c24fa9c803b2fbff5b175b9d582bfc5" title="Initializes hidden classifier cascade structure to work with given size of input 8-bit gray image.">SimdDetectionInit</a>.</dd></dl>
+<p>Destroys an opaque context object created by the <a class="el" href="namespace_simd.html">Simd</a> Library API. </p>
+<p >Releases any context object returned by a <a class="el" href="namespace_simd.html">Simd</a> Library context-creation function, i.e. any function whose name ends in <code>Init</code> (such as <a class="el" href="group__gaussian__filter.html#ga615ed4d1996638c45b1a0aff4d3ee50a" title="Creates Gaussian blur filter context.">SimdGaussianBlurInit</a>, <a class="el" href="group__resizing.html#gabee5cdab6c6e3c678381d5b872287184" title="Creates resize context.">SimdResizerInit</a>, <a class="el" href="group__warp__affine.html#ga190e3594adfa696ac4c75863dbd04c4f" title="Creates wrap affine context.">SimdWarpAffineInit</a>, <a class="el" href="group__descrint.html#ga9a34fef8e808d9d7c0c62f3832e0c1d2" title="Initializes Integer Descriptor Engine.">SimdDescrIntInit</a>, <a class="el" href="group__drawing.html#gae6922d862b2d1b120d53ff38dcc6a943" title="Creates font context.">SimdFontInit</a>, <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a>, and others), as well as <a class="el" href="group__object__detection.html#ga6eba3c14ffe78667df250d0d91e0d768" title="Loads a classifier cascade from file.">SimdDetectionLoadA</a>.</p>
+<p >Internally the function performs a polymorphic <code>delete</code> through the virtual destructor of the internal <code>Deletable</code> base class, ensuring that the correct destructor is always invoked regardless of the actual context type.</p>
+<p >Passing <code>NULL</code> is safe and has no effect, consistent with the behaviour of a C++ <code>delete</code> expression on a null pointer.</p>
+<dl class="section note"><dt>Note</dt><dd>Passing a pointer that was not returned by a <a class="el" href="namespace_simd.html">Simd</a> Library context-creation function (for example a pointer from <a class="el" href="group__memory.html#gaf7d22d64276a37f51fbbd9645d8ac08e" title="Allocates an aligned memory block.">SimdAllocate</a>, <code>malloc</code>, or <code>new</code>) is undefined behaviour.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a context to be released. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to the context to be released, or <code>NULL</code>. </td></tr>
   </table>
   </dd>
 </dl>
diff --git a/docs/help/group__object__detection.html b/docs/help/group__object__detection.html
index a14762f74a..a113feab58 100644
--- a/docs/help/group__object__detection.html
+++ b/docs/help/group__object__detection.html
@@ -112,7 +112,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6eba3c14ffe78667df250d0
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions <a class="el" href="group__object__detection.html#ga2741acb9a4edf6d227c83ec3cf8ab1e3" title="Gets information about the classifier cascade.">SimdDetectionInfo</a> and <a class="el" href="group__object__detection.html#ga1c24fa9c803b2fbff5b175b9d582bfc5" title="Initializes hidden classifier cascade structure to work with given size of input 8-bit gray image.">SimdDetectionInit</a>, and must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions <a class="el" href="group__object__detection.html#ga2741acb9a4edf6d227c83ec3cf8ab1e3" title="Gets information about the classifier cascade.">SimdDetectionInfo</a> and <a class="el" href="group__object__detection.html#ga1c24fa9c803b2fbff5b175b9d582bfc5" title="Initializes hidden classifier cascade structure to work with given size of input 8-bit gray image.">SimdDetectionInit</a>, and must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -141,7 +141,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3505ce44c0562e8d4fe2c82
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions <a class="el" href="group__object__detection.html#ga2741acb9a4edf6d227c83ec3cf8ab1e3" title="Gets information about the classifier cascade.">SimdDetectionInfo</a> and <a class="el" href="group__object__detection.html#ga1c24fa9c803b2fbff5b175b9d582bfc5" title="Initializes hidden classifier cascade structure to work with given size of input 8-bit gray image.">SimdDetectionInit</a>, and must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to loaded cascade. On error it returns NULL. This pointer is used in functions <a class="el" href="group__object__detection.html#ga2741acb9a4edf6d227c83ec3cf8ab1e3" title="Gets information about the classifier cascade.">SimdDetectionInfo</a> and <a class="el" href="group__object__detection.html#ga1c24fa9c803b2fbff5b175b9d582bfc5" title="Initializes hidden classifier cascade structure to work with given size of input 8-bit gray image.">SimdDetectionInit</a>, and must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -295,7 +295,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga1c24fa9c803b2fbff5b175b
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to hidden cascade. On error it returns NULL. This pointer is used in functions <a class="el" href="group__object__detection.html#ga8273855df9316d23cfaa85d981e75471" title="Prepares hidden classifier cascade structure to work with given input 8-bit gray image.">SimdDetectionPrepare</a>, <a class="el" href="group__object__detection.html#gaba65e21ccb1d49e55c899bc344fc8a46" title="Performs object detection with using of HAAR cascade classifier (uses 32-bit float numbers,...">SimdDetectionHaarDetect32fp</a>, <a class="el" href="group__object__detection.html#ga7f0c306ca3748341a43d9b08f895c2ac" title="Performs object detection with using of HAAR cascade classifier (uses 32-bit float numbers,...">SimdDetectionHaarDetect32fi</a>, <a class="el" href="group__object__detection.html#gaf7b8ab17f35c923312d383259dea20e3" title="Performs object detection with using of LBP cascade classifier (uses 32-bit float numbers,...">SimdDetectionLbpDetect32fp</a>, <a class="el" href="group__object__detection.html#ga218cc08cbff2ae5b0807dded3486ac16" title="Performs object detection with using of LBP cascade classifier (uses 32-bit float numbers,...">SimdDetectionLbpDetect32fi</a>, <a class="el" href="group__object__detection.html#ga31f469e40e42cb70563dfe9ab63b6032" title="Performs object detection with using of LBP cascade classifier (uses 16-bit integer numbers,...">SimdDetectionLbpDetect16ip</a> and <a class="el" href="group__object__detection.html#ga2a3409fbda5e1e13852d066cfede2d9f" title="Performs object detection with using of LBP cascade classifier (uses 16-bit integer numbers,...">SimdDetectionLbpDetect16ii</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to hidden cascade. On error it returns NULL. This pointer is used in functions <a class="el" href="group__object__detection.html#ga8273855df9316d23cfaa85d981e75471" title="Prepares hidden classifier cascade structure to work with given input 8-bit gray image.">SimdDetectionPrepare</a>, <a class="el" href="group__object__detection.html#gaba65e21ccb1d49e55c899bc344fc8a46" title="Performs object detection with using of HAAR cascade classifier (uses 32-bit float numbers,...">SimdDetectionHaarDetect32fp</a>, <a class="el" href="group__object__detection.html#ga7f0c306ca3748341a43d9b08f895c2ac" title="Performs object detection with using of HAAR cascade classifier (uses 32-bit float numbers,...">SimdDetectionHaarDetect32fi</a>, <a class="el" href="group__object__detection.html#gaf7b8ab17f35c923312d383259dea20e3" title="Performs object detection with using of LBP cascade classifier (uses 32-bit float numbers,...">SimdDetectionLbpDetect32fp</a>, <a class="el" href="group__object__detection.html#ga218cc08cbff2ae5b0807dded3486ac16" title="Performs object detection with using of LBP cascade classifier (uses 32-bit float numbers,...">SimdDetectionLbpDetect32fi</a>, <a class="el" href="group__object__detection.html#ga31f469e40e42cb70563dfe9ab63b6032" title="Performs object detection with using of LBP cascade classifier (uses 16-bit integer numbers,...">SimdDetectionLbpDetect16ip</a> and <a class="el" href="group__object__detection.html#ga2a3409fbda5e1e13852d066cfede2d9f" title="Performs object detection with using of LBP cascade classifier (uses 16-bit integer numbers,...">SimdDetectionLbpDetect16ii</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
diff --git a/docs/help/group__recursive__bilateral__filter.html b/docs/help/group__recursive__bilateral__filter.html
index 36c4d4fbdf..b4953b49f6 100644
--- a/docs/help/group__recursive__bilateral__filter.html
+++ b/docs/help/group__recursive__bilateral__filter.html
@@ -186,7 +186,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga7000a8eba5de2aa5f2f130f
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to filter context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__recursive__bilateral__filter.html#ga76a84d0867dc68986bb7e32ca811b429" title="Performs image recursive bilateral filtering.">SimdRecursiveBilateralFilterRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to filter context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__recursive__bilateral__filter.html#ga76a84d0867dc68986bb7e32ca811b429" title="Performs image recursive bilateral filtering.">SimdRecursiveBilateralFilterRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -237,7 +237,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga76a84d0867dc68986bb7e32
 <p>Performs image recursive bilateral filtering. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">filter</td><td>- a filter context. It must be created by function <a class="el" href="group__recursive__bilateral__filter.html#ga7000a8eba5de2aa5f2f130f08f335e43" title="Creates Recursive bilateral filter context.">SimdRecursiveBilateralFilterInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">filter</td><td>- a filter context. It must be created by function <a class="el" href="group__recursive__bilateral__filter.html#ga7000a8eba5de2aa5f2f130f08f335e43" title="Creates Recursive bilateral filter context.">SimdRecursiveBilateralFilterInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to pixels data of the original input image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">srcStride</td><td>- a row size (in bytes) of the input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to pixels data of the filtered output image. </td></tr>
diff --git a/docs/help/group__resizing.html b/docs/help/group__resizing.html
index f6a8988737..c9b808fa89 100644
--- a/docs/help/group__resizing.html
+++ b/docs/help/group__resizing.html
@@ -732,7 +732,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gabee5cdab6c6e3c678381d5b
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to resize context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__resizing.html#gaa38558511e560de8645e511740442e83" title="Performs image resizing.">SimdResizerRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to resize context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__resizing.html#gaa38558511e560de8645e511740442e83" title="Performs image resizing.">SimdResizerRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -783,7 +783,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa38558511e560de8645e511
 <p>Performs image resizing. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">resizer</td><td>- a resize context. It must be created by function <a class="el" href="group__resizing.html#gabee5cdab6c6e3c678381d5b872287184" title="Creates resize context.">SimdResizerInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">resizer</td><td>- a resize context. It must be created by function <a class="el" href="group__resizing.html#gabee5cdab6c6e3c678381d5b872287184" title="Creates resize context.">SimdResizerInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to pixels data of the original input image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">srcStride</td><td>- a row size (in bytes) of the input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to pixels data of the resized output image. </td></tr>
diff --git a/docs/help/group__shifting.html b/docs/help/group__shifting.html
index cbfc7cc7f6..62be61fe85 100644
--- a/docs/help/group__shifting.html
+++ b/docs/help/group__shifting.html
@@ -313,7 +313,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gabebd6e86333f15b87c1627c
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to shift detector context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__shifting.html#ga71d9af73149f28f85f8aee0fc8f14e16">SimdShiftDetectorSetBackground</a>, <a class="el" href="group__shifting.html#gaaf472bc7ab2dcbde2de5e81f99e24954">SimdShiftDetectorEstimate</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to shift detector context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__shifting.html#ga71d9af73149f28f85f8aee0fc8f14e16">SimdShiftDetectorSetBackground</a>, <a class="el" href="group__shifting.html#gaaf472bc7ab2dcbde2de5e81f99e24954">SimdShiftDetectorEstimate</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -358,7 +358,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga71d9af73149f28f85f8aee0
 <dl class="section note"><dt>Note</dt><dd>This function used in class <a class="el" href="struct_simd_1_1_shift_detector.html" title="ShiftDetector structure provides shift detection of given region at the image.">Simd::ShiftDetector</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a shift detector context. It must be created by function <a class="el" href="group__shifting.html#gabebd6e86333f15b87c1627c37977a4a7">SimdShiftDetectorInitBuffers</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a shift detector context. It must be created by function <a class="el" href="group__shifting.html#gabebd6e86333f15b87c1627c37977a4a7">SimdShiftDetectorInitBuffers</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bkg</td><td>- a pointer to pixels data of background image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bkgStride</td><td>- a row size of the background image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">makeCopy</td><td>- if true, copy of the background will be created. </td></tr>
@@ -451,7 +451,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaaf472bc7ab2dcbde2de5e81
 <dl class="section note"><dt>Note</dt><dd>This function used in class <a class="el" href="struct_simd_1_1_shift_detector.html" title="ShiftDetector structure provides shift detection of given region at the image.">Simd::ShiftDetector</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a shift detector context. It must be created by function <a class="el" href="group__shifting.html#gabebd6e86333f15b87c1627c37977a4a7">SimdShiftDetectorInitBuffers</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a shift detector context. It must be created by function <a class="el" href="group__shifting.html#gabebd6e86333f15b87c1627c37977a4a7">SimdShiftDetectorInitBuffers</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">curr</td><td>- a pointer to pixels data of current image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">currStride</td><td>- a row size of the current image. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">currWidth</td><td>- a width of current image. </td></tr>
diff --git a/docs/help/group__synet__add.html b/docs/help/group__synet__add.html
index 08ba7d3ad0..439325d0ea 100644
--- a/docs/help/group__synet__add.html
+++ b/docs/help/group__synet__add.html
@@ -143,7 +143,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3ffb605bc0d9a4b3f986ecb
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to add context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in function <a class="el" href="group__synet__add.html#ga0258902b9df03b5c47063f78fe2bef79" title="Performs forward propagation of add algorithm.">SimdSynetAdd16bForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to add context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in function <a class="el" href="group__synet__add.html#ga0258902b9df03b5c47063f78fe2bef79" title="Performs forward propagation of add algorithm.">SimdSynetAdd16bForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -188,7 +188,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga0258902b9df03b5c47063f7
 <p>Performs forward propagation of add algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to add context. It must be created by function <a class="el" href="group__synet__add.html#ga3ffb605bc0d9a4b3f986ecbf1539fb8b" title="Initializes add algorithm.">SimdSynetAdd16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to add context. It must be created by function <a class="el" href="group__synet__add.html#ga3ffb605bc0d9a4b3f986ecbf1539fb8b" title="Initializes add algorithm.">SimdSynetAdd16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a pointer to input A tensor. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">b</td><td>- a pointer to input B tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__convolution__bf16.html b/docs/help/group__synet__convolution__bf16.html
index f1997e4b9b..c204d40b41 100644
--- a/docs/help/group__synet__convolution__bf16.html
+++ b/docs/help/group__synet__convolution__bf16.html
@@ -110,7 +110,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga659aadbe5941aef63424e08
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__convolution__bf16.html#ga1da8e6cf208a427ecffbc706504b8a48" title="Gets size in bytes of external temporary buffer required for BF16 convolution algorithm.">SimdSynetConvolution16bExternalBufferSize</a>, <a class="el" href="group__synet__convolution__bf16.html#gaf1f273a4b853961d94545377a6eb54a7" title="Gets size (in bytes) of internal buffer used inside BF16 convolution algorithm.">SimdSynetConvolution16bInternalBufferSize</a>, <a class="el" href="group__synet__convolution__bf16.html#ga7c71b65192b7355f4812e295802c63c1" title="Gets description of internal implementation of BF16 convolution algorithm.">SimdSynetConvolution16bInfo</a>, SimdSynetConvolution16bSetParams and <a class="el" href="group__synet__convolution__bf16.html#ga7eb1779f16a473549f1273a412782c65" title="Performs forward propagation of BF16 convolution algorithm.">SimdSynetConvolution16bForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__convolution__bf16.html#ga1da8e6cf208a427ecffbc706504b8a48" title="Gets size in bytes of external temporary buffer required for BF16 convolution algorithm.">SimdSynetConvolution16bExternalBufferSize</a>, <a class="el" href="group__synet__convolution__bf16.html#gaf1f273a4b853961d94545377a6eb54a7" title="Gets size (in bytes) of internal buffer used inside BF16 convolution algorithm.">SimdSynetConvolution16bInternalBufferSize</a>, <a class="el" href="group__synet__convolution__bf16.html#ga7c71b65192b7355f4812e295802c63c1" title="Gets description of internal implementation of BF16 convolution algorithm.">SimdSynetConvolution16bInfo</a>, SimdSynetConvolution16bSetParams and <a class="el" href="group__synet__convolution__bf16.html#ga7eb1779f16a473549f1273a412782c65" title="Performs forward propagation of BF16 convolution algorithm.">SimdSynetConvolution16bForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -133,7 +133,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga1da8e6cf208a427ecffbc70
 <p>Gets size in bytes of external temporary buffer required for BF16 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -160,7 +160,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf1f273a4b853961d9454537
 <p>Gets size (in bytes) of internal buffer used inside BF16 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -187,7 +187,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga7c71b65192b7355f4812e29
 <p>Gets description of internal implementation of BF16 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -236,7 +236,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga7eb1779f16a473549f1273a
 <p>Performs forward propagation of BF16 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 convolution context. It must be created by function <a class="el" href="group__synet__convolution__bf16.html#ga659aadbe5941aef63424e08e0466d50e" title="Initializes BF16 convolution algorithm.">SimdSynetConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__convolution__bf16.html#ga1da8e6cf208a427ecffbc706504b8a48" title="Gets size in bytes of external temporary buffer required for BF16 convolution algorithm.">SimdSynetConvolution16bExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__convolution__fp32.html b/docs/help/group__synet__convolution__fp32.html
index b5dd9836a0..3c76b35396 100644
--- a/docs/help/group__synet__convolution__fp32.html
+++ b/docs/help/group__synet__convolution__fp32.html
@@ -106,7 +106,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadf957b215db3ddbcc65083e
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__convolution__fp32.html#ga47b83adc66431926e50b39761efbb2a9" title="Gets size of external temporary buffer required for FP32 convolution algorithm.">SimdSynetConvolution32fExternalBufferSize</a>, <a class="el" href="group__synet__convolution__fp32.html#gaa12366fe5a5850160e63300e99287ca7" title="Gets size of internal buffer used inside FP32 convolution algorithm.">SimdSynetConvolution32fInternalBufferSize</a>, <a class="el" href="group__synet__convolution__fp32.html#ga8daf0410bdb6b45866b889cd809a2814" title="Gets description of internal implementation of FP32 convolution algorithm.">SimdSynetConvolution32fInfo</a>, <a class="el" href="group__synet__convolution__fp32.html#ga0147bce67b7f215178835f0f3e710eaa" title="Sets weights, biases and parameters of activation function required for FP32 convolution algorithm.">SimdSynetConvolution32fSetParams</a> and <a class="el" href="group__synet__convolution__fp32.html#gac34c3e3672c20e307137705511814dbc" title="Performs forward propagation of FP32 convolution algorithm.">SimdSynetConvolution32fForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__convolution__fp32.html#ga47b83adc66431926e50b39761efbb2a9" title="Gets size of external temporary buffer required for FP32 convolution algorithm.">SimdSynetConvolution32fExternalBufferSize</a>, <a class="el" href="group__synet__convolution__fp32.html#gaa12366fe5a5850160e63300e99287ca7" title="Gets size of internal buffer used inside FP32 convolution algorithm.">SimdSynetConvolution32fInternalBufferSize</a>, <a class="el" href="group__synet__convolution__fp32.html#ga8daf0410bdb6b45866b889cd809a2814" title="Gets description of internal implementation of FP32 convolution algorithm.">SimdSynetConvolution32fInfo</a>, <a class="el" href="group__synet__convolution__fp32.html#ga0147bce67b7f215178835f0f3e710eaa" title="Sets weights, biases and parameters of activation function required for FP32 convolution algorithm.">SimdSynetConvolution32fSetParams</a> and <a class="el" href="group__synet__convolution__fp32.html#gac34c3e3672c20e307137705511814dbc" title="Performs forward propagation of FP32 convolution algorithm.">SimdSynetConvolution32fForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -129,7 +129,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga47b83adc66431926e50b397
 <p>Gets size of external temporary buffer required for FP32 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -156,7 +156,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa12366fe5a5850160e63300
 <p>Gets size of internal buffer used inside FP32 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -183,7 +183,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga8daf0410bdb6b45866b889c
 <p>Gets description of internal implementation of FP32 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -238,7 +238,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga0147bce67b7f215178835f0
 <p>Sets weights, biases and parameters of activation function required for FP32 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to convolution weights. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">internal</td><td>- a flag signalizing that weight is stored in the internal buffer. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to bias. Can be NULL. </td></tr>
@@ -290,7 +290,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac34c3e3672c20e307137705
 <p>Performs forward propagation of FP32 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 convolution context. It must be created by function <a class="el" href="group__synet__convolution__fp32.html#gadf957b215db3ddbcc65083ed964f0b46" title="Initializes FP32 convolution algorithm.">SimdSynetConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__convolution__fp32.html#ga47b83adc66431926e50b39761efbb2a9" title="Gets size of external temporary buffer required for FP32 convolution algorithm.">SimdSynetConvolution32fExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__convolution__int8.html b/docs/help/group__synet__convolution__int8.html
index 0913d872d4..9bd02e654a 100644
--- a/docs/help/group__synet__convolution__int8.html
+++ b/docs/help/group__synet__convolution__int8.html
@@ -113,7 +113,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga53d7304604ffae850717450
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__convolution__int8.html#gaa6fcba4caa5d1bf4844d7f5d5bd0fb61" title="Gets size in bytes of external temporary buffer required for INT8 convolution algorithm.">SimdSynetConvolution8iExternalBufferSize</a>, <a class="el" href="group__synet__convolution__int8.html#ga3841bca2ac310225763ae483429967cc" title="Gets size of internal buffer used inside INT8 convolution algorithm.">SimdSynetConvolution8iInternalBufferSize</a>, <a class="el" href="group__synet__convolution__int8.html#ga58e619c50627e2c7033d5100e8c754eb" title="Gets description of internal implementation of INT8 convolution algorithm.">SimdSynetConvolution8iInfo</a>, <a class="el" href="group__synet__convolution__int8.html#ga808feb178234a756f2b5bec560cd8e9a" title="Sets weights, biases, parameters of activation function, input/output tensor statistics required for ...">SimdSynetConvolution8iSetParams</a> and <a class="el" href="group__synet__convolution__int8.html#gab3af52ee0b31eba79308941cd0eeb5b4" title="Performs forward propagation of INT8 convolution algorithm.">SimdSynetConvolution8iForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to INT8 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__convolution__int8.html#gaa6fcba4caa5d1bf4844d7f5d5bd0fb61" title="Gets size in bytes of external temporary buffer required for INT8 convolution algorithm.">SimdSynetConvolution8iExternalBufferSize</a>, <a class="el" href="group__synet__convolution__int8.html#ga3841bca2ac310225763ae483429967cc" title="Gets size of internal buffer used inside INT8 convolution algorithm.">SimdSynetConvolution8iInternalBufferSize</a>, <a class="el" href="group__synet__convolution__int8.html#ga58e619c50627e2c7033d5100e8c754eb" title="Gets description of internal implementation of INT8 convolution algorithm.">SimdSynetConvolution8iInfo</a>, <a class="el" href="group__synet__convolution__int8.html#ga808feb178234a756f2b5bec560cd8e9a" title="Sets weights, biases, parameters of activation function, input/output tensor statistics required for ...">SimdSynetConvolution8iSetParams</a> and <a class="el" href="group__synet__convolution__int8.html#gab3af52ee0b31eba79308941cd0eeb5b4" title="Performs forward propagation of INT8 convolution algorithm.">SimdSynetConvolution8iForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -136,7 +136,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa6fcba4caa5d1bf4844d7f5
 <p>Gets size in bytes of external temporary buffer required for INT8 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -163,7 +163,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3841bca2ac310225763ae48
 <p>Gets size of internal buffer used inside INT8 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -190,7 +190,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga58e619c50627e2c7033d510
 <p>Gets description of internal implementation of INT8 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -245,7 +245,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga808feb178234a756f2b5bec
 <p>Sets weights, biases, parameters of activation function, input/output tensor statistics required for INT8 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to original (32-bit float point) convolution weights. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to original (32-bit float point) bias. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">params</td><td>- a pointer to original (32-bit float point) parameters of activation functions (see <a class="el" href="group__synet__types.html#gac0134d6cef9d64de8b89171e79ef4568">SimdConvolutionActivationType</a>). Can be NULL. </td></tr>
@@ -297,7 +297,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gab3af52ee0b31eba79308941
 <p>Performs forward propagation of INT8 convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__convolution__int8.html#ga53d7304604ffae8507174504bc1c4151" title="Initializes INT8 convolution algorithm.">SimdSynetConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__convolution__int8.html#gaa6fcba4caa5d1bf4844d7f5d5bd0fb61" title="Gets size in bytes of external temporary buffer required for INT8 convolution algorithm.">SimdSynetConvolution8iExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__deconvolution__bf16.html b/docs/help/group__synet__deconvolution__bf16.html
index 6f8a6bf5f7..ccfa6c96c5 100644
--- a/docs/help/group__synet__deconvolution__bf16.html
+++ b/docs/help/group__synet__deconvolution__bf16.html
@@ -110,7 +110,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga18371c79e130668f030c84a
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__deconvolution__bf16.html#gaf0c816ca00f64a2548e46c35bac88a92" title="Gets size in bytes of external temporary buffer required for BF16 deconvolution algorithm.">SimdSynetDeconvolution16bExternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__bf16.html#ga5cd0d28f6b351e6f356714cace569dd3" title="Gets size (in bytes) of internal buffer used inside BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__bf16.html#ga795b35e0b76a7e58a7fac77fe743ba87" title="Gets description of internal implementation of BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInfo</a>, SimdSynetDeconvolution16bSetParams and <a class="el" href="group__synet__deconvolution__bf16.html#ga67d174c2c4aff41840cac42c1649eedb" title="Performs forward propagation of BF16 deconvolution algorithm.">SimdSynetDeconvolution16bForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__deconvolution__bf16.html#gaf0c816ca00f64a2548e46c35bac88a92" title="Gets size in bytes of external temporary buffer required for BF16 deconvolution algorithm.">SimdSynetDeconvolution16bExternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__bf16.html#ga5cd0d28f6b351e6f356714cace569dd3" title="Gets size (in bytes) of internal buffer used inside BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__bf16.html#ga795b35e0b76a7e58a7fac77fe743ba87" title="Gets description of internal implementation of BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInfo</a>, SimdSynetDeconvolution16bSetParams and <a class="el" href="group__synet__deconvolution__bf16.html#ga67d174c2c4aff41840cac42c1649eedb" title="Performs forward propagation of BF16 deconvolution algorithm.">SimdSynetDeconvolution16bForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -133,7 +133,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf0c816ca00f64a2548e46c3
 <p>Gets size in bytes of external temporary buffer required for BF16 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -160,7 +160,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga5cd0d28f6b351e6f356714c
 <p>Gets size (in bytes) of internal buffer used inside BF16 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -187,7 +187,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga795b35e0b76a7e58a7fac77
 <p>Gets description of internal implementation of BF16 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -236,7 +236,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga67d174c2c4aff41840cac42
 <p>Performs forward propagation of BF16 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__bf16.html#ga18371c79e130668f030c84a861ea247a" title="Initializes BF16 deconvolution algorithm.">SimdSynetDeconvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__deconvolution__bf16.html#gaf0c816ca00f64a2548e46c35bac88a92" title="Gets size in bytes of external temporary buffer required for BF16 deconvolution algorithm.">SimdSynetDeconvolution16bExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__deconvolution__fp32.html b/docs/help/group__synet__deconvolution__fp32.html
index 0c865b2a24..f8c8c25b9e 100644
--- a/docs/help/group__synet__deconvolution__fp32.html
+++ b/docs/help/group__synet__deconvolution__fp32.html
@@ -113,7 +113,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gafb81a917363606558d66124
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__deconvolution__fp32.html#ga52f5c821b4f9539155f7e8ac1b9b3d2d" title="Gets size of external temporary buffer required for FP32 deconvolution algorithm.">SimdSynetDeconvolution32fExternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__fp32.html#ga01e3e5f738288088516d58ef81463a98" title="Gets size of internal buffer used inside FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__fp32.html#ga8a7d6429f8bb03efb89c619a49e48faa" title="Gets description of internal implementation of FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInfo</a>, <a class="el" href="group__synet__deconvolution__fp32.html#ga84c49be88db5c6ca7a556e21096b6ca6" title="Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm.">SimdSynetDeconvolution32fSetParams</a> and <a class="el" href="group__synet__deconvolution__fp32.html#gaa852d017af38ce7713739846fd0c9357" title="Performs forward propagation of FP32 deconvolution algorithm.">SimdSynetDeconvolution32fForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 deconvolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__deconvolution__fp32.html#ga52f5c821b4f9539155f7e8ac1b9b3d2d" title="Gets size of external temporary buffer required for FP32 deconvolution algorithm.">SimdSynetDeconvolution32fExternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__fp32.html#ga01e3e5f738288088516d58ef81463a98" title="Gets size of internal buffer used inside FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInternalBufferSize</a>, <a class="el" href="group__synet__deconvolution__fp32.html#ga8a7d6429f8bb03efb89c619a49e48faa" title="Gets description of internal implementation of FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInfo</a>, <a class="el" href="group__synet__deconvolution__fp32.html#ga84c49be88db5c6ca7a556e21096b6ca6" title="Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm.">SimdSynetDeconvolution32fSetParams</a> and <a class="el" href="group__synet__deconvolution__fp32.html#gaa852d017af38ce7713739846fd0c9357" title="Performs forward propagation of FP32 deconvolution algorithm.">SimdSynetDeconvolution32fForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -136,7 +136,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga52f5c821b4f9539155f7e8a
 <p>Gets size of external temporary buffer required for FP32 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -163,7 +163,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga01e3e5f738288088516d58e
 <p>Gets size of internal buffer used inside FP32 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -190,7 +190,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga8a7d6429f8bb03efb89c619
 <p>Gets description of internal implementation of FP32 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -245,7 +245,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga84c49be88db5c6ca7a556e2
 <p>Sets weights, biases and parameters of activation function required for FP32 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to deconvolution weights. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">internal</td><td>- a flag signalizing that weight is stored in the internal buffer. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to bias. Can be NULL. </td></tr>
@@ -297,7 +297,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa852d017af38ce771373984
 <p>Performs forward propagation of FP32 deconvolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 deconvolution context. It must be created by function <a class="el" href="group__synet__deconvolution__fp32.html#gafb81a917363606558d66124c2f3a7bad" title="Initializes FP32 deconvolution algorithm.">SimdSynetDeconvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__deconvolution__fp32.html#ga52f5c821b4f9539155f7e8ac1b9b3d2d" title="Gets size of external temporary buffer required for FP32 deconvolution algorithm.">SimdSynetDeconvolution32fExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__gather__elements.html b/docs/help/group__synet__gather__elements.html
index 3cc92ba300..0590991246 100644
--- a/docs/help/group__synet__gather__elements.html
+++ b/docs/help/group__synet__gather__elements.html
@@ -153,7 +153,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa3a5fa80fb162f011f5f782
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to gather elements context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions :: SimdSynetGatherElementsSetIndex, <a class="el" href="group__synet__gather__elements.html#ga3db3ad3123e342e8087abd38dce51b7b" title="Gets size of internal buffer in bytes used inside gather elements algorithm.">SimdSynetGatherElementsInternalBufferSize</a>, and <a class="el" href="group__synet__gather__elements.html#ga89560f21767f6c28a34c2cbfaefb9684" title="Performs forward propagation of gather elements algorithm.">SimdSynetGatherElementsForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to gather elements context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions :: SimdSynetGatherElementsSetIndex, <a class="el" href="group__synet__gather__elements.html#ga3db3ad3123e342e8087abd38dce51b7b" title="Gets size of internal buffer in bytes used inside gather elements algorithm.">SimdSynetGatherElementsInternalBufferSize</a>, and <a class="el" href="group__synet__gather__elements.html#ga89560f21767f6c28a34c2cbfaefb9684" title="Performs forward propagation of gather elements algorithm.">SimdSynetGatherElementsForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -186,7 +186,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga68c675cebb866b8446ee4b1
 <p>Sets and analyses constant gather elements indexes. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to gather elements context. It must be created by function <a class="el" href="group__synet__gather__elements.html#gaa3a5fa80fb162f011f5f782b70436669">SimdSynetGatherElementsInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to gather elements context. It must be created by function <a class="el" href="group__synet__gather__elements.html#gaa3a5fa80fb162f011f5f782b70436669">SimdSynetGatherElementsInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">idx</td><td>- a pointer to tensor with indexes. It can be INT32 or INT64. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. </td></tr>
   </table>
   </dd>
@@ -213,7 +213,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3db3ad3123e342e8087abd3
 <p>Gets size of internal buffer in bytes used inside gather elements algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to gather elements context. It must be created by function <a class="el" href="group__synet__gather__elements.html#gaa3a5fa80fb162f011f5f782b70436669">SimdSynetGatherElementsInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to gather elements context. It must be created by function <a class="el" href="group__synet__gather__elements.html#gaa3a5fa80fb162f011f5f782b70436669">SimdSynetGatherElementsInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -262,7 +262,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga89560f21767f6c28a34c2cb
 <p>Performs forward propagation of gather elements algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to gather elements algorithm. It must be created by function <a class="el" href="group__synet__gather__elements.html#gaa3a5fa80fb162f011f5f782b70436669">SimdSynetGatherElementsInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to gather elements algorithm. It must be created by function <a class="el" href="group__synet__gather__elements.html#gaa3a5fa80fb162f011f5f782b70436669">SimdSynetGatherElementsInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. Its size = outer[0] * .. * outer[outerSize - 1] * srcCount * inner. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">idx</td><td>- a pointer to index tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. Its size = outer[0] * .. * outer[outerSize - 1] * idxCount * inner. </td></tr>
diff --git a/docs/help/group__synet__grid__sample.html b/docs/help/group__synet__grid__sample.html
index 3979a126b8..48334c5ad3 100644
--- a/docs/help/group__synet__grid__sample.html
+++ b/docs/help/group__synet__grid__sample.html
@@ -222,7 +222,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaffb2079b0d01fb019828625
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to grid sample 2D context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__grid__sample.html#ga14fd30e2f1e0423bb8f82b9d80e26664" title="Gets size of internal buffer used inside permute algorithm.">SimdSynetGridSample2dInternalBufferSize</a>, and <a class="el" href="group__synet__grid__sample.html#ga38c4ee8918e69f0ba545a763fc0e3643" title="Performs forward propagation of grid sample algorithm.">SimdSynetGridSample2dForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to grid sample 2D context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__grid__sample.html#ga14fd30e2f1e0423bb8f82b9d80e26664" title="Gets size of internal buffer used inside permute algorithm.">SimdSynetGridSample2dInternalBufferSize</a>, and <a class="el" href="group__synet__grid__sample.html#ga38c4ee8918e69f0ba545a763fc0e3643" title="Performs forward propagation of grid sample algorithm.">SimdSynetGridSample2dForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -245,7 +245,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga14fd30e2f1e0423bb8f82b9
 <p>Gets size of internal buffer used inside permute algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to grid sample 2D context. It must be created by function <a class="el" href="group__synet__grid__sample.html#gaffb2079b0d01fb0198286258acd9ef7e" title="Initializes grid sample 2D algorithm.">SimdSynetGridSample2dInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to grid sample 2D context. It must be created by function <a class="el" href="group__synet__grid__sample.html#gaffb2079b0d01fb0198286258acd9ef7e" title="Initializes grid sample 2D algorithm.">SimdSynetGridSample2dInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -294,7 +294,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga38c4ee8918e69f0ba545a76
 <p>Performs forward propagation of grid sample algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to grid sample 2D context. It must be created by function <a class="el" href="group__synet__grid__sample.html#gaffb2079b0d01fb0198286258acd9ef7e" title="Initializes grid sample 2D algorithm.">SimdSynetGridSample2dInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to grid sample 2D context. It must be created by function <a class="el" href="group__synet__grid__sample.html#gaffb2079b0d01fb0198286258acd9ef7e" title="Initializes grid sample 2D algorithm.">SimdSynetGridSample2dInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. It has size = batch * channels * srcH * srcW. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">grd</td><td>- a pointer to grid tensor. It has size = batch * dstH * dstW * 2. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. It has size = batch * channels * dstH * dstW. </td></tr>
diff --git a/docs/help/group__synet__inner__product.html b/docs/help/group__synet__inner__product.html
index 7d5b9f31d0..279ea25165 100644
--- a/docs/help/group__synet__inner__product.html
+++ b/docs/help/group__synet__inner__product.html
@@ -144,7 +144,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac76853987f78ee8ecf43291
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 inner product context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__inner__product.html#gaac5aae215c0d9e365585b950c60b5542" title="Gets size of internal buffer used inside FP32 inner product algorithm.">SimdSynetInnerProduct32fInternalBufferSize</a>, <a class="el" href="group__synet__inner__product.html#ga0402e7e1393ac674c6b4546abfb4ff21" title="Gets size of external buffer used in FP32 inner product algorithm.">SimdSynetInnerProduct32fExternalBufferSize</a>, <a class="el" href="group__synet__inner__product.html#gae5e021aa481477506995d361d03b46dd" title="Sets weights, biases and parameters of activation function required for FP32 inner product algorithm.">SimdSynetInnerProduct32fSetParams</a> and <a class="el" href="group__synet__inner__product.html#gac8b3ddda7d17004426743db647e875d1" title="Performs forward propagation of FP32 inner product algorithm.">SimdSynetInnerProduct32fForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 inner product context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__inner__product.html#gaac5aae215c0d9e365585b950c60b5542" title="Gets size of internal buffer used inside FP32 inner product algorithm.">SimdSynetInnerProduct32fInternalBufferSize</a>, <a class="el" href="group__synet__inner__product.html#ga0402e7e1393ac674c6b4546abfb4ff21" title="Gets size of external buffer used in FP32 inner product algorithm.">SimdSynetInnerProduct32fExternalBufferSize</a>, <a class="el" href="group__synet__inner__product.html#gae5e021aa481477506995d361d03b46dd" title="Sets weights, biases and parameters of activation function required for FP32 inner product algorithm.">SimdSynetInnerProduct32fSetParams</a> and <a class="el" href="group__synet__inner__product.html#gac8b3ddda7d17004426743db647e875d1" title="Performs forward propagation of FP32 inner product algorithm.">SimdSynetInnerProduct32fForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -167,7 +167,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaac5aae215c0d9e365585b95
 <p>Gets size of internal buffer used inside FP32 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -194,7 +194,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga0402e7e1393ac674c6b4546
 <p>Gets size of external buffer used in FP32 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -249,7 +249,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae5e021aa481477506995d36
 <p>Sets weights, biases and parameters of activation function required for FP32 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to inner product weights. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">internal</td><td>- a flag signalizing that weight is stored in the internal buffer. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to bias. Can be NULL. </td></tr>
@@ -307,7 +307,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac8b3ddda7d17004426743db
 <p>Performs forward propagation of FP32 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 inner product context. It must be created by function <a class="el" href="group__synet__inner__product.html#gac76853987f78ee8ecf43291888a0b9f1" title="Initializes FP32 inner product algorithm.">SimdSynetInnerProduct32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">A</td><td>- a pointer to A matrix. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">B</td><td>- a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__inner__product__bf16.html#ga2068a3866a4d83fce8b8b3c64a38bf85" title="Gets size in bytes of external buffer used in BF16 inner product algorithm.">SimdSynetInnerProduct16bExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
diff --git a/docs/help/group__synet__inner__product__bf16.html b/docs/help/group__synet__inner__product__bf16.html
index 6123b7750c..058dc69610 100644
--- a/docs/help/group__synet__inner__product__bf16.html
+++ b/docs/help/group__synet__inner__product__bf16.html
@@ -166,7 +166,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga80b323e00e9c6bac9434203
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 inner product context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__inner__product__bf16.html#ga77a93ef1a0409d3e66835b3e2d6d92ac" title="Gets size in bytes of internal buffer used inside BF16 inner product algorithm.">SimdSynetInnerProduct16bInternalBufferSize</a>, <a class="el" href="group__synet__inner__product__bf16.html#ga2068a3866a4d83fce8b8b3c64a38bf85" title="Gets size in bytes of external buffer used in BF16 inner product algorithm.">SimdSynetInnerProduct16bExternalBufferSize</a>, <a class="el" href="group__synet__inner__product__bf16.html#ga8ee7e775a1e7932e2322ff1f02996ba1" title="Gets string with description of internal implementation of BF16 inner product algorithm.">SimdSynetInnerProduct16bInfo</a>, SimdSynetInnerProduct16bSetParams and <a class="el" href="group__synet__inner__product__bf16.html#gae873b7b89f3c3275f521946c9937571d" title="Performs forward propagation of BF16 inner product algorithm.">SimdSynetInnerProduct16bForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 inner product context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__inner__product__bf16.html#ga77a93ef1a0409d3e66835b3e2d6d92ac" title="Gets size in bytes of internal buffer used inside BF16 inner product algorithm.">SimdSynetInnerProduct16bInternalBufferSize</a>, <a class="el" href="group__synet__inner__product__bf16.html#ga2068a3866a4d83fce8b8b3c64a38bf85" title="Gets size in bytes of external buffer used in BF16 inner product algorithm.">SimdSynetInnerProduct16bExternalBufferSize</a>, <a class="el" href="group__synet__inner__product__bf16.html#ga8ee7e775a1e7932e2322ff1f02996ba1" title="Gets string with description of internal implementation of BF16 inner product algorithm.">SimdSynetInnerProduct16bInfo</a>, SimdSynetInnerProduct16bSetParams and <a class="el" href="group__synet__inner__product__bf16.html#gae873b7b89f3c3275f521946c9937571d" title="Performs forward propagation of BF16 inner product algorithm.">SimdSynetInnerProduct16bForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -189,7 +189,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga77a93ef1a0409d3e66835b3
 <p>Gets size in bytes of internal buffer used inside BF16 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -216,7 +216,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga2068a3866a4d83fce8b8b3c
 <p>Gets size in bytes of external buffer used in BF16 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -243,7 +243,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga8ee7e775a1e7932e2322ff1
 <p>Gets string with description of internal implementation of BF16 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -298,7 +298,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae873b7b89f3c3275f521946
 <p>Performs forward propagation of BF16 inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 inner product context. It must be created by function <a class="el" href="group__synet__inner__product__bf16.html#ga80b323e00e9c6bac9434203f6b27ed99" title="Initializes BF16 inner product (matrix multiplication) algorithm.">SimdSynetInnerProduct16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">A</td><td>- a pointer to A matrix. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">B</td><td>- a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B (weight) in function SimdSynetInnerProduct16bSetParams. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__inner__product__bf16.html#ga2068a3866a4d83fce8b8b3c64a38bf85" title="Gets size in bytes of external buffer used in BF16 inner product algorithm.">SimdSynetInnerProduct16bExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
diff --git a/docs/help/group__synet__merged__convolution__bf16.html b/docs/help/group__synet__merged__convolution__bf16.html
index 8b75f74f21..91acb6f329 100644
--- a/docs/help/group__synet__merged__convolution__bf16.html
+++ b/docs/help/group__synet__merged__convolution__bf16.html
@@ -120,7 +120,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga823d8453c012ec9197a32c3
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__merged__convolution__bf16.html#ga5d665fe0f3d55e7e1bd2aad0608fb065" title="Gets size in bytes of external temporary buffer required for BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bExternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__bf16.html#ga5905cb72a74b45e487301e6f867ab7e4" title="Gets size in bytes of internal buffer used inside BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__bf16.html#ga21ed275291baa407a917a0c83ebbaa2b" title="Gets description of internal implementation of BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInfo</a>, <a class="el" href="group__synet__merged__convolution__bf16.html#gaf2d799425500289c8646ba5055b2d882" title="Sets weights, biases and parameters of activation function required for BF16 merged convolution algor...">SimdSynetMergedConvolution16bSetParams</a> and <a class="el" href="group__synet__merged__convolution__bf16.html#gad4f3db9d3195acac1af7b332ce7bfd1d" title="Performs forward propagation of BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to BF16 merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__merged__convolution__bf16.html#ga5d665fe0f3d55e7e1bd2aad0608fb065" title="Gets size in bytes of external temporary buffer required for BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bExternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__bf16.html#ga5905cb72a74b45e487301e6f867ab7e4" title="Gets size in bytes of internal buffer used inside BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__bf16.html#ga21ed275291baa407a917a0c83ebbaa2b" title="Gets description of internal implementation of BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInfo</a>, <a class="el" href="group__synet__merged__convolution__bf16.html#gaf2d799425500289c8646ba5055b2d882" title="Sets weights, biases and parameters of activation function required for BF16 merged convolution algor...">SimdSynetMergedConvolution16bSetParams</a> and <a class="el" href="group__synet__merged__convolution__bf16.html#gad4f3db9d3195acac1af7b332ce7bfd1d" title="Performs forward propagation of BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -143,7 +143,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga5d665fe0f3d55e7e1bd2aad
 <p>Gets size in bytes of external temporary buffer required for BF16 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -170,7 +170,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga5905cb72a74b45e487301e6
 <p>Gets size in bytes of internal buffer used inside BF16 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -197,7 +197,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga21ed275291baa407a917a0c
 <p>Gets description of internal implementation of BF16 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -246,7 +246,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf2d799425500289c8646ba5
 <p>Sets weights, biases and parameters of activation function required for BF16 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to the array with pointers to convolution weights. The array size is determined by number of merged convolutions. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to the array with pointers to bias. The array size is determined by number of merged convolutions. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">params</td><td>- a pointer to the array with pointers to parameters of the activation functions (see <a class="el" href="group__synet__types.html#gac0134d6cef9d64de8b89171e79ef4568">SimdConvolutionActivationType</a>). The array size is determined by number of merged convolutions. Can be NULL. </td></tr>
@@ -297,7 +297,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad4f3db9d3195acac1af7b33
 <p>Performs forward propagation of BF16 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to BF16 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga823d8453c012ec9197a32c31e8533716" title="Initializes BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size in bytes of the external temporary buffer is determined by function <a class="el" href="group__synet__merged__convolution__bf16.html#ga5d665fe0f3d55e7e1bd2aad0608fb065" title="Gets size in bytes of external temporary buffer required for BF16 merged convolution algorithm.">SimdSynetMergedConvolution16bExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output image. </td></tr>
diff --git a/docs/help/group__synet__merged__convolution__fp32.html b/docs/help/group__synet__merged__convolution__fp32.html
index 569861f746..2aeefb11f9 100644
--- a/docs/help/group__synet__merged__convolution__fp32.html
+++ b/docs/help/group__synet__merged__convolution__fp32.html
@@ -120,7 +120,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaed6b10310042f327ae9763e
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__merged__convolution__fp32.html#ga884e019594562cb69235ad8c61fdc6f5" title="Gets size of external temporary buffer required for FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fExternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__fp32.html#ga3ebe7b865e3c3eb55ff211dd8e182a92" title="Gets size of internal buffer used inside FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__fp32.html#ga6268605fb7d3980dc9ae6f85871d69b5" title="Gets description of internal implementation of FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInfo</a>, <a class="el" href="group__synet__merged__convolution__fp32.html#gabf080d79703a4c4b40d69dc350c2057f" title="Sets weights, biases and parameters of activation function required for FP32 merged convolution algor...">SimdSynetMergedConvolution32fSetParams</a> and <a class="el" href="group__synet__merged__convolution__fp32.html#ga79036c777c2b67def4c02e4eae4b1c53" title="Performs forward propagation of FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to FP32 merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__merged__convolution__fp32.html#ga884e019594562cb69235ad8c61fdc6f5" title="Gets size of external temporary buffer required for FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fExternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__fp32.html#ga3ebe7b865e3c3eb55ff211dd8e182a92" title="Gets size of internal buffer used inside FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__fp32.html#ga6268605fb7d3980dc9ae6f85871d69b5" title="Gets description of internal implementation of FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInfo</a>, <a class="el" href="group__synet__merged__convolution__fp32.html#gabf080d79703a4c4b40d69dc350c2057f" title="Sets weights, biases and parameters of activation function required for FP32 merged convolution algor...">SimdSynetMergedConvolution32fSetParams</a> and <a class="el" href="group__synet__merged__convolution__fp32.html#ga79036c777c2b67def4c02e4eae4b1c53" title="Performs forward propagation of FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -143,7 +143,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga884e019594562cb69235ad8
 <p>Gets size of external temporary buffer required for FP32 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -170,7 +170,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3ebe7b865e3c3eb55ff211d
 <p>Gets size of internal buffer used inside FP32 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -197,7 +197,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6268605fb7d3980dc9ae6f8
 <p>Gets description of internal implementation of FP32 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -252,7 +252,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gabf080d79703a4c4b40d69dc
 <p>Sets weights, biases and parameters of activation function required for FP32 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to the array with pointers to convolution weights. The array size is determined by number of merged convolutions. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">internal</td><td>- a pointer to the array of flags signalizing that weights are stored in the internal buffer. The array size is determined by number of merged convolutions. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to the array with pointers to bias. The array size is determined by number of merged convolutions. Can be NULL. </td></tr>
@@ -304,7 +304,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga79036c777c2b67def4c02e4
 <p>Performs forward propagation of FP32 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to FP32 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__fp32.html#gaed6b10310042f327ae9763e621dcfb1b" title="Initializes FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__merged__convolution__fp32.html#ga884e019594562cb69235ad8c61fdc6f5" title="Gets size of external temporary buffer required for FP32 merged convolution algorithm.">SimdSynetMergedConvolution32fExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output image. </td></tr>
diff --git a/docs/help/group__synet__merged__convolution__int8.html b/docs/help/group__synet__merged__convolution__int8.html
index bc02accf06..a1b0e9aefd 100644
--- a/docs/help/group__synet__merged__convolution__int8.html
+++ b/docs/help/group__synet__merged__convolution__int8.html
@@ -120,7 +120,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae27c7d7b6164ccb162efc39
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to INT8 merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__merged__convolution__int8.html#ga5cb8517d49c1df7b229cd971ef838c73" title="Gets size in bytes of external temporary buffer required for INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iExternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__int8.html#gadc219d5cb851b9c0cd4a6b452f788a53" title="Gets size in bytes of internal buffer used inside INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__int8.html#gac85bc162b76df5b2b571924889202994" title="Gets description of internal implementation of INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInfo</a>, <a class="el" href="group__synet__merged__convolution__int8.html#ga6a81417d8f7ad1b2ce96e52638273e36" title="Sets weights, biases and parameters of activation function required for INT8 merged convolution algor...">SimdSynetMergedConvolution8iSetParams</a> and <a class="el" href="group__synet__merged__convolution__int8.html#ga1a9301fc3e500612e96e5275ad3ddb8d" title="Performs forward propagation of INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to INT8 merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__merged__convolution__int8.html#ga5cb8517d49c1df7b229cd971ef838c73" title="Gets size in bytes of external temporary buffer required for INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iExternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__int8.html#gadc219d5cb851b9c0cd4a6b452f788a53" title="Gets size in bytes of internal buffer used inside INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInternalBufferSize</a>, <a class="el" href="group__synet__merged__convolution__int8.html#gac85bc162b76df5b2b571924889202994" title="Gets description of internal implementation of INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInfo</a>, <a class="el" href="group__synet__merged__convolution__int8.html#ga6a81417d8f7ad1b2ce96e52638273e36" title="Sets weights, biases and parameters of activation function required for INT8 merged convolution algor...">SimdSynetMergedConvolution8iSetParams</a> and <a class="el" href="group__synet__merged__convolution__int8.html#ga1a9301fc3e500612e96e5275ad3ddb8d" title="Performs forward propagation of INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -143,7 +143,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga5cb8517d49c1df7b229cd97
 <p>Gets size in bytes of external temporary buffer required for INT8 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -170,7 +170,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadc219d5cb851b9c0cd4a6b4
 <p>Gets size in bytes of internal buffer used inside INT8 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -197,7 +197,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac85bc162b76df5b2b571924
 <p>Gets description of internal implementation of INT8 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -258,7 +258,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6a81417d8f7ad1b2ce96e52
 <p>Sets weights, biases and parameters of activation function required for INT8 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to the array with pointers to convolution weights. The array size is determined by number of merged convolutions. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">internal</td><td>- a pointer to the array of flags signalizing that weights are stored in the internal buffer. The array size is determined by number of merged convolutions. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to the array with pointers to bias. The array size is determined by number of merged convolutions. Can be NULL. </td></tr>
@@ -311,7 +311,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga1a9301fc3e500612e96e527
 <p>Performs forward propagation of INT8 merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 merged convolution context. It must be created by function <a class="el" href="group__synet__merged__convolution__int8.html#gae27c7d7b6164ccb162efc397d1ddfb9f" title="Initializes INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size in bytes of the external temporary buffer is determined by function <a class="el" href="group__synet__merged__convolution__int8.html#ga5cb8517d49c1df7b229cd971ef838c73" title="Gets size in bytes of external temporary buffer required for INT8 merged convolution algorithm.">SimdSynetMergedConvolution8iExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output image. </td></tr>
diff --git a/docs/help/group__synet__permute.html b/docs/help/group__synet__permute.html
index e60ed152c8..3c931ecfa9 100644
--- a/docs/help/group__synet__permute.html
+++ b/docs/help/group__synet__permute.html
@@ -111,7 +111,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaab60c1320f3858dcde156c8
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to permute context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__permute.html#ga19c5f9b48d3652187cd63618aa871ba4" title="Gets size of internal buffer used inside permute algorithm.">SimdSynetPermuteInternalBufferSize</a>, and <a class="el" href="group__synet__permute.html#gad7252175085b7daa48c7a802bd2bda29" title="Performs forward propagation of permute algorithm.">SimdSynetPermuteForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to permute context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__permute.html#ga19c5f9b48d3652187cd63618aa871ba4" title="Gets size of internal buffer used inside permute algorithm.">SimdSynetPermuteInternalBufferSize</a>, and <a class="el" href="group__synet__permute.html#gad7252175085b7daa48c7a802bd2bda29" title="Performs forward propagation of permute algorithm.">SimdSynetPermuteForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -134,7 +134,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga19c5f9b48d3652187cd6361
 <p>Gets size of internal buffer used inside permute algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to permute context. It must be created by function <a class="el" href="group__synet__permute.html#gaab60c1320f3858dcde156c87e56a6437" title="Initializes permute algorithm.">SimdSynetPermuteInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to permute context. It must be created by function <a class="el" href="group__synet__permute.html#gaab60c1320f3858dcde156c87e56a6437" title="Initializes permute algorithm.">SimdSynetPermuteInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -177,7 +177,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad7252175085b7daa48c7a80
 <p>Performs forward propagation of permute algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to permute context. It must be created by function <a class="el" href="group__synet__permute.html#gaab60c1320f3858dcde156c87e56a6437" title="Initializes permute algorithm.">SimdSynetPermuteInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to permute context. It must be created by function <a class="el" href="group__synet__permute.html#gaab60c1320f3858dcde156c87e56a6437" title="Initializes permute algorithm.">SimdSynetPermuteInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output image. </td></tr>
   </table>
diff --git a/docs/help/group__synet__quantized__add.html b/docs/help/group__synet__quantized__add.html
index d11a5370e2..bf9e58e033 100644
--- a/docs/help/group__synet__quantized__add.html
+++ b/docs/help/group__synet__quantized__add.html
@@ -185,7 +185,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga5fcc723e03d0ab761231434
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to quantized addition context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in function <a class="el" href="group__synet__quantized__add.html#ga4ac762cb4d76eacf764bbed5eb65887c" title="Performs forward propagation of quantized addition algorithm.">SimdSynetQuantizedAddForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to quantized addition context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in function <a class="el" href="group__synet__quantized__add.html#ga4ac762cb4d76eacf764bbed5eb65887c" title="Performs forward propagation of quantized addition algorithm.">SimdSynetQuantizedAddForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -230,7 +230,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga4ac762cb4d76eacf764bbed
 <p>Performs forward propagation of quantized addition algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized addition context. It must be created by function <a class="el" href="group__synet__quantized__add.html#ga5fcc723e03d0ab761231434e7158f9b3" title="Initializes quantized addition algorithm.">SimdSynetQuantizedAddInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized addition context. It must be created by function <a class="el" href="group__synet__quantized__add.html#ga5fcc723e03d0ab761231434e7158f9b3" title="Initializes quantized addition algorithm.">SimdSynetQuantizedAddInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a pointer to input A tensor. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">b</td><td>- a pointer to input B tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__quantized__convolution.html b/docs/help/group__synet__quantized__convolution.html
index 36237e90b6..a269e8fdaf 100644
--- a/docs/help/group__synet__quantized__convolution.html
+++ b/docs/help/group__synet__quantized__convolution.html
@@ -106,7 +106,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad933c2426637cd4d7e3ba60
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to Quantized convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__quantized__convolution.html#ga3ecd6531da17e3aee500d445f2824687" title="Gets size in bytes of external temporary buffer required for Quantized convolution algorithm.">SimdSynetQuantizedConvolutionExternalBufferSize</a>, <a class="el" href="group__synet__quantized__convolution.html#ga90d4d31b8be9c685d8eddad6b2cf3527" title="Gets size of internal buffer used inside Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInternalBufferSize</a>, <a class="el" href="group__synet__quantized__convolution.html#gafcd47c51b810a4845eafb5f6ba4e2b2a" title="Gets description of internal implementation of Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInfo</a>, <a class="el" href="group__synet__quantized__convolution.html#ga6a8f99441e8bc8b9bdd3d561a9e08f35" title="Sets weights, biases, input/output parameters required for Quantized convolution algorithm.">SimdSynetQuantizedConvolutionSetParams</a> and <a class="el" href="group__synet__quantized__convolution.html#ga14bfb110331518424c64839040c55780" title="Performs forward propagation of Quantized convolution algorithm.">SimdSynetQuantizedConvolutionForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to Quantized convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__quantized__convolution.html#ga3ecd6531da17e3aee500d445f2824687" title="Gets size in bytes of external temporary buffer required for Quantized convolution algorithm.">SimdSynetQuantizedConvolutionExternalBufferSize</a>, <a class="el" href="group__synet__quantized__convolution.html#ga90d4d31b8be9c685d8eddad6b2cf3527" title="Gets size of internal buffer used inside Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInternalBufferSize</a>, <a class="el" href="group__synet__quantized__convolution.html#gafcd47c51b810a4845eafb5f6ba4e2b2a" title="Gets description of internal implementation of Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInfo</a>, <a class="el" href="group__synet__quantized__convolution.html#ga6a8f99441e8bc8b9bdd3d561a9e08f35" title="Sets weights, biases, input/output parameters required for Quantized convolution algorithm.">SimdSynetQuantizedConvolutionSetParams</a> and <a class="el" href="group__synet__quantized__convolution.html#ga14bfb110331518424c64839040c55780" title="Performs forward propagation of Quantized convolution algorithm.">SimdSynetQuantizedConvolutionForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -129,7 +129,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga3ecd6531da17e3aee500d44
 <p>Gets size in bytes of external temporary buffer required for Quantized convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -156,7 +156,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga90d4d31b8be9c685d8eddad
 <p>Gets size of internal buffer used inside Quantized convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -183,7 +183,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gafcd47c51b810a4845eafb5f
 <p>Gets description of internal implementation of Quantized convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -250,7 +250,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6a8f99441e8bc8b9bdd3d56
 <p>Sets weights, biases, input/output parameters required for Quantized convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">ioScale</td><td>- a pointer to 32-bit float point input/output tensors scales. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">ioZero</td><td>- a pointer to 8-bit unsigned integer input/output tensors zeros. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to 8-bit integer convolution weight. </td></tr>
@@ -304,7 +304,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga14bfb110331518424c64839
 <p>Performs forward propagation of Quantized convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized convolution context. It must be created by function <a class="el" href="group__synet__quantized__convolution.html#gad933c2426637cd4d7e3ba602d4ac25c5" title="Initializes Quantized convolution algorithm.">SimdSynetQuantizedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__quantized__convolution.html#ga3ecd6531da17e3aee500d445f2824687" title="Gets size in bytes of external temporary buffer required for Quantized convolution algorithm.">SimdSynetQuantizedConvolutionExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__quantized__inner__product.html b/docs/help/group__synet__quantized__inner__product.html
index 0288fd7331..df809ebae3 100644
--- a/docs/help/group__synet__quantized__inner__product.html
+++ b/docs/help/group__synet__quantized__inner__product.html
@@ -162,7 +162,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga5eaacdfff72d2a67646e497
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to quantized inner product context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__quantized__inner__product.html#gaaad6d0b33dc6faa232ae3d9260be4a55" title="Gets size in bytes of internal buffer used inside quantized inner product algorithm.">SimdSynetQuantizedInnerProductInternalBufferSize</a>, <a class="el" href="group__synet__quantized__inner__product.html#ga03e3f29b213d51a61819e285bb552d54" title="Gets size in bytes of external buffer used in quantized inner product algorithm.">SimdSynetQuantizedInnerProductExternalBufferSize</a>, <a class="el" href="group__synet__quantized__inner__product.html#gadaa204e5e49cc1d942f70bfc6df71273" title="Gets string with description of internal implementation of quantized inner product algorithm.">SimdSynetQuantizedInnerProductInfo</a>, <a class="el" href="group__synet__quantized__inner__product.html#ga608af4e56e178ddcfb18b3133c5c6f66" title="Sets weights, biases, input/output parameters required for quantized inner product algorithm.">SimdSynetQuantizedInnerProductSetParams</a> and <a class="el" href="group__synet__quantized__inner__product.html#gae8d1fb465fcedf1474a6f1b7df157874" title="Performs forward propagation of quantized inner product algorithm.">SimdSynetQuantizedInnerProductForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to quantized inner product context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__quantized__inner__product.html#gaaad6d0b33dc6faa232ae3d9260be4a55" title="Gets size in bytes of internal buffer used inside quantized inner product algorithm.">SimdSynetQuantizedInnerProductInternalBufferSize</a>, <a class="el" href="group__synet__quantized__inner__product.html#ga03e3f29b213d51a61819e285bb552d54" title="Gets size in bytes of external buffer used in quantized inner product algorithm.">SimdSynetQuantizedInnerProductExternalBufferSize</a>, <a class="el" href="group__synet__quantized__inner__product.html#gadaa204e5e49cc1d942f70bfc6df71273" title="Gets string with description of internal implementation of quantized inner product algorithm.">SimdSynetQuantizedInnerProductInfo</a>, <a class="el" href="group__synet__quantized__inner__product.html#ga608af4e56e178ddcfb18b3133c5c6f66" title="Sets weights, biases, input/output parameters required for quantized inner product algorithm.">SimdSynetQuantizedInnerProductSetParams</a> and <a class="el" href="group__synet__quantized__inner__product.html#gae8d1fb465fcedf1474a6f1b7df157874" title="Performs forward propagation of quantized inner product algorithm.">SimdSynetQuantizedInnerProductForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -185,7 +185,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaaad6d0b33dc6faa232ae3d9
 <p>Gets size in bytes of internal buffer used inside quantized inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -212,7 +212,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga03e3f29b213d51a61819e28
 <p>Gets size in bytes of external buffer used in quantized inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -239,7 +239,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadaa204e5e49cc1d942f70bf
 <p>Gets string with description of internal implementation of quantized inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -312,7 +312,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga608af4e56e178ddcfb18b31
 <p>Sets weights, biases, input/output parameters required for quantized inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">aScale</td><td>- a pointer to 32-bit float point input A tensor scale. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">aZero</td><td>- a pointer to 8-bit unsigned integer input A tensor zero. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">b</td><td>- a pointer to 8-bit integer input B tensor. Can be NULL. </td></tr>
@@ -373,7 +373,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae8d1fb465fcedf1474a6f1b
 <p>Performs forward propagation of quantized inner product algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to quantized inner product context. It must be created by function <a class="el" href="group__synet__quantized__inner__product.html#ga5eaacdfff72d2a67646e497e6f4901aa" title="Initializes quantized inner product (matrix multiplication) algorithm.">SimdSynetQuantizedInnerProductInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">A</td><td>- a pointer to A matrix. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">B</td><td>- a pointer to B matrix. Can be NULL if B is constant matrix. In that case you have to set B in function SimdSynetQuantizedInnerProductSetParams. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__quantized__inner__product.html#ga03e3f29b213d51a61819e285bb552d54" title="Gets size in bytes of external buffer used in quantized inner product algorithm.">SimdSynetQuantizedInnerProductExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
diff --git a/docs/help/group__synet__quantized__merged__convolution.html b/docs/help/group__synet__quantized__merged__convolution.html
index 269bf31a86..2730802ca8 100644
--- a/docs/help/group__synet__quantized__merged__convolution.html
+++ b/docs/help/group__synet__quantized__merged__convolution.html
@@ -120,7 +120,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa9f441e637281d082be3f12
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to Quantized merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__quantized__merged__convolution.html#gae670512bddad450107bd87e79f78754d" title="Gets size of external temporary buffer required for Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionExternalBufferSize</a>, <a class="el" href="group__synet__quantized__merged__convolution.html#gac5556ea3cad19429fade8737bced72c4" title="Gets size of internal buffer used inside Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInternalBufferSize</a>, <a class="el" href="group__synet__quantized__merged__convolution.html#gacd95d7cb5de9dbd5f97e3cac0277020c" title="Gets description of internal implementation of Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInfo</a>, <a class="el" href="group__synet__quantized__merged__convolution.html#ga23e272bae01fa4e382c3a57c8e78b013" title="Sets weights, biases, input/output parameters required for Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionSetParams</a> and <a class="el" href="group__synet__quantized__merged__convolution.html#ga638e67b2bff56e4523dd252c131ea2b6" title="Performs forward propagation of Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to Quantized merged convolution context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__quantized__merged__convolution.html#gae670512bddad450107bd87e79f78754d" title="Gets size of external temporary buffer required for Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionExternalBufferSize</a>, <a class="el" href="group__synet__quantized__merged__convolution.html#gac5556ea3cad19429fade8737bced72c4" title="Gets size of internal buffer used inside Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInternalBufferSize</a>, <a class="el" href="group__synet__quantized__merged__convolution.html#gacd95d7cb5de9dbd5f97e3cac0277020c" title="Gets description of internal implementation of Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInfo</a>, <a class="el" href="group__synet__quantized__merged__convolution.html#ga23e272bae01fa4e382c3a57c8e78b013" title="Sets weights, biases, input/output parameters required for Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionSetParams</a> and <a class="el" href="group__synet__quantized__merged__convolution.html#ga638e67b2bff56e4523dd252c131ea2b6" title="Performs forward propagation of Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -143,7 +143,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gae670512bddad450107bd87e
 <p>Gets size of external temporary buffer required for Quantized merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -170,7 +170,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac5556ea3cad19429fade873
 <p>Gets size of internal buffer used inside Quantized merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -197,7 +197,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gacd95d7cb5de9dbd5f97e3ca
 <p>Gets description of internal implementation of Quantized merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -258,7 +258,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga23e272bae01fa4e382c3a57
 <p>Sets weights, biases, input/output parameters required for Quantized merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">ioScale</td><td>- a pointer to 32-bit float point input/output tensors scales. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">ioZero</td><td>- a pointer to 8-bit unsigned integer input/output tensors zeros. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">weight</td><td>- a pointer to 8-bit integer convolution weights. </td></tr>
@@ -311,7 +311,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga638e67b2bff56e4523dd252
 <p>Performs forward propagation of Quantized merged convolution algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to Quantized merged convolution context. It must be created by function <a class="el" href="group__synet__quantized__merged__convolution.html#gaa9f441e637281d082be3f121f6364a0b" title="Initializes Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">buf</td><td>- a pointer to external temporary buffer. The size of the external temporary buffer is determined by function <a class="el" href="group__synet__quantized__merged__convolution.html#gae670512bddad450107bd87e79f78754d" title="Gets size of external temporary buffer required for Quantized merged convolution algorithm.">SimdSynetQuantizedMergedConvolutionExternalBufferSize</a>. Can be NULL (it causes usage of internal buffer). </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
diff --git a/docs/help/group__synet__scale.html b/docs/help/group__synet__scale.html
index 4ebd36a39f..d8632881f6 100644
--- a/docs/help/group__synet__scale.html
+++ b/docs/help/group__synet__scale.html
@@ -144,7 +144,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf07d21983b8c1814f0937a7
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to scale context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in function <a class="el" href="group__synet__scale.html#ga7cc60002266247d22ce3615fe3f92111" title="Performs forward propagation of BF16 scale algorithm.">SimdSynetScale16bForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to scale context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in function <a class="el" href="group__synet__scale.html#ga7cc60002266247d22ce3615fe3f92111" title="Performs forward propagation of BF16 scale algorithm.">SimdSynetScale16bForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -195,7 +195,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga7cc60002266247d22ce3615
 <p>Performs forward propagation of BF16 scale algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to scale context. It must be created by function <a class="el" href="group__synet__scale.html#gaf07d21983b8c1814f0937a7c026f3914" title="Initializes BF16 scale algorithm.">SimdSynetScale16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to scale context. It must be created by function <a class="el" href="group__synet__scale.html#gaf07d21983b8c1814f0937a7c026f3914" title="Initializes BF16 scale algorithm.">SimdSynetScale16bInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">norm</td><td>- a pointer to FP32 array with scale coefficients. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to FP32 array with shift coefficients. Can be NULL. </td></tr>
@@ -366,7 +366,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga83d8a5a0e0fea89e5ef3793
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to INT8 scale context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__scale.html#ga6fec2ad288df0b3db9dde198fd7f7909" title="Gets size of internal buffer used inside INT8 scale algorithm.">SimdSynetScale8iInternalBufferSize</a>, <a class="el" href="group__synet__scale.html#ga2c2961b109c7a5f28c5ac353c3d47a45" title="Sets scale, bias, parameters of activation function, input/output tensor statistics required for INT8...">SimdSynetScale8iSetParams</a> and <a class="el" href="group__synet__scale.html#ga946025524abc3a4fa3bf00d4daca866e" title="Performs forward propagation of INT8 scale algorithm.">SimdSynetScale8iForward</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to INT8 scale context. On error it returns NULL. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. This pointer is used in functions <a class="el" href="group__synet__scale.html#ga6fec2ad288df0b3db9dde198fd7f7909" title="Gets size of internal buffer used inside INT8 scale algorithm.">SimdSynetScale8iInternalBufferSize</a>, <a class="el" href="group__synet__scale.html#ga2c2961b109c7a5f28c5ac353c3d47a45" title="Sets scale, bias, parameters of activation function, input/output tensor statistics required for INT8...">SimdSynetScale8iSetParams</a> and <a class="el" href="group__synet__scale.html#ga946025524abc3a4fa3bf00d4daca866e" title="Performs forward propagation of INT8 scale algorithm.">SimdSynetScale8iForward</a>. </dd></dl>
 
 </div>
 </div>
@@ -389,7 +389,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6fec2ad288df0b3db9dde19
 <p>Gets size of internal buffer used inside INT8 scale algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 scale context. It must be created by function <a class="el" href="group__synet__scale.html#ga83d8a5a0e0fea89e5ef3793bf92be9e5" title="Initializes INT8 scale algorithm.">SimdSynetScale8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 scale context. It must be created by function <a class="el" href="group__synet__scale.html#ga83d8a5a0e0fea89e5ef3793bf92be9e5" title="Initializes INT8 scale algorithm.">SimdSynetScale8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
   </table>
   </dd>
 </dl>
@@ -438,7 +438,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga2c2961b109c7a5f28c5ac35
 <p>Sets scale, bias, parameters of activation function, input/output tensor statistics required for INT8 scale algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__scale.html#ga83d8a5a0e0fea89e5ef3793bf92be9e5" title="Initializes INT8 scale algorithm.">SimdSynetScale8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in,out]</td><td class="paramname">context</td><td>- a pointer to INT8 convolution context. It must be created by function <a class="el" href="group__synet__scale.html#ga83d8a5a0e0fea89e5ef3793bf92be9e5" title="Initializes INT8 scale algorithm.">SimdSynetScale8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">scale</td><td>- a pointer to original (32-bit float point) scale. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">bias</td><td>- a pointer to original (32-bit float point) bias. Can be NULL. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">stats</td><td>- a pointer to pointers with statistics of input(min - stats[0], max - stats[1]) and output(min - stats[2], max - stats[3]) tensors. Can be NULL for subsequent calls of this function. </td></tr>
@@ -483,7 +483,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga946025524abc3a4fa3bf00d
 <p>Performs forward propagation of INT8 scale algorithm. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 scale context. It must be created by function <a class="el" href="group__synet__scale.html#ga83d8a5a0e0fea89e5ef3793bf92be9e5" title="Initializes INT8 scale algorithm.">SimdSynetScale8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a pointer to INT8 scale context. It must be created by function <a class="el" href="group__synet__scale.html#ga83d8a5a0e0fea89e5ef3793bf92be9e5" title="Initializes INT8 scale algorithm.">SimdSynetScale8iInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to input tensor. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to output tensor. </td></tr>
   </table>
diff --git a/docs/help/group__warp__affine.html b/docs/help/group__warp__affine.html
index bc08e16fc7..cd3a67230b 100644
--- a/docs/help/group__warp__affine.html
+++ b/docs/help/group__warp__affine.html
@@ -228,7 +228,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga190e3594adfa696ac4c7586
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to warp affine context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__warp__affine.html#gaa1b41e25cb57778b2320bbf6943a8dcc" title="Performs warp affine for current image.">SimdWarpAffineRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to warp affine context. On error it returns NULL. This pointer is used in functions <a class="el" href="group__warp__affine.html#gaa1b41e25cb57778b2320bbf6943a8dcc" title="Performs warp affine for current image.">SimdWarpAffineRun</a>. It must be released with using of function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </dd></dl>
 
 </div>
 </div>
@@ -268,7 +268,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaa1b41e25cb57778b2320bbf
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper <a class="el" href="group__warp__affine.html#ga03ebb0fcd0f0bb257db1662fd30d7333" title="Performs warp affine for current image.">Simd::WarpAffine</a>(const View&amp; src, const float * mat, View&amp; dst, SimdWarpAffineFlags flags = SimdWarpAffineInterpBilinear | SimdWarpAffineBorderConstant, const uint8_t* border = NULL).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a warp affine context. It must be created by function <a class="el" href="group__warp__affine.html#ga190e3594adfa696ac4c75863dbd04c4f" title="Creates wrap affine context.">SimdWarpAffineInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Releases context created with using of Simd Library API.">SimdRelease</a>. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">context</td><td>- a warp affine context. It must be created by function <a class="el" href="group__warp__affine.html#ga190e3594adfa696ac4c75863dbd04c4f" title="Creates wrap affine context.">SimdWarpAffineInit</a> and released by function <a class="el" href="group__memory.html#ga4e38dedd9f946265c9762858a71aa4cf" title="Destroys an opaque context object created by the Simd Library API.">SimdRelease</a>. </td></tr>
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to pixels data of the original input image. </td></tr>
     <tr><td class="paramdir">[out]</td><td class="paramname">dst</td><td>- a pointer to pixels data of the filtered output image. </td></tr>
   </table>
diff --git a/docs/help/index.html b/docs/help/index.html
index bf3ea684f4..5d9f2d4a19 100644
--- a/docs/help/index.html
+++ b/docs/help/index.html
@@ -45,7 +45,7 @@ <h1>Simd Library Documentation.</h1>
 <div class="textblock"><h1><a class="anchor" id="s0"></a>
 Introduction</h1>
 <p >The <b>Simd Library</b> is a free open source image processing library and machine learning, designed for C and C++ programmers. It provides many useful high performance algorithms for image processing and machine learning such as: pixel format conversion, image scaling and filtration, extraction of statistic information from images, motion detection, object detection and classification, neural network.</p>
-<p >The algorithms are optimized with using of different SIMD CPU extensions. In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM.</p>
+<p >The algorithms are optimized with using of different SIMD CPU extensions. In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon.</p>
 <p >The Simd Library has C API and also contains useful C++ classes and functions to facilitate access to C API. The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, MSVS, G++ and Clang compilers, MSVS project and CMake build systems.</p>
 <h1><a class="anchor" id="s1"></a>
 Library folder's structure</h1>
@@ -94,6 +94,7 @@ <h1><a class="anchor" id="s2"></a>
 <li><code>SIMD_AVX512</code> - Enable of AVX-512 (AVX-512F, AVX-512CD, AVX-512VL, AVX-512DQ, AVX-512BW) CPU extensions. It is switched on by default.</li>
 <li><code>SIMD_AVX512VNNI</code> - Enable of AVX-512-VNNI CPU extensions. It is switched on by default.</li>
 <li><code>SIMD_AMXBF16</code> - Enable of AMX-BF16, AMX-INT8 and AVX-512-BF16 CPU extensions. It is switched off by default.</li>
+<li><code>SIMD_SVE</code> - Enable of SVE CPU extension. It is switched off by default.</li>
 <li><code>SIMD_TEST</code> - Build test framework. It is switched on by default.</li>
 <li><code>SIMD_INFO</code> - Print build information. It is switched on by default.</li>
 <li><code>SIMD_PERF</code> - Enable of internal performance statistic. It is switched off by default.</li>
diff --git a/docs/help/struct_simd_1_1_view.html b/docs/help/struct_simd_1_1_view.html
index 2ed1b35bde..2762b3820a 100644
--- a/docs/help/struct_simd_1_1_view.html
+++ b/docs/help/struct_simd_1_1_view.html
@@ -1827,7 +1827,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#a5741a1f89a39e5245f9f397b
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>- a released pointer to pixel data. It must be deleted by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees aligned memory block.">SimdFree</a>. </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>- a released pointer to pixel data. It must be deleted by function <a class="el" href="group__memory.html#ga58c32d1470db0a1a698abd8280aeee52" title="Frees an aligned memory block previously allocated by SimdAllocate.">SimdFree</a>. </dd></dl>
 
 </div>
 </div>
diff --git a/docs/index.html b/docs/index.html
index c28c5f7ec9..0104f85d91 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -26,7 +26,7 @@ <h3>Description</h3>
 pixel format conversion, image scaling and filtration, extraction of statistic information from images, motion detection,
 object detection and classification, neural network.</p>
 <p>The algorithms are optimized with using of different SIMD CPU extensions. 
-In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM, HVX for Hexagon.</p>
+In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon.</p>
 <p>The Simd Library has C API and also contains useful C++ and Python wrapper classes and functions to facilitate access to C API. 
 The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, 
 MSVS, G++ and Clang compilers, MSVS project and CMake build systems.</p>
diff --git a/prj/txt/DoxygenOverview.txt b/prj/txt/DoxygenOverview.txt
index 9a2c44c3ec..d7330a507f 100644
--- a/prj/txt/DoxygenOverview.txt
+++ b/prj/txt/DoxygenOverview.txt
@@ -8,7 +8,7 @@
     extraction of statistic information from images, motion detection, object detection and classification, neural network.
     
     The algorithms are optimized with using of different SIMD CPU extensions. 
-    In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON for ARM.
+    In particular the library supports following CPU extensions: SSE, AVX, AVX-512 and AMX for x86/x64, NEON, SVE for ARM, HVX for Hexagon.
     
     The %Simd Library has C API and also contains useful C++ classes and functions to facilitate access to C API.
     The library supports dynamic and static linking, 32-bit and 64-bit Windows and Linux, MSVS, G++ and Clang compilers, MSVS project and CMake build systems.
@@ -82,6 +82,7 @@
      - `SIMD_AVX512` - Enable of AVX-512 (AVX-512F, AVX-512CD, AVX-512VL, AVX-512DQ, AVX-512BW) CPU extensions. It is switched on by default.
      - `SIMD_AVX512VNNI` - Enable of AVX-512-VNNI CPU extensions. It is switched on by default.
      - `SIMD_AMXBF16` - Enable of AMX-BF16, AMX-INT8 and AVX-512-BF16 CPU extensions. It is switched off by default.
+     - `SIMD_SVE` - Enable of SVE CPU extension. It is switched off by default.
      - `SIMD_TEST` - Build test framework. It is switched on by default.
      - `SIMD_INFO` - Print build information. It is switched on by default.
      - `SIMD_PERF` - Enable of internal performance statistic. It is switched off by default.
diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index b889e52df0..1357a6eb55 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -1002,11 +1002,12 @@ extern "C"
 
         The value is determined once at library initialization time by probing the active SIMD extensions
         and is constant for the lifetime of the process:
+        - \b 128 bytes — HVX (Qualcomm Hexagon)
         - \b 64 bytes — AVX-512 (x86, when either AVX-512BW or AVX-512VNNI is available)
         - \b 32 bytes — AVX2 (x86)
         - \b 16 bytes — SSE4.1 (x86) or NEON (ARM)
-        - <b>sizeof(HVX_Vector)</b> — HVX (Qualcomm Hexagon)
         - <b>sizeof(void*)</b> — scalar fallback (no SIMD extensions detected)
+        - \b SVE vector size for current CPU in bytes — when SVE is available.
 
         The returned value is always a power of two and equals the value of the \c SIMD_ALIGN compile-time
         constant used internally by the library.

From 55792300be79e0ea5ded36fd616f34de1d6343f9 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 5 May 2026 17:38:33 +0300
Subject: [PATCH 16/32] +add SVE optimizations of function
 BackgroundGrowRangeFast.

---
 docs/2026.html                  |  1 +
 src/Simd/SimdLib.cpp            |  5 +++++
 src/Simd/SimdLib.h              | 12 ++++++++++++
 src/Simd/SimdSve1.h             |  3 +++
 src/Simd/SimdSve1Background.cpp | 31 +++++++++++++++++++++++++++++++
 src/Test/TestBackground.cpp     |  6 ++++++
 6 files changed, 58 insertions(+)

diff --git a/docs/2026.html b/docs/2026.html
index 76f760826e..6e4767bbd5 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -46,6 +46,7 @@ <h5>New features</h5>
  <li>SVE optimizations of function AbsDifferenceSums3x3.</li>
  <li>SVE optimizations of function AbsDifferenceSums3x3Masked.</li>
  <li>SVE optimizations of function BackgroundGrowRangeSlow.</li>
+ <li>SVE optimizations of function BackgroundGrowRangeFast.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index d7e8c70752..38e12f405d 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -754,6 +754,11 @@ SIMD_API void SimdBackgroundGrowRangeFast(const uint8_t * value, size_t valueStr
         Sse41::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
         Neon::BackgroundGrowRangeFast(value, valueStride, width, height, lo, loStride, hi, hiStride);
diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index 1357a6eb55..30616f1ac0 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -81,6 +81,14 @@ typedef unsigned __int64  uint64_t;
 #if _MSVC_LANG >= 201703L
 #define SIMD_CPP_2017_ENABLE
 #endif
+
+#if _MSVC_LANG >= 202002L
+#define SIMD_CPP_2020_ENABLE
+#endif
+
+#if _MSVC_LANG >= 202302L
+#define SIMD_CPP_2023_ENABLE
+#endif
 #else
 #if __cplusplus >= 201103L
 #define SIMD_CPP_2011_ENABLE
@@ -97,6 +105,10 @@ typedef unsigned __int64  uint64_t;
 #if __cplusplus >= 202002L
 #define SIMD_CPP_2020_ENABLE
 #endif
+
+#if __cplusplus >= 202302L
+#define SIMD_CPP_2023_ENABLE
+#endif
 #endif
 
 #if defined(SIMD_CPP_2020_ENABLE)
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index a1133666de..853f9b4897 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -51,6 +51,9 @@ namespace Simd
         void BackgroundGrowRangeSlow(const uint8_t* value, size_t valueStride, size_t width, size_t height,
             uint8_t* lo, size_t loStride, uint8_t* hi, size_t hiStride);
 
+        void BackgroundGrowRangeFast(const uint8_t* value, size_t valueStride, size_t width, size_t height,
+            uint8_t* lo, size_t loStride, uint8_t* hi, size_t hiStride);
+
         void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
 
         void DeinterleaveUv(const uint8_t* uv, size_t uvStride, size_t width, size_t height, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);
diff --git a/src/Simd/SimdSve1Background.cpp b/src/Simd/SimdSve1Background.cpp
index 5ffafb7ff0..a2bb2d5077 100644
--- a/src/Simd/SimdSve1Background.cpp
+++ b/src/Simd/SimdSve1Background.cpp
@@ -61,6 +61,37 @@ namespace Simd
                 hi += hiStride;
             }
         }
+
+        //--------------------------------------------------------------------------------------------------
+
+        SIMD_INLINE void BackgroundGrowRangeFast(const uint8_t* value, uint8_t* lo, uint8_t* hi, const svbool_t& mask)
+        {
+            svuint8_t _value = svld1_u8(mask, value);
+            svuint8_t _lo = svld1_u8(mask, lo);
+            svuint8_t _hi = svld1_u8(mask, hi);
+
+            svst1_u8(mask, lo, svmin_u8_x(mask, _lo, _value));
+            svst1_u8(mask, hi, svmax_u8_x(mask, _hi, _value));
+        }
+
+        void BackgroundGrowRangeFast(const uint8_t* value, size_t valueStride, size_t width, size_t height, uint8_t* lo, size_t loStride, uint8_t* hi, size_t hiStride)
+        {
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svwhilelt_b8(size_t(0), A);
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+            for (size_t row = 0; row < height; ++row)
+            {
+                size_t col = 0;
+                for (; col < widthA; col += A)
+                    BackgroundGrowRangeFast(value + col, lo + col, hi + col, body);
+                if (widthA < width)
+                    BackgroundGrowRangeFast(value + col, lo + col, hi + col, tail);
+                value += valueStride;
+                lo += loStride;
+                hi += hiStride;
+            }
+        }
     }
 #endif
 }
diff --git a/src/Test/TestBackground.cpp b/src/Test/TestBackground.cpp
index 51d9351f56..60ea6cebb8 100644
--- a/src/Test/TestBackground.cpp
+++ b/src/Test/TestBackground.cpp
@@ -498,6 +498,12 @@ namespace Test
         if (Simd::Neon::Enable && TestNeon(options) && W >= Simd::Neon::A)
             result = result && BackgroundChangeRangeAutoTest(FUNC1(Simd::Neon::BackgroundGrowRangeFast), FUNC1(SimdBackgroundGrowRangeFast));
 #endif
+
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && BackgroundChangeRangeAutoTest(FUNC1(Simd::Sve::BackgroundGrowRangeFast), FUNC1(SimdBackgroundGrowRangeFast));
+#endif
+
         return result;
     }
 

From 1a0a8674f3dc178c2af9dcb68c3d7b8ac9c26959 Mon Sep 17 00:00:00 2001
From: Centimo <Centimo@users.noreply.github.com>
Date: Mon, 18 May 2026 03:36:48 +0300
Subject: [PATCH 17/32] Add View::Copy and Frame::Copy returning by value

Add new Copy() overloads (no args and with Rectangle) returning a new
View/Frame by value, alongside the existing pointer-returning Clone().
---
 src/Simd/SimdFrame.hpp | 27 +++++++++++++++++++++++++++
 src/Simd/SimdView.hpp  | 29 +++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/src/Simd/SimdFrame.hpp b/src/Simd/SimdFrame.hpp
index a706b04c77..af84a19d04 100644
--- a/src/Simd/SimdFrame.hpp
+++ b/src/Simd/SimdFrame.hpp
@@ -200,6 +200,21 @@ namespace Simd
         */
         Frame * Clone(Frame & buffer) const;
 
+        /*!
+            Gets a copy of current frame by value.
+
+            \return a new Frame structure containing a copy of the frame.
+        */
+        Frame Copy() const;
+
+        /*!
+            Gets a copy of region of current frame bounded by the rectangle with specified coordinates, by value.
+
+            \param [in] rect - a rectangle which bounds the region.
+            \return a new Frame structure containing a copy of the region.
+        */
+        Frame Copy(const Rectangle<ptrdiff_t>& rect) const;
+
         /*!
             Creates reference to other Frame structure.
 
@@ -680,6 +695,18 @@ namespace Simd
         return clone;
     }
 
+    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Copy() const
+    {
+        Frame<A> copy(width, height, format, flipped, timestamp, yuvType);
+        Simd::Copy(*this, copy);
+        return copy;
+    }
+
+    template <template<class> class A> SIMD_INLINE Frame<A> Frame<A>::Copy(const Rectangle<ptrdiff_t>& rect) const
+    {
+        return Region(rect).Copy();
+    }
+
     template <template<class> class A> SIMD_INLINE Frame<A> & Frame<A>::operator = (const Frame<A> & frame)
     {
         if (this != &frame)
diff --git a/src/Simd/SimdView.hpp b/src/Simd/SimdView.hpp
index d594371846..04c90a3b16 100644
--- a/src/Simd/SimdView.hpp
+++ b/src/Simd/SimdView.hpp
@@ -266,6 +266,21 @@ namespace Simd
         */
         View * Clone(View & buffer) const;
 
+        /*!
+            Gets a copy of current image view by value.
+
+            \return a new View structure containing a copy of the image.
+        */
+        View Copy() const;
+
+        /*!
+            Gets a copy of region of current image view bounded by the rectangle with specified coordinates, by value.
+
+            \param [in] rect - a rectangle which bounds the region.
+            \return a new View structure containing a copy of the region.
+        */
+        View Copy(const Rectangle<ptrdiff_t>& rect) const;
+
         /*!
             Creates view which references to other View structure.
 
@@ -940,6 +955,20 @@ namespace Simd
         return view;
     }
 
+    template <template<class> class A> SIMD_INLINE View<A> View<A>::Copy() const
+    {
+        View<A> view(width, height, format);
+        size_t size = width*PixelSize();
+        for (size_t row = 0; row < height; ++row)
+            memcpy(view.data + view.stride*row, data + stride*row, size);
+        return view;
+    }
+
+    template <template<class> class A> SIMD_INLINE View<A> View<A>::Copy(const Rectangle<ptrdiff_t>& rect) const
+    {
+        return Region(rect).Copy();
+    }
+
     /*! \cond */
     template <template<class> class A> SIMD_INLINE View<A> & View<A>::operator = (const View<A> & view)
     {

From fe182203e12fc85f353d462cecf0277dce3d670b Mon Sep 17 00:00:00 2001
From: Centimo <Centimo@users.noreply.github.com>
Date: Mon, 18 May 2026 03:47:08 +0300
Subject: [PATCH 18/32] gitignore: ignore .idea/ and cmake-build-*/

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 574065d038..3e475d4df9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 /build
 /prj/txt/FullVersion.txt
 /src/Simd/SimdVersion.h
+.idea/
+cmake-build-*/

From d7706f17825e70c2a7ea3804029f4b22a2d568b9 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Wed, 20 May 2026 16:42:18 +0300
Subject: [PATCH 19/32] +add declaration of class
 SynetConvolution16bNhwcSpecV3.

---
 docs/2026.html                       |  2 ++
 src/Simd/SimdSynetConvolution16b.h   | 45 ++++++++++++++++++++++++++++
 src/Test/TestSynetConvolution16b.cpp | 30 +++++++++----------
 3 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/docs/2026.html b/docs/2026.html
index 6e4767bbd5..ce089c4482 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -47,6 +47,8 @@ <h5>New features</h5>
  <li>SVE optimizations of function AbsDifferenceSums3x3Masked.</li>
  <li>SVE optimizations of function BackgroundGrowRangeSlow.</li>
  <li>SVE optimizations of function BackgroundGrowRangeFast.</li>
+ <li>Method View::Copy.</li>
+ <li>Method Frame::Copy.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h
index 1a99d24773..1fe2b46321 100644
--- a/src/Simd/SimdSynetConvolution16b.h
+++ b/src/Simd/SimdSynetConvolution16b.h
@@ -377,6 +377,51 @@ namespace Simd
 
         //-------------------------------------------------------------------------------------------------
 
+        class SynetConvolution16bNhwcSpecV3 : public SynetConvolution16b
+        {
+        public:
+            SynetConvolution16bNhwcSpecV3(const ConvParam& p);
+            virtual String Ext() const { return "Base"; }
+            virtual String Desc() const;
+            virtual size_t ExternalBufferSize() const;
+            virtual void SetParams(const float* weight, const float* bias, const float* params);
+            virtual void Forward(const uint8_t* src, uint8_t* buf, uint8_t* dst);
+
+            static bool Preferable(const ConvParam& p);
+
+            struct AlgParam
+            {
+                size_t F, microD, microS, microC;
+                size_t batch, srcC, srcH, srcW, dstC, K;
+                size_t padV, padH, padE, gapV, gapH, kA;
+                size_t macroD, macroH, macroC;
+                size_t bufS, bufD, elem;
+            };
+
+            typedef void(*PreprocessPtr)(const uint8_t* src, const ConvParam& p, const AlgParam& a, size_t dyBeg, size_t dyEnd, int end, uint16_t* dst);
+
+            typedef void(*BodyConvPtr)(const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* srcOffs,
+                size_t dstC, size_t dstS, size_t nK, int zero, const uint16_t* weight, float* sum);
+
+            typedef void(*LastConvPtr)(const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* srcOffs, size_t dstC, size_t dstS, size_t nK, int zero,
+                const uint16_t* weight, float* sum, const float* bias, const float* params, const int* dstMask, const int* dstOffs, uint8_t* dst);
+
+        protected:
+            void SetAlgParam(size_t F, size_t microD, size_t microS, size_t microC, size_t L1, size_t L2, size_t L3);
+            virtual void SetWeight(const float* weight);
+
+            void ForwardSingle(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst);
+            void ForwardBatch(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst);
+
+            AlgParam _alg;
+            Array32i _srcOffs, _dstMask, _nK, _maBufOffs, _maSumOffs, _miDstOffs;
+            PreprocessPtr _preprocess;
+            BodyConvPtr _bodyConv;
+            LastConvPtr _lastConv;
+        };
+
+        //-------------------------------------------------------------------------------------------------
+
         class SynetConvolution16bNhwcDepthwise : public SynetConvolution16b
         {
         public:
diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp
index 5fcafb58e1..0cd899620b 100644
--- a/src/Test/TestSynetConvolution16b.cpp
+++ b/src/Test/TestSynetConvolution16b.cpp
@@ -364,21 +364,6 @@ namespace Test
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 999, 6, 6, 999, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 125, 116, 116, 125, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
 #endif
-#if 0
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-#endif
 #if 0
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 321, 321, 16, _2, _1, _1, _0, _0, 1, aId, tT, b16, f32), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 16, 320, 320, 32, _2, _1, _1, _0, _1, 1, aId, tT, b16, f32), c, f1, f2);
@@ -564,6 +549,21 @@ namespace Test
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 96, 96, 96, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2);
 #endif
 #if 1
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+#endif
+#if 0
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _1, _1, _2, _0, _0, 1, aId, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _1, _1, _2, _0, _0, 1, aId, tT, b16, b16), c, f1, f2);

From 8ea37f38a01888d0933c8847c4916ce3e1c053f8 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Wed, 20 May 2026 17:22:13 +0300
Subject: [PATCH 20/32] +add Base implementation of class
 SynetConvolution16bNhwcSpecV3.

---
 docs/2026.html                                |   1 +
 prj/vs2022/Base.vcxproj                       |   1 +
 prj/vs2022/Base.vcxproj.filters               |   3 +
 .../SimdBaseSynetConvolution16bNhwcSpecV3.cpp | 314 ++++++++++++++++++
 src/Simd/SimdSynetConvolution16b.h            |   2 +-
 5 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp

diff --git a/docs/2026.html b/docs/2026.html
index ce089c4482..fb1b130286 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -49,6 +49,7 @@ <h5>New features</h5>
  <li>SVE optimizations of function BackgroundGrowRangeFast.</li>
  <li>Method View::Copy.</li>
  <li>Method Frame::Copy.</li>
+ <li>Base implementation of class SynetConvolution16bNhwcSpecV3.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
diff --git a/prj/vs2022/Base.vcxproj b/prj/vs2022/Base.vcxproj
index 5cc450ff14..4ef731618e 100644
--- a/prj/vs2022/Base.vcxproj
+++ b/prj/vs2022/Base.vcxproj
@@ -226,6 +226,7 @@
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution16bNhwcSpecV0.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution16bNhwcSpecV1.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution16bNhwcSpecV2.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution16bNhwcSpecV3.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution32f.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution32fDirectNchw.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution32fGemm.cpp" />
diff --git a/prj/vs2022/Base.vcxproj.filters b/prj/vs2022/Base.vcxproj.filters
index 4da71c92cf..1bf70407f7 100644
--- a/prj/vs2022/Base.vcxproj.filters
+++ b/prj/vs2022/Base.vcxproj.filters
@@ -472,6 +472,9 @@
     <ClCompile Include="..\..\src\Simd\SimdBaseSynetScale16b.cpp">
       <Filter>Base\Synet\Scale</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdBaseSynetConvolution16bNhwcSpecV3.cpp">
+      <Filter>Base\Synet\Convolution</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\Simd\SimdBase.h">
diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
new file mode 100644
index 0000000000..79c1cf8c2d
--- /dev/null
+++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
@@ -0,0 +1,314 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2026 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdSynetConvolution16b.h"
+#include "Simd/SimdSynetConvolution32f.h"
+#include "Simd/SimdSynetConvolution32fCommon.h"
+#include "Simd/SimdSynet.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdBFloat16.h"
+#include "Simd/SimdAlignment.h"
+#include "Simd/SimdCpu.h"
+
+namespace Simd
+{
+#if defined(SIMD_SYNET_ENABLE)
+    namespace Base
+    {
+        SynetConvolution16bNhwcSpecV3::SynetConvolution16bNhwcSpecV3(const ConvParam& p)
+            : SynetConvolution16b(p)
+        {
+            _preprocess = 0;
+            _bodyConv = 0;
+            _lastConv = 0;
+        }
+
+        String SynetConvolution16bNhwcSpecV3::Desc() const
+        {
+            std::stringstream desc;
+            desc << Ext() << "::NhwcSpecV3";
+            if (_alg.batch > 1)
+                desc << "-" << _alg.batch;
+            return desc.str();
+        }
+
+        void SynetConvolution16bNhwcSpecV3::SetAlgParam()
+        {
+            const ConvParam& p = _param;
+            AlgParam& a = _alg;
+
+            int L1 = int(Base::AlgCacheL1() * (p.IsKernel(5) ? 1.05 : 1.00)), L2 = int(Base::AlgCacheL2() * 0.5), L3 = int(Base::AlgCacheL3());
+
+            a.F = 16;
+            a.microD = 32;
+            a.microS = 32;
+            a.microC = 32;
+            a.srcC = AlignHi(p.srcC, a.microC);
+            a.padV = Simd::Max(p.padY, p.padH);
+            a.padH = Simd::Max(p.padX, p.padW);
+            a.srcH = p.srcH + a.padV;
+            a.srcW = p.srcW + a.padH;
+            a.gapV = a.srcH - p.dstH;
+            a.gapH = a.srcW - p.dstW;
+            a.dstC = AlignHi(p.dstC, a.microD);
+            a.kA = p.kernelX * p.kernelY;
+            a.K = a.srcC * a.kA;
+            a.padE = a.srcW * a.padV + a.padH * Simd::Max<size_t>(1, a.padV);
+
+            a.macroC = Simd::RestrictRange(AlignLo(L1 / a.microD / a.kA / 2, a.microC), a.microC, a.srcC);
+            a.batch = 1;
+            size_t bufSize = a.srcC * a.srcH * a.srcW * 2;
+            if (bufSize * 2 <= L2 && p.batch > 1)
+            {
+                for (size_t batch = 1; batch <= p.batch; ++batch)
+                    if (p.batch % batch == 0 && batch * bufSize <= L2)
+                        a.batch = batch;
+            }
+            a.macroH = Simd::RestrictRange(L2 / a.macroC / a.srcW / 2, size_t(1), p.dstH * a.batch);
+            a.macroD = Simd::RestrictRange(AlignLoAny(L3 / a.macroC / a.kA / 2, a.microD), a.microD, AlignHiAny(p.dstC, a.microD));
+            a.macroD = Simd::Min<size_t>(a.macroD, a.microD * 4);
+
+            a.bufD = AlignHi(a.batch * a.srcH * a.srcW, a.microS) * a.macroD;
+
+            a.elem = _elemD;
+            a.bufS = (a.batch * a.srcH * a.srcW + a.padE + a.microS) * a.srcC;
+
+            _stepS = p.srcH * p.srcW * p.srcC * a.batch * _elemS;
+            _stepD = p.dstH * p.dstW * p.dstC * a.batch * _elemD;
+
+            int dX = (int)a.microC, dY = (int)a.srcW * dX, dC = int(a.batch * a.srcH * a.srcW + a.padE) * dX;
+            _srcOffs.Resize(DivHi(a.K, a.microC));
+            for (size_t c = 0, offsS = 0, i = 0; c < a.srcC; c += dX, offsS += dC)
+                for (size_t y = 0, offsY = offsS; y < p.kernelY; y += 1, offsY += dY)
+                    for (size_t offsX = offsY, endX = offsY + p.kernelX * dX; offsX < endX; offsX += dX, i++)
+                        _srcOffs[i] = (int)offsX;
+
+            _dstMask.Resize(AlignHi((a.srcH * a.batch - a.gapV) * a.srcW - a.padH, a.microS));
+            size_t i = 0;
+            for (size_t b = 0; b < a.batch; b++)
+            {
+                for (size_t y = 0; y < p.dstH; y++)
+                {
+                    for (size_t x = 0; x < p.dstW; x++, i++)
+                        _dstMask[i] = -1;
+                    for (size_t x = 0; x < a.gapH; x++, i++)
+                        _dstMask[i] = 0;
+                }
+                for (size_t y = 0, gapI = a.gapV * a.srcW; y < gapI && i < _dstMask.size; y++, i++)
+                    _dstMask[i] = 0;
+            }
+            for (; i < _dstMask.size; i++)
+                _dstMask[i] = 0;
+
+            _nK.Resize(DivHi(a.srcC, a.macroC));
+            for (size_t o = 0, c = 0; o < _nK.size; o++, c += a.macroC)
+            {
+                size_t macroC = Simd::Min(a.srcC, c + a.macroC) - c;
+                _nK[o] = int(DivHi(macroC, a.microC) * a.kA);
+            }
+            if (_nK.size > 1 && _nK[_nK.size - 1] < _nK[_nK.size - 2])
+                Simd::Swap(_nK[_nK.size - 1], _nK[_nK.size - 2]);
+
+            size_t n = DivHi(a.batch * p.dstH, a.macroH);
+            _maBufOffs.Resize(n);
+            _maSumOffs.Resize(n + 1);
+            _miDstOffs.Resize(DivHi(_dstMask.size, a.microS));
+            for (size_t i = 0; i <= n; ++i)
+            {
+                if (i == n)
+                    _maSumOffs[i] = int((a.srcH * a.batch - a.gapV) * a.srcW - a.padH);
+                else
+                {
+                    size_t dy = i * a.macroH;
+                    size_t sumOffs = Simd::Max<ptrdiff_t>(dy * a.srcW - a.gapH, 0);
+                    _maSumOffs[i] = int(AlignLo(sumOffs, a.microS));
+                    _maBufOffs[i] = _maSumOffs[i];
+                }
+            }
+            _miDstOffs[0] = 0;
+            for (size_t i = 1; i < _miDstOffs.size; ++i)
+            {
+                _miDstOffs[i] = _miDstOffs[i - 1];
+                for (size_t j = (i - 1) * a.microS, m = i * a.microS; j < m; ++j)
+                    if (_dstMask[j])
+                        _miDstOffs[i]++;
+            }
+        }
+
+        size_t SynetConvolution16bNhwcSpecV3::ExternalBufferSize() const
+        {
+            const AlgParam& a = _alg;
+            size_t size = 0;
+            size += a.bufS * sizeof(uint16_t);
+            size += a.bufD * sizeof(float);
+            return size;
+        }
+
+        void SynetConvolution16bNhwcSpecV3::SetParams(const float* weight, const float* bias, const float* params)
+        {
+            SetWeight(weight);
+            SynetConvolution16b::SetBias(bias, _alg.microD);
+            SynetConvolution16b::SetParams(params, _alg.microD);
+        }
+
+        void SynetConvolution16bNhwcSpecV3::SetWeight(const float* weight)
+        {
+            const ConvParam& p = _param;
+            const AlgParam& a = _alg;
+            _weight.Resize(a.K * a.dstC, true);
+            uint16_t* dst = _weight.data;
+            const size_t microC = a.microC, F = a.F;
+            for (size_t mad = 0; mad < p.dstC; mad += F)
+            {
+                for (size_t mac = 0; mac < p.srcC; mac += microC)
+                {
+                    for (size_t k = 0; k < a.kA; k++)
+                    {
+                        for (size_t c = 0; c < microC; c += 2)
+                        {
+                            const float* src = weight + (k * p.srcC + mac + c) * p.dstC + mad;
+                            for (size_t d = 0; d < F; ++d)
+                            {
+                                for (size_t i = 0; i < 2; ++i)
+                                {
+                                    if (mad + d < p.dstC && mac + c + i < p.srcC)
+                                        *(dst++) = Float32ToBFloat16(src[i * p.dstC]);
+                                    else
+                                        *(dst++) = 0;
+                                }
+                                src++;
+                            }
+                         }
+                    }
+                }
+            }
+        }
+
+        void SynetConvolution16bNhwcSpecV3::Forward(const uint8_t* src, uint8_t* buf8, uint8_t* dst)
+        {
+            const ConvParam& p = _param;
+            const AlgParam& a = _alg;
+            buf8 = Buffer(buf8);
+            uint16_t* bufS = a.bufS ? Allocate<uint16_t>(buf8, a.bufS) : NULL;
+            float* bufD = a.bufD ? Allocate<float>(buf8, a.bufD) : NULL;
+            for (size_t b = 0; b < p.batch; b += a.batch)
+            {
+                uint16_t* buf = bufS ? bufS : (uint16_t*)src;
+                float* sum = bufD ? bufD : (float*)dst;
+                if(a.batch == 1)
+                    ForwardSingle(src, buf, sum, dst);
+                else
+                    ForwardBatch(src, buf, sum, dst);
+                src += _stepS;
+                dst += _stepD;
+            }
+        }
+
+        void SynetConvolution16bNhwcSpecV3::ForwardSingle(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst)
+        {
+            const ConvParam& p = _param;
+            const AlgParam& a = _alg;
+            const float* bias = _bias.data, * params = _params.data;
+            size_t dS = a.microC, dB = a.macroD, dD = p.dstC * _elemD;
+            size_t bufOffs = ((a.padV - p.padY) * a.srcW + (a.padH - p.padX)) * dS;
+            for (size_t mad = 0; mad < p.dstC; mad += a.macroD)
+            {
+                size_t macroD = Simd::Min(p.dstC, mad + a.macroD) - mad;
+                const uint16_t* weight = _weight.data + mad * a.K;
+                const int* srcOffs = _srcOffs.data;
+                for (size_t nk = 0; nk < _nK.size; ++nk)
+                {
+                    int zero = nk == 0 ? 1 : 0;
+                    size_t nK = _nK[nk];
+                    for (size_t dyBeg = 0, dyN = 0; dyBeg < p.dstH; dyN++)
+                    {
+                        size_t dyEnd = Simd::Min(dyBeg + a.macroH, p.dstH);
+                        size_t dstS = _maSumOffs[dyN + 1] - _maSumOffs[dyN];
+                        size_t miIdx = _maSumOffs[dyN] / a.microS;
+                        if (mad == 0 && zero)
+                            _preprocess(src, p, a, dyBeg, dyEnd, dyEnd == p.dstH ? 1 : 0, buf);
+                        if (nk == _nK.size - 1)
+                            _lastConv(buf + bufOffs + _maBufOffs[dyN] * dS, p, a, srcOffs, macroD, dstS, nK, zero, weight,
+                                sum + _maSumOffs[dyN] * dB, bias, params, _dstMask.data + _maSumOffs[dyN], _miDstOffs.data + miIdx, dst + _miDstOffs[miIdx] * dD);
+                        else
+                            _bodyConv(buf + bufOffs + _maBufOffs[dyN] * dS, p, a, srcOffs, macroD, dstS, nK, zero, weight, sum + _maSumOffs[dyN] * dB);
+                        dyBeg = dyEnd;
+                    }
+                    srcOffs += nK;
+                    weight += nK * a.microC * a.F;
+                }
+                bias += macroD;
+                if (p.activation == ::SimdConvolutionActivationPrelu)
+                    params += macroD;
+                dst += macroD * _elemD;
+            }
+        }
+
+        void SynetConvolution16bNhwcSpecV3::ForwardBatch(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst)
+        {
+            const ConvParam& p = _param;
+            const AlgParam& a = _alg;
+            const float* bias = _bias.data, * params = _params.data;
+            const int* mask = _dstMask.data;
+            size_t dstH = p.dstH * a.batch, dstS = _maSumOffs[1] - _maSumOffs[0];
+            size_t bufOffs = ((a.padV - p.padY) * a.srcW + (a.padH - p.padX)) * a.microC;
+            for (size_t mad = 0; mad < p.dstC; mad += a.macroD)
+            {
+                size_t macroD = Simd::Min(p.dstC, mad + a.macroD) - mad;
+                const uint16_t* weight = _weight.data + mad * a.K;
+                const int* srcOffs = _srcOffs.data;
+                for (size_t nk = 0; nk < _nK.size; ++nk)
+                {
+                    int zero = nk == 0 ? 1 : 0;
+                    size_t nK = _nK[nk];
+                    if (mad == 0 && zero)
+                    {
+                        size_t dS = p.srcH * p.srcW * p.srcC * _elemS;
+                        size_t dB = a.srcH * a.srcW * a.microC;
+                        for (size_t b = 0; b < a.batch; ++b)
+                            _preprocess(src + b * dS, p, a, 0, p.dstH, b == a.batch - 1 ? 1 : 0, buf + b * dB);
+                    }
+                    if (nk == _nK.size - 1)
+                        _lastConv(buf + bufOffs, p, a, srcOffs, macroD, dstS, nK, zero, weight, sum, bias, params, mask, _miDstOffs.data, dst);
+                    else
+                        _bodyConv(buf + bufOffs, p, a, srcOffs, macroD, dstS, nK, zero, weight, sum);
+                    srcOffs += nK;
+                    weight += nK * a.microC * a.F;
+                }
+                bias += macroD;
+                if (p.activation == ::SimdConvolutionActivationPrelu)
+                    params += macroD;
+                dst += macroD * _elemD;
+            }
+        }
+
+        bool SynetConvolution16bNhwcSpecV3::Preferable(const ConvParam& p)
+        {
+            const size_t M = p.dstH * p.dstW;
+            static int choise = 0;
+            return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4
+                && p.srcC >= 9 && p.srcC <= 128 && M >= 16;// && (choise++) & 0;
+        }
+    }
+#endif
+}
diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h
index 1fe2b46321..e481b76dd0 100644
--- a/src/Simd/SimdSynetConvolution16b.h
+++ b/src/Simd/SimdSynetConvolution16b.h
@@ -407,7 +407,7 @@ namespace Simd
                 const uint16_t* weight, float* sum, const float* bias, const float* params, const int* dstMask, const int* dstOffs, uint8_t* dst);
 
         protected:
-            void SetAlgParam(size_t F, size_t microD, size_t microS, size_t microC, size_t L1, size_t L2, size_t L3);
+            void SetAlgParam();
             virtual void SetWeight(const float* weight);
 
             void ForwardSingle(const uint8_t* src, uint16_t* buf, float* sum, uint8_t* dst);

From b0b21219bafb2d2c683e1d5763cedf66584e75c4 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Thu, 21 May 2026 12:02:20 +0300
Subject: [PATCH 21/32] +add AMX-BF16 optimizations of class
 SynetConvolution16bNhwcSpecV3.

---
 docs/2026.html                                |   2 +-
 prj/vs2022/AmxBf16.vcxproj                    |   1 +
 prj/vs2022/AmxBf16.vcxproj.filters            |   3 +
 src/Simd/SimdAmxBf16SynetConvolution16b.cpp   |   2 +
 ...mdAmxBf16SynetConvolution16bNhwcSpecV3.cpp | 695 ++++++++++++++++++
 .../SimdBaseSynetConvolution16bNhwcSpecV3.cpp |   4 +-
 src/Simd/SimdSynetConvolution16b.h            |   8 +
 7 files changed, 712 insertions(+), 3 deletions(-)
 create mode 100644 src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp

diff --git a/docs/2026.html b/docs/2026.html
index fb1b130286..cf0499a982 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -49,7 +49,7 @@ <h5>New features</h5>
  <li>SVE optimizations of function BackgroundGrowRangeFast.</li>
  <li>Method View::Copy.</li>
  <li>Method Frame::Copy.</li>
- <li>Base implementation of class SynetConvolution16bNhwcSpecV3.</li>
+ <li>Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
diff --git a/prj/vs2022/AmxBf16.vcxproj b/prj/vs2022/AmxBf16.vcxproj
index 11bea96861..fadbbffa28 100644
--- a/prj/vs2022/AmxBf16.vcxproj
+++ b/prj/vs2022/AmxBf16.vcxproj
@@ -88,6 +88,7 @@
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcSpecV0.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcSpecV1.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcSpecV2.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution8iDirect.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution8iDirect1x1.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution8iDirectAny.cpp" />
diff --git a/prj/vs2022/AmxBf16.vcxproj.filters b/prj/vs2022/AmxBf16.vcxproj.filters
index cd85d24155..31ef806543 100644
--- a/prj/vs2022/AmxBf16.vcxproj.filters
+++ b/prj/vs2022/AmxBf16.vcxproj.filters
@@ -353,5 +353,8 @@
     <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcGemmV2.cpp">
       <Filter>AmxBf16\Synet\Convolution</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp">
+      <Filter>AmxBf16\Synet\Convolution</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/src/Simd/SimdAmxBf16SynetConvolution16b.cpp b/src/Simd/SimdAmxBf16SynetConvolution16b.cpp
index 85589ae25c..a080a9c765 100644
--- a/src/Simd/SimdAmxBf16SynetConvolution16b.cpp
+++ b/src/Simd/SimdAmxBf16SynetConvolution16b.cpp
@@ -33,6 +33,8 @@ namespace Simd
             ConvParam param(batch, conv, compatibility);
             if (!param.Valid(SimdTensorData32f, SimdTensorData16b))
                 return NULL;
+            if (SynetConvolution16bNhwcSpecV3::Preferable(param))
+                return new AmxBf16::SynetConvolution16bNhwcSpecV3(param);
             if (SynetConvolution16bNhwcSpecV2::Preferable(param))
                 return new AmxBf16::SynetConvolution16bNhwcSpecV2(param);
             //if (SynetConvolution16bNhwcSpecV1::Preferable(param))
diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
new file mode 100644
index 0000000000..98137ca067
--- /dev/null
+++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
@@ -0,0 +1,695 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2026 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdSynetConvolution16b.h"
+#include "Simd/SimdSynetConvolution16bCommon.h"
+#include "Simd/SimdBFloat16.h"
+#include "Simd/SimdSynet.h"
+#include "Simd/SimdAmxBf16.h"
+#include "Simd/SimdSet.h"
+#include "Simd/SimdCopy.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdTile.h"
+
+namespace Simd
+{
+#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE)))
+    namespace AmxBf16
+    {
+        typedef Base::SynetConvolution16bNhwcSpecV3::AlgParam AlgParam;
+        typedef Base::SynetConvolution16bNhwcSpecV3::LastConvPtr LastConvPtr;
+
+        //-------------------------------------------------------------------------------------------------
+
+        static void Convert16bNhwcSpecV3(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t dyBeg, size_t dyEnd, int end, uint16_t* dst)
+        {
+            assert(a.microC == DF);
+            const float* src = (float*)src8;
+            size_t srcCDF = Simd::AlignLo(p.srcC, DF);
+            __mmask32 tailC = TailMask32(p.srcC - srcCDF);
+            size_t syPad = p.kernelY - 1 - p.padY, syBeg, syEnd = (dyEnd == p.dstH ? p.srcH : dyEnd + syPad);
+            size_t cD = a.batch * a.srcH * a.srcW + a.padE, sD = a.microC;
+            if (dyBeg == 0)
+            {
+                for (size_t s = 0, n = a.padV * a.srcW; s < n; ++s)
+                    for (size_t c = 0; c < a.srcC; c += a.microC)
+                        Avx512bw::SetZero(dst + c * cD + s * sD);
+                dst += a.padV * a.srcW * sD;
+                syBeg = 0;
+            }
+            else
+            {
+                syBeg = dyBeg + syPad;
+                src += syBeg * p.srcW * p.srcC;
+                dst += (dyBeg + p.kernelY - 1 + a.padV - p.padY) * a.srcW * sD;
+            }
+            for (size_t sy = syBeg; sy < syEnd; ++sy)
+            {
+                if (a.padH)
+                {
+                    for (size_t s = 0; s < a.padH; ++s)
+                        for (size_t c = 0; c < a.srcC; c += a.microC)
+                            Avx512bw::SetZero(dst + c * cD + s * sD);
+                    dst += a.padH * sD;
+                }
+                for (size_t sx = 0; sx < p.srcW; ++sx)
+                {
+                    size_t sc = 0;
+                    for (; sc < srcCDF; sc += DF)
+                        AmxBf16::Float32ToBFloat16(src + sc, dst + sc * cD);
+                    if (tailC)
+                        AmxBf16::Float32ToBFloat16(src + sc, dst + sc * cD, tailC);
+                    src += p.srcC;
+                    dst += sD;
+                }
+            }
+            if (end)
+            {
+                for (size_t s = 0, n = a.padE; s < n; ++s)
+                    for (size_t c = 0; c < a.srcC; c += a.microC)
+                        Avx512bw::SetZero(dst + c * cD + s * sD);
+            }
+            else if (dyEnd != p.dstH)
+            {
+                for (size_t s = 0, n = a.padH; s < n; ++s)
+                    for (size_t c = 0; c < a.srcC; c += a.microC)
+                        Avx512bw::SetZero(dst + c * cD + s * sD);
+            }
+        }
+
+        static void Reorder16bNhwcSpecV3(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t dyBeg, size_t dyEnd, int end, uint16_t* dst)
+        {
+            assert(a.microC == DF);
+            const uint16_t* src = (uint16_t*)src8;
+            size_t srcCDF = Simd::AlignLo(p.srcC, DF);
+            __mmask32 tailC = TailMask32(p.srcC - srcCDF);
+            size_t syPad = p.kernelY - 1 - p.padY, syBeg, syEnd = (dyEnd == p.dstH ? p.srcH : dyEnd + syPad);
+            size_t cD = a.batch * a.srcH * a.srcW + a.padE, sD = a.microC;
+            if (dyBeg == 0)
+            {
+                for (size_t s = 0, n = a.padV * a.srcW; s < n; ++s)
+                    for (size_t c = 0; c < a.srcC; c += a.microC)
+                        Avx512bw::SetZero(dst + c * cD + s * sD);
+                dst += a.padV * a.srcW * sD;
+                syBeg = 0;
+            }
+            else
+            {
+                syBeg = dyBeg + syPad;
+                src += syBeg * p.srcW * p.srcC;
+                dst += (dyBeg + p.kernelY - 1 + a.padV - p.padY) * a.srcW * sD;
+            }
+            for (size_t sy = syBeg; sy < syEnd; ++sy)
+            {
+                if (a.padH)
+                {
+                    for (size_t s = 0; s < a.padH; ++s)
+                        for (size_t c = 0; c < a.srcC; c += a.microC)
+                            Avx512bw::SetZero(dst + c * cD + s * sD);
+                    dst += a.padH * sD;
+                }
+                for (size_t sx = 0; sx < p.srcW; ++sx)
+                {
+                    size_t sc = 0;
+                    for (; sc < srcCDF; sc += DF)
+                        Avx512bw::Copy(src + sc, dst + sc * cD);
+                    if (tailC)
+                        Avx512bw::Copy(src + sc, dst + sc * cD, tailC);
+                    src += p.srcC;
+                    dst += sD;
+                }
+            }
+            if (end)
+            {
+                for (size_t s = 0, n = a.padE; s < n; ++s)
+                    for (size_t c = 0; c < a.srcC; c += a.microC)
+                        Avx512bw::SetZero(dst + c * cD + s * sD);
+            }
+            else if (dyEnd != p.dstH)
+            {
+                for (size_t s = 0, n = a.padH; s < n; ++s)
+                    for (size_t c = 0; c < a.srcC; c += a.microC)
+                        Avx512bw::SetZero(dst + c * cD + s * sD);
+            }
+        }
+
+        //-------------------------------------------------------------------------------------------------
+
+        SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
+        {
+            int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
+            const uint16_t* weight1 = weight0 + a.K * F;
+            const uint16_t* src1 = src0 + 16 * dS;
+            float* buf1 = buf0 + 16 * dB;
+
+            if (zero)
+            {
+                _tile_zero(0);
+                _tile_zero(1);
+                _tile_zero(2);
+                _tile_zero(3);
+            }
+            else
+            {
+                _tile_stream_loadd(0, buf0 + 0, strideB);
+                _tile_stream_loadd(1, buf0 + F, strideB);
+                _tile_stream_loadd(2, buf1 + 0, strideB);
+                _tile_stream_loadd(3, buf1 + F, strideB);
+            }
+
+            int n1 = (int)nK - 1, o = offs[0];
+            _tile_stream_loadd(4, src0 + o, strideS);
+            _tile_loadd(6, weight0, strideW);
+            for (int i = 0; i < n1; ++i, weight1 += dW)
+            {
+                _tile_stream_loadd(5, src1 + o, strideS);
+                _tile_loadd(7, weight1, strideW);
+                _tile_dpbf16ps(0, 4, 6);
+                _tile_dpbf16ps(1, 4, 7);
+                o = offs[i + 1];
+                _tile_stream_loadd(4, src0 + o, strideS);
+                _tile_dpbf16ps(2, 5, 6);
+                weight0 += dW;
+                _tile_loadd(6, weight0, strideW);
+                _tile_dpbf16ps(3, 5, 7);
+            }
+            _tile_loadd(7, weight1, strideW);
+            _tile_stream_loadd(5, src1 + offs[n1], strideS);
+
+            _tile_dpbf16ps(0, 4, 6);
+            _tile_stored(0, buf0 + 0, strideB);
+            TileMoveToMemory(buf0 + 0, dB);
+
+            _tile_dpbf16ps(1, 4, 7);
+            _tile_stored(1, buf0 + F, strideB);
+            TileMoveToMemory(buf0 + F, dB);
+
+            _tile_dpbf16ps(2, 5, 6);
+            _tile_stored(2, buf1 + 0, strideB);
+            TileMoveToMemory(buf1 + 0, dB);
+
+            _tile_dpbf16ps(3, 5, 7);
+            _tile_stored(3, buf1 + F, strideB);
+            TileMoveToMemory(buf1 + F, dB);
+        }
+
+        SIMD_INLINE void Convolution16bNhwcSpecV3Body32x16(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
+        {
+            int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
+            const uint16_t* src1 = src0 + 16 * dS;
+            float* buf1 = buf0 + 16 * dB;
+
+            if (zero)
+            {
+                _tile_zero(0);
+                _tile_zero(2);
+            }
+            else
+            {
+                _tile_stream_loadd(0, buf0 + 0, strideB);
+                _tile_stream_loadd(2, buf1 + 0, strideB);
+            }
+
+            int n1 = (int)nK - 1, o = offs[0];
+            _tile_loadd(4, src0 + o, strideS);
+            for (int i = 0; i < n1; ++i)
+            {
+                _tile_stream_loadd(6, weight0, strideW);
+                _tile_loadd(5, src1 + o, strideS);
+                _tile_dpbf16ps(0, 4, 6);
+                o = offs[i + 1];
+                _tile_loadd(4, src0 + o, strideS);
+                _tile_dpbf16ps(2, 5, 6);
+                weight0 += dW;
+            }
+            _tile_stream_loadd(6, weight0, strideW);
+            _tile_loadd(5, src1 + offs[n1], strideS);
+
+            _tile_dpbf16ps(0, 4, 6);
+            _tile_stored(0, buf0 + 0, strideB);
+            TileMoveToMemory(buf0 + 0, dB);
+
+            _tile_dpbf16ps(2, 5, 6);
+            _tile_stored(2, buf1 + 0, strideB);
+            TileMoveToMemory(buf1 + 0, dB);
+        }
+
+        SIMD_INLINE void Convolution16bNhwcSpecV3Body16x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
+        {
+            int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
+            const uint16_t* weight1 = weight0 + a.K * F;
+
+            if (zero)
+            {
+                _tile_zero(0);
+                _tile_zero(1);
+            }
+            else
+            {
+                _tile_stream_loadd(0, buf0 + 0, strideB);
+                _tile_stream_loadd(1, buf0 + F, strideB);
+            }
+
+            int n1 = (int)nK - 1;
+            _tile_loadd(6, weight0, strideW);
+            for (int i = 0; i < n1; ++i, weight1 += dW)
+            {
+                _tile_stream_loadd(4, src0 + offs[i], strideS);
+                _tile_loadd(7, weight1, strideW);
+                _tile_dpbf16ps(0, 4, 6);
+                weight0 += dW;
+                _tile_loadd(6, weight0, strideW);
+                _tile_dpbf16ps(1, 4, 7);
+            }
+            _tile_stream_loadd(4, src0 + offs[n1], strideS);
+            _tile_loadd(7, weight1, strideW);
+
+            _tile_dpbf16ps(0, 4, 6);
+            _tile_stored(0, buf0 + 0, strideB);
+            TileMoveToMemory(buf0 + 0, dB);
+
+            _tile_dpbf16ps(1, 4, 7);
+            _tile_stored(1, buf0 + F, strideB);
+            TileMoveToMemory(buf0 + F, dB);
+        }
+
+        SIMD_INLINE void Convolution16bNhwcSpecV3Body16x16(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
+        {
+            int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
+
+            if (zero)
+            {
+                _tile_zero(0);
+            }
+            else
+            {
+                _tile_stream_loadd(0, buf0 + 0, strideB);
+            }
+
+            int n = (int)nK;
+            for (int i = 0; i < n; ++i)
+            {
+                _tile_stream_loadd(4, src0 + offs[i], strideS);
+                _tile_loadd(6, weight0, strideW);
+                _tile_dpbf16ps(0, 4, 6);
+                weight0 += dW;
+            }
+
+            _tile_stored(0, buf0 + 0, strideB);
+            TileMoveToMemory(buf0 + 0, dB);
+        }
+
+        typedef void (*Convolution16bNhwcSpecV3BodyPtr)(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offset, size_t nK, int zero, const uint16_t* weight0, float* buf0);
+
+        static void Convolution16bNhwcSpecV3Body(const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* offs, size_t dstC, size_t dstS, size_t nK, int zero, const uint16_t* weight, float* buf)
+        {
+            size_t n1 = AlignHi(dstS, 16), n = 32;
+            size_t nn = AlignLo(n1, n), m = n1 - nn, dW = a.K * DF;
+            size_t dB = a.macroD, dS = a.microC;
+
+            SetTileConfFull();
+            for (size_t dc = 0; dc < dstC; dc += DF)
+            {
+                size_t dC = Simd::Min(DF, dstC - dc);
+                size_t i = 0;
+                if (dC > F)
+                {
+                    for (; i < nn; i += n)
+                        Convolution16bNhwcSpecV3Body32x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                    if (m)
+                        Convolution16bNhwcSpecV3Body16x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                }
+                else
+                {
+                    for (; i < nn; i += n)
+                        Convolution16bNhwcSpecV3Body32x16(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                    if (m)
+                        Convolution16bNhwcSpecV3Body16x16(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                }
+                weight += dW;
+                buf += DF;
+            }
+        }
+
+        //-------------------------------------------------------------------------------------------------
+
+        template<Term16bType term, SimdConvolutionActivationType type, int M, int flush> static SIMD_INLINE void ApplyMx1(
+            uint8_t *& ptr, int dP, float* buf, const __m512* bias, const __m512* params, const int* mask, __mmask32 tail = __mmask32(-1))
+        {
+            uint32_t msk = mask[0];
+            tail = tail & msk;
+            if (M == 1)
+            {
+                __m512 f0 = Activate<type>(_mm512_add_ps(_mm512_loadu_ps(buf), bias[0]), params, 0);
+                _mm_prefetch((const char*)buf + 0, _MM_HINT_NTA);
+                if (term == Term16bLast16b)
+                {
+                    _mm256_mask_storeu_epi16((uint16_t*)ptr, (__mmask16)tail, (__m256i)_mm512_cvtneps_pbh(f0));
+                    if (flush == 1)
+                        _mm_prefetch((const char*)ptr, _MM_HINT_NTA);
+                    else if (flush == 2)
+                        _m_prefetchw((char*)ptr);
+                }
+                else
+                {
+                    _mm512_mask_storeu_ps((float*)ptr, (__mmask16)tail, f0);
+                    if (flush == 1)
+                        _mm_prefetch((const char*)ptr, _MM_HINT_NTA);
+                    else if (flush == 2)
+                        _m_prefetchw((char*)ptr);
+                }
+            }
+            else if (M == 2)
+            {
+                __m512 f0 = Activate<type>(_mm512_add_ps(_mm512_loadu_ps(buf + 0), bias[0]), params, 0);
+                _mm_prefetch((const char*)buf + 0, _MM_HINT_NTA);
+                __m512 f1 = Activate<type>(_mm512_add_ps(_mm512_loadu_ps(buf + F), bias[1]), params, 1);
+                _mm_prefetch((const char*)buf + A, _MM_HINT_NTA);
+                if (term == Term16bLast16b)
+                {
+                    _mm512_mask_storeu_epi16((uint16_t*)ptr, tail, (__m512i)_mm512_cvtne2ps_pbh(f1, f0));
+                    if (flush == 1)
+                        _mm_prefetch((const char*)ptr, _MM_HINT_NTA);
+                    else if (flush == 2)
+                        _m_prefetchw((char*)ptr);
+                }
+                else
+                {
+                    _mm512_mask_storeu_ps((float*)ptr, (__mmask16)msk, f0);
+                    if (flush == 1)
+                        _mm_prefetch((const char*)ptr, _MM_HINT_NTA);
+                    else if (flush == 2)
+                        _m_prefetchw((char*)ptr + 0);
+                    _mm512_mask_storeu_ps((float*)(ptr + A), (__mmask16)tail, f1);
+                    if (flush == 1)
+                        _mm_prefetch((const char*)(ptr + A), _MM_HINT_NTA);
+                    else if (flush == 2)
+                        _m_prefetchw((char*)ptr + A);
+                }
+            }
+            ptr += dP & msk;
+        }
+
+        template<Term16bType term, SimdConvolutionActivationType type, int M, int N, int flush> static SIMD_INLINE void ApplyMxN(
+            uint8_t*& ptr, int dP, float* buf, int dB, const __m512* bias, const __m512* params, const int* mask, __mmask32 tail = __mmask32(-1))
+        {
+            if (N > 0) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 0 * dB, bias, params, mask + 0, tail);
+            if (N > 1) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 1 * dB, bias, params, mask + 1, tail);
+            if (N > 2) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 2 * dB, bias, params, mask + 2, tail);
+            if (N > 3) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 3 * dB, bias, params, mask + 3, tail);
+            if (N > 4) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 4 * dB, bias, params, mask + 4, tail);
+            if (N > 5) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 5 * dB, bias, params, mask + 5, tail);
+            if (N > 6) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 6 * dB, bias, params, mask + 6, tail);
+            if (N > 7) ApplyMx1<term, type, M, flush>(ptr, dP, buf + 7 * dB, bias, params, mask + 7, tail);
+        }
+
+        //-------------------------------------------------------------------------------------------------
+
+        template<Term16bType term, SimdConvolutionActivationType type, int M, int apply, int flush> void Convolution16bNhwcSpecV3_1x32x32(
+            const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, 
+            const __m512* bias, const __m512* params, float* buf2, const int* mask, uint8_t * &dst, __mmask32 tail)
+        {
+            int dD = int(p.dstC * a.elem), dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
+            const uint16_t* weight1 = weight0 + a.K * F;
+            const uint16_t* src1 = src0 + 16 * dS;
+            float* buf0 = buf2 - 32 * dB;
+            float* buf3 = buf2 + 16 * dB;
+
+            if (zero)
+            {
+                if (M > 0) _tile_zero(0);
+                if (M > 1) _tile_zero(1);
+                if (M > 0) _tile_zero(2);
+                if (M > 1) _tile_zero(3);
+            }
+            else
+            {
+                if (M > 0) _tile_stream_loadd(0, buf2 + 0, strideB);
+                if (M > 1) _tile_stream_loadd(1, buf2 + F, strideB);
+                if (M > 0) _tile_stream_loadd(2, buf3 + 0, strideB);
+                if (M > 1) _tile_stream_loadd(3, buf3 + F, strideB);
+            }
+
+            int n1 = (int)nK - 1, i = 0, o = offs[0], na = apply ? (8 / apply - 1) : 0, ds = 0;
+            _tile_stream_loadd(4, src0 + o, strideS);
+            if (M > 0) _tile_loadd(6, weight0, strideW);
+            for (; i < na; ++i, weight1 += dW)
+            {
+                if (M > 1) _tile_loadd(7, weight1, strideW);
+                if (M > 0) _tile_dpbf16ps(0, 4, 6);
+                ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+                _tile_stream_loadd(5, src1 + o, strideS);
+                if (M > 1) _tile_dpbf16ps(1, 4, 7);
+                ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+                o = offs[i + 1];
+                _tile_stream_loadd(4, src0 + o, strideS);
+                if (M > 0) _tile_dpbf16ps(2, 5, 6);
+                ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+                weight0 += dW;
+                if (M > 0) _tile_loadd(6, weight0, strideW);
+                if (M > 1) _tile_dpbf16ps(3, 5, 7);
+                ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+            }
+            for (; i < n1; ++i, weight1 += dW)
+            {
+                if (M > 1) _tile_loadd(7, weight1, strideW);
+                if (M > 0) _tile_dpbf16ps(0, 4, 6);
+                _tile_stream_loadd(5, src1 + o, strideS);
+                if (M > 1) _tile_dpbf16ps(1, 4, 7);
+                o = offs[i + 1];
+                _tile_stream_loadd(4, src0 + o, strideS);
+                if (M > 0) _tile_dpbf16ps(2, 5, 6);
+                weight0 += dW;
+                if (M > 0) _tile_loadd(6, weight0, strideW);
+                if (M > 1) _tile_dpbf16ps(3, 5, 7);
+            }
+            if (M > 1) _tile_loadd(7, weight1, strideW);
+            _tile_stream_loadd(5, src1 + offs[n1], strideS);
+
+            if (M > 0) _tile_dpbf16ps(0, 4, 6);
+            ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+            if (M > 0) _tile_stored(0, buf2 + 0, strideB);
+
+            if (M > 1) _tile_dpbf16ps(1, 4, 7);
+            ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+            if (M > 1) _tile_stored(1, buf2 + F, strideB);
+
+            if (M > 0) _tile_dpbf16ps(2, 5, 6);
+            ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+            if (M > 0) _tile_stored(2, buf3 + 0, strideB);
+
+            if (M > 1) _tile_dpbf16ps(3, 5, 7);
+            ApplyMxN<term, type, M, apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += apply;
+            if (M > 1) _tile_stored(3, buf3 + F, strideB);
+        }
+
+        template<Term16bType term, SimdConvolutionActivationType type, int M, int apply, int flush> void Convolution16bNhwcSpecV3_1x16x32(
+            const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0,
+            const __m512* bias, const __m512* params, float* buf2, const int* mask, uint8_t*& dst, __mmask32 tail)
+        {
+            int dD = int(p.dstC * a.elem), dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
+            const uint16_t* weight1 = weight0 + a.K * F;
+            const uint16_t* src1 = src0 + 16 * dS;
+            float* buf0 = buf2 - 32 * dB;
+
+            if (zero)
+            {
+                if (M > 0) _tile_zero(0);
+                if (M > 1) _tile_zero(1);
+            }
+            else
+            {
+                if (M > 0) _tile_stream_loadd(0, buf2 + 0, strideB);
+                if (M > 1) _tile_stream_loadd(1, buf2 + F, strideB);
+            }
+
+            int n1 = (int)nK - 1, i = 0, o = offs[0], na = apply ? (8 / apply - 1) : 0, ds = 0;
+            _tile_stream_loadd(4, src0 + o, strideS);
+            if (M > 0) _tile_loadd(6, weight0, strideW);
+            for (; i < na; ++i, weight1 += dW)
+            {
+                if (M > 1) _tile_loadd(7, weight1, strideW);
+                if (M > 0) _tile_dpbf16ps(0, 4, 6);
+                ApplyMxN<term, type, M, 2 * apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply;
+                if (M > 1) _tile_dpbf16ps(1, 4, 7);
+                ApplyMxN<term, type, M, 2 * apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply;
+                o = offs[i + 1];
+                _tile_stream_loadd(4, src0 + o, strideS);
+                weight0 += dW;
+                if (M > 0) _tile_loadd(6, weight0, strideW);
+            }
+            for (; i < n1; ++i, weight1 += dW)
+            {
+                if (M > 1) _tile_loadd(7, weight1, strideW);
+                if (M > 0) _tile_dpbf16ps(0, 4, 6);
+                if (M > 1) _tile_dpbf16ps(1, 4, 7);
+                o = offs[i + 1];
+                _tile_stream_loadd(4, src0 + o, strideS);
+                weight0 += dW;
+                if (M > 0) _tile_loadd(6, weight0, strideW);
+            }
+            if (M > 1) _tile_loadd(7, weight1, strideW);
+
+            if (M > 0) _tile_dpbf16ps(0, 4, 6);
+            ApplyMxN<term, type, M, 2 * apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply;
+            if (M > 0) _tile_stored(0, buf2 + 0, strideB);
+
+            if (M > 1) _tile_dpbf16ps(1, 4, 7);
+            ApplyMxN<term, type, M, 2 * apply, flush>(dst, dD, buf0 + ds * dB, dB, bias, params, mask + ds, tail), ds += 2 * apply;
+            if (M > 1) _tile_stored(1, buf2 + F, strideB);
+        }
+
+        template<Term16bType term, SimdConvolutionActivationType type, int M, int apply, int flush> void Convolution16bNhwcSpecV3_Nx32x32M(
+            const uint16_t* src0, const ConvParam& p, const AlgParam& a, size_t dstS, const int* offs, size_t nK, int zero, const uint16_t* weight0,
+            const float* bias, const float* params, __m512* _params, float* buf, const int* mask, uint8_t* dst, __mmask32 tail)
+        {
+            int dB = (int)a.macroD, dD = int(p.dstC * a.elem), dS = (int)a.microC;
+
+            __m512 _bias[2];
+            if (M > 0) _bias[0] = _mm512_loadu_ps(bias + 0 * F);
+            if (M > 1) _bias[1] = _mm512_loadu_ps(bias + 1 * F);
+            if (type == SimdConvolutionActivationPrelu)
+            {
+                if (M > 0) _params[0] = _mm512_loadu_ps(params + 0 * F);
+                if (M > 1) _params[1] = _mm512_loadu_ps(params + 1 * F);
+            }
+
+            size_t pds = 0;
+            Convolution16bNhwcSpecV3_1x32x32<term, type, M, 0, flush>(src0, p, a, offs, nK, zero, weight0, _bias, _params, buf, mask, dst, tail);
+            for (size_t cds = 32; cds < dstS; pds += 32)
+            {
+                if (cds + 16 >= dstS)
+                {
+                    Convolution16bNhwcSpecV3_1x16x32<term, type, M, apply, flush>(src0 + cds * dS, p, a, offs, nK, zero, weight0, _bias, _params, buf + cds * dB, mask + pds, dst, tail);
+                    cds += 16;
+                }
+                else
+                {
+                    Convolution16bNhwcSpecV3_1x32x32<term, type, M, apply, flush>(src0 + cds * dS, p, a, offs, nK, zero, weight0, _bias, _params, buf + cds * dB, mask + pds, dst, tail);
+                    cds += 32;
+                }
+            }
+            size_t dstS8 = dstS & (~7);
+            for (; pds < dstS8; pds += 8)
+            {
+                ApplyMxN<term, type, M, 8, flush>(dst, dD, buf + pds * dB, dB, _bias, _params, mask + pds, tail);
+            }
+            for (; pds < dstS; ++pds)
+            {
+                ApplyMxN<term, type, M, 1, flush>(dst, dD, buf + pds * dB, dB, _bias, _params, mask + pds, tail);
+            }
+        }
+
+        //-------------------------------------------------------------------------------------------------
+
+        typedef void (*Convolution16bNhwcSpecV3LastPtr)(const uint16_t* src0, const ConvParam& p, const AlgParam& a, size_t dstS, const int* offs, size_t nK, int zero, 
+            const uint16_t* weight0, const float* bias, const float* params, __m512* _params, float* buf, const int* mask, uint8_t* dst, __mmask32 tail);
+
+        template<Term16bType term, SimdConvolutionActivationType type, int apply, int flush> void Convolution16bNhwcSpecV3Last(
+            const uint16_t* src, const ConvParam& p, const AlgParam& a, const int* offs, size_t dstC, size_t dstS, size_t nK, int zero,
+            const uint16_t* weight, float* buf, const float* bias, const float* params, const int* mask, const int* dstOffs, uint8_t* dst)
+        {
+            size_t n = 256, n1 = dstS, nn = AlignLoAny(n1, n), dW = a.K * a.microD;
+            size_t dB = a.macroD, dD = p.dstC * a.elem, dS = a.microC;
+
+            size_t dstC32 = AlignLo(dstC, 32), dstCt = dstC - dstC32;
+            __mmask32 tailD = term == Term16bLast16b ? TailMask32(dstCt) : (__mmask32)TailMask16(dstCt - AlignLo(dstCt - 1, 16));
+            Convolution16bNhwcSpecV3LastPtr mainConv = Convolution16bNhwcSpecV3_Nx32x32M<term, type, 2, apply, flush>;
+            Convolution16bNhwcSpecV3LastPtr tailConv = dstCt > 16 ? Convolution16bNhwcSpecV3_Nx32x32M<term, type, 2, apply, flush> :
+                Convolution16bNhwcSpecV3_Nx32x32M<term, type, 1, apply, flush>;
+
+            __m512 _params[2];
+            _params[0] = _mm512_set1_ps(params[0]);
+            if (type == SimdConvolutionActivationRestrictRange ||
+                type == SimdConvolutionActivationHswish ||
+                type == SimdConvolutionActivationHardSigmoid)
+                _params[1] = _mm512_set1_ps(params[1]);
+
+            SetTileConfFull();
+            for (size_t i = 0; i < n1;)
+            {
+                size_t dn = (n1 - i >= n + 32 ? n : n1 - i);
+                const uint16_t* s = src + i * dS;
+                const uint16_t* w = weight;
+                float* b = buf + i * dB;
+                uint8_t* d = dst + (dstOffs[i/32] - dstOffs[0]) * dD;
+                size_t dc = 0;
+                for (; dc < dstC32; dc += DF, w += dW)
+                    mainConv(s, p, a, dn, offs, nK, zero, w, bias + dc, params + dc, _params, b + dc, mask + i, d + dc * a.elem, __mmask32(-1));
+                if (dc < dstC)
+                    tailConv(s, p, a, dn, offs, nK, zero, w, bias + dc, params + dc, _params, b + dc, mask + i, d + dc * a.elem, tailD);
+                i += dn;
+            }
+        }
+
+        //-------------------------------------------------------------------------------------------------
+
+        template <Term16bType term, SimdConvolutionActivationType type, int flush> SIMD_INLINE void SetLastConvV3(const ConvParam& p, size_t nK, LastConvPtr& lastConv)
+        {
+            if (nK >= 8)
+                lastConv = Convolution16bNhwcSpecV3Last<term, type, 1, flush>;
+            else if (nK >= 4)
+                lastConv = Convolution16bNhwcSpecV3Last<term, type, 2, flush>;
+            else if (nK >= 2)
+                lastConv = Convolution16bNhwcSpecV3Last<term, type, 4, flush>;
+            else
+                lastConv = NULL;
+        }
+
+        template <SimdConvolutionActivationType type> SIMD_INLINE void SetLastConvV3(const ConvParam& p, size_t nK, LastConvPtr& lastConv)
+        {
+            if (p.dstT == SimdTensorData16b)
+                SetLastConvV3<Term16bLast16b, type, 0>(p, nK, lastConv);
+            else
+                SetLastConvV3<Term16bLast32f, type, 0>(p, nK, lastConv);
+        }
+
+        SynetConvolution16bNhwcSpecV3::SynetConvolution16bNhwcSpecV3(const ConvParam & p)
+            : Base::SynetConvolution16bNhwcSpecV3(p)
+        {
+            SetAlgParam();
+            if (_src16b)
+                _preprocess = Reorder16bNhwcSpecV3;
+            else
+                _preprocess = Convert16bNhwcSpecV3;
+            _bodyConv = Convolution16bNhwcSpecV3Body;
+            size_t nK = _nK[_nK.size - 1];
+            switch (p.activation)
+            {
+            case SimdConvolutionActivationIdentity: SetLastConvV3<SimdConvolutionActivationRestrictRange>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationRelu: SetLastConvV3<SimdConvolutionActivationRestrictRange>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationLeakyRelu: SetLastConvV3<SimdConvolutionActivationPrelu>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationRestrictRange: SetLastConvV3<SimdConvolutionActivationRestrictRange>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationPrelu: SetLastConvV3<SimdConvolutionActivationPrelu>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationElu: SetLastConvV3<SimdConvolutionActivationElu>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationHswish: SetLastConvV3<SimdConvolutionActivationHswish>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationMish: SetLastConvV3<SimdConvolutionActivationMish>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationHardSigmoid: SetLastConvV3<SimdConvolutionActivationHardSigmoid>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationSwish: SetLastConvV3<SimdConvolutionActivationSwish>(p, nK, _lastConv); break;
+            case SimdConvolutionActivationGelu: SetLastConvV3<SimdConvolutionActivationGelu>(p, nK, _lastConv); break;
+            default: assert(0);
+            }
+        }
+    }
+#endif
+}
diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
index 79c1cf8c2d..c3ff94b096 100644
--- a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
+++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
@@ -306,8 +306,8 @@ namespace Simd
         {
             const size_t M = p.dstH * p.dstW;
             static int choise = 0;
-            return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4
-                && p.srcC >= 9 && p.srcC <= 128 && M >= 16;// && (choise++) & 0;
+            return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && p.kernelX == 3 && p.dstC >= 4
+                && p.srcC >= 9 && /*p.srcC <= 128 &&*/ M >= 16 && 1;// && (choise++) & 0;
         }
     }
 #endif
diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h
index e481b76dd0..2ac8b90d46 100644
--- a/src/Simd/SimdSynetConvolution16b.h
+++ b/src/Simd/SimdSynetConvolution16b.h
@@ -692,6 +692,14 @@ namespace Simd
             virtual String Ext() const { return "AmxBf16"; }
         };
 
+        class SynetConvolution16bNhwcSpecV3 : public Base::SynetConvolution16bNhwcSpecV3
+        {
+        public:
+            SynetConvolution16bNhwcSpecV3(const ConvParam& p);
+
+            virtual String Ext() const { return "AmxBf16"; }
+        };
+
         class SynetConvolution16bNchwGemm : public Avx512bw::SynetConvolution16bNchwGemm
         {
         public:

From b922645873ec5b2bfd32afa8c4f6160a21d71f37 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Thu, 21 May 2026 13:36:55 +0300
Subject: [PATCH 22/32] *extend using of SynetConvolution16bNhwcSpecV3.

---
 src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp | 2 +-
 src/Test/TestSynetConvolution16b.cpp               | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
index c3ff94b096..e912315c11 100644
--- a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
+++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
@@ -306,7 +306,7 @@ namespace Simd
         {
             const size_t M = p.dstH * p.dstW;
             static int choise = 0;
-            return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && p.kernelX == 3 && p.dstC >= 4
+            return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4
                 && p.srcC >= 9 && /*p.srcC <= 128 &&*/ M >= 16 && 1;// && (choise++) & 0;
         }
     }
diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp
index 0cd899620b..b8de4da8b2 100644
--- a/src/Test/TestSynetConvolution16b.cpp
+++ b/src/Test/TestSynetConvolution16b.cpp
@@ -563,6 +563,12 @@ namespace Test
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
         //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
 #endif
+#if 1
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(7, 1), _1, _1, Size(3, 0), Size(3, 0), 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 13, 13, 160, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 160, 13, 13, 192, Size(7, 1), _1, _1, Size(3, 0), Size(3, 0), 1, aPr, tT, b16, b16), c, f1, f2);
+#endif
 #if 0
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _1, _1, _2, _0, _0, 1, aId, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2);

From b261443fdf1dd696037ed03041dc711a2b75b395 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Thu, 21 May 2026 19:49:43 +0300
Subject: [PATCH 23/32] +add kernel Convolution16bNhwcSpecV3Body32x32_Yx3.

---
 ...mdAmxBf16SynetConvolution16bNhwcSpecV3.cpp | 123 +++++++++++++++++-
 .../SimdBaseSynetConvolution16bNhwcSpecV3.cpp |   2 +-
 src/Test/TestSynetConvolution16b.cpp          |  13 +-
 3 files changed, 125 insertions(+), 13 deletions(-)

diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
index 98137ca067..7154a8859f 100644
--- a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
+++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
@@ -155,7 +155,7 @@ namespace Simd
 
         //-------------------------------------------------------------------------------------------------
 
-        SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
+        SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32_Any(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
         {
             int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
             const uint16_t* weight1 = weight0 + a.K * F;
@@ -213,6 +213,115 @@ namespace Simd
             TileMoveToMemory(buf1 + F, dB);
         }
 
+        SIMD_INLINE void Convolution16bNhwcSpecV3Body32x32_Yx3(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
+        {
+            int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 4, dW = 512, strideW = 64, strideB = dB * 8;
+            const uint16_t* weight1 = weight0 + a.K * F;
+            const uint16_t* src1 = src0 + dS;
+            float* buf1 = buf0 + dB;
+
+            if (zero)
+            {
+                _tile_zero(0);
+                _tile_zero(1);
+                _tile_zero(2);
+                _tile_zero(3);
+            }
+            else
+            {
+                _tile_stream_loadd(0, buf0 + 0, strideB);
+                _tile_stream_loadd(1, buf0 + F, strideB);
+                _tile_stream_loadd(2, buf1 + 0, strideB);
+                _tile_stream_loadd(3, buf1 + F, strideB);
+            }
+
+            int n3 = (int)nK - 3, i = 0, o = offs[i];
+            _tile_stream_loadd(4, src0 + o, strideS);
+            _tile_loadd(6, weight0, strideW);
+
+            for (; i < n3; i += 3)
+            {
+                _tile_stream_loadd(5, src1 + o, strideS);
+                _tile_loadd(7, weight1, strideW);
+                weight1 += dW;
+                _tile_dpbf16ps(0, 4, 6);
+                _tile_dpbf16ps(1, 4, 7);
+                o = offs[i + 1];
+                _tile_stream_loadd(4, src1 + o, strideS);
+                _tile_dpbf16ps(2, 5, 6);
+                weight0 += dW;
+                _tile_loadd(6, weight0, strideW);
+                _tile_dpbf16ps(3, 5, 7);
+
+                //_tile_stream_loadd(4, src1 + o, strideS);
+                _tile_loadd(7, weight1, strideW);
+                weight1 += dW;
+                _tile_dpbf16ps(0, 5, 6);
+                _tile_dpbf16ps(1, 5, 7);
+                o = offs[i + 2];
+                _tile_stream_loadd(5, src1 + o, strideS);
+                _tile_dpbf16ps(2, 4, 6);
+                weight0 += dW;
+                _tile_loadd(6, weight0, strideW);
+                _tile_dpbf16ps(3, 4, 7);
+
+                //_tile_stream_loadd(5, src1 + o, strideS);
+                _tile_loadd(7, weight1, strideW);
+                weight1 += dW;
+                _tile_dpbf16ps(0, 4, 6);
+                _tile_dpbf16ps(1, 4, 7);
+                o = offs[i + 3];
+                _tile_stream_loadd(4, src0 + o, strideS);
+                _tile_dpbf16ps(2, 5, 6);
+                weight0 += dW;
+                _tile_loadd(6, weight0, strideW);
+                _tile_dpbf16ps(3, 5, 7);
+            }
+
+            _tile_stream_loadd(5, src1 + o, strideS);
+            _tile_loadd(7, weight1, strideW);
+            weight1 += dW;
+            _tile_dpbf16ps(0, 4, 6);
+            _tile_dpbf16ps(1, 4, 7);
+            o = offs[i + 1];
+            _tile_stream_loadd(4, src1 + o, strideS);
+            _tile_dpbf16ps(2, 5, 6);
+            weight0 += dW;
+            _tile_loadd(6, weight0, strideW);
+            _tile_dpbf16ps(3, 5, 7);
+
+            //_tile_stream_loadd(5, src1 + o, strideS);
+            _tile_loadd(7, weight1, strideW);
+            weight1 += dW;
+            _tile_dpbf16ps(0, 5, 6);
+            _tile_dpbf16ps(1, 5, 7);
+            o = offs[i + 2];
+            _tile_stream_loadd(5, src1 + o, strideS);
+            _tile_dpbf16ps(2, 4, 6);
+            weight0 += dW;
+            _tile_loadd(6, weight0, strideW);
+            _tile_dpbf16ps(3, 4, 7);
+
+            _tile_loadd(7, weight1, strideW);
+            //_tile_stream_loadd(5, src1 + o, strideS);
+
+            _tile_dpbf16ps(0, 4, 6);
+            _tile_stored(0, buf0 + 0, strideB);
+            TileMoveToMemory(buf0 + 0, dB);
+
+            _tile_dpbf16ps(1, 4, 7);
+            _tile_stored(1, buf0 + F, strideB);
+            TileMoveToMemory(buf0 + F, dB);
+
+            _tile_dpbf16ps(2, 5, 6);
+            _tile_stored(2, buf1 + 0, strideB);
+            TileMoveToMemory(buf1 + 0, dB);
+
+            _tile_dpbf16ps(3, 5, 7);
+            _tile_stored(3, buf1 + F, strideB);
+            TileMoveToMemory(buf1 + F, dB);
+        }
+
         SIMD_INLINE void Convolution16bNhwcSpecV3Body32x16(const uint16_t* src0, const ConvParam& p, const AlgParam& a, const int* offs, size_t nK, int zero, const uint16_t* weight0, float* buf0)
         {
             int dB = (int)a.macroD, dS = (int)a.microC, strideS = dS * 2, dW = 512, strideW = 64, strideB = dB * 4;
@@ -334,8 +443,16 @@ namespace Simd
                 size_t i = 0;
                 if (dC > F)
                 {
-                    for (; i < nn; i += n)
-                        Convolution16bNhwcSpecV3Body32x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                    if (p.kernelX == 3 && 1)
+                    {
+                        for (; i < nn; i += n)
+                            Convolution16bNhwcSpecV3Body32x32_Yx3(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                    }
+                    else
+                    {
+                        for (; i < nn; i += n)
+                            Convolution16bNhwcSpecV3Body32x32_Any(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
+                    }
                     if (m)
                         Convolution16bNhwcSpecV3Body16x32(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);
                 }
diff --git a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
index e912315c11..1884736f45 100644
--- a/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
+++ b/src/Simd/SimdBaseSynetConvolution16bNhwcSpecV3.cpp
@@ -304,7 +304,7 @@ namespace Simd
 
         bool SynetConvolution16bNhwcSpecV3::Preferable(const ConvParam& p)
         {
-            const size_t M = p.dstH * p.dstW;
+            const size_t M = p.batch * p.dstH * p.dstW; 
             static int choise = 0;
             return 1 && p.trans != 0 && p.group == 1 && p.IsDilation(1) && p.IsStride(1) && !p.IsKernel(1) && p.dstC >= 4
                 && p.srcC >= 9 && /*p.srcC <= 128 &&*/ M >= 16 && 1;// && (choise++) & 0;
diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp
index b8de4da8b2..af58e54b30 100644
--- a/src/Test/TestSynetConvolution16b.cpp
+++ b/src/Test/TestSynetConvolution16b.cpp
@@ -548,22 +548,17 @@ namespace Test
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 28, 24, 32, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 96, 96, 96, _3, _1, _2, _1, _1, 1, aId, tT, b16, b16), c, f1, f2);
 #endif
-#if 1
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+#if 0
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 96, 96, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 48, 48, 56, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
+#endif
+#if 1
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 112, 24, 24, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 24, 24, 112, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 224, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 224, 12, 12, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
-        //result = result && SynetConvolution16bForwardAutoTest(eps, Param(10, 448, 6, 6, 448, _3, _1, _1, _1, _1, 1, aPr, tT, b16, b16), c, f1, f2);
 #endif
-#if 1
+#if 0
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 57, 57, 64, Size(7, 1), _1, _1, Size(3, 0), Size(3, 0), 1, aPr, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 128, 13, 13, 160, Size(1, 7), _1, _1, Size(0, 3), Size(0, 3), 1, aPr, tT, b16, b16), c, f1, f2);

From 7598c32cfabd4eaf9a908c89aa58b4e2d269dc2a Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Thu, 21 May 2026 20:24:28 +0300
Subject: [PATCH 24/32] *disable using of
 Convolution16bNhwcSpecV3Body32x32_Yx3.

---
 src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
index 7154a8859f..56f0ccd608 100644
--- a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
+++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcSpecV3.cpp
@@ -443,7 +443,7 @@ namespace Simd
                 size_t i = 0;
                 if (dC > F)
                 {
-                    if (p.kernelX == 3 && 1)
+                    if (p.kernelX == 3 && 0)
                     {
                         for (; i < nn; i += n)
                             Convolution16bNhwcSpecV3Body32x32_Yx3(src + i * dS, p, a, offs, nK, zero, weight, buf + i * dB);

From 3aa46dd3855dc1f509060a870aea1ce3d821ee13 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Fri, 22 May 2026 17:28:47 +0300
Subject: [PATCH 25/32] *update help.

---
 docs/2026.html                           |  6 +++
 docs/help/functions_b.html               |  2 +-
 docs/help/functions_c.html               |  2 +-
 docs/help/functions_f.html               |  7 ++-
 docs/help/functions_func_c.html          |  2 +-
 docs/help/functions_i.html               |  2 +-
 docs/help/functions_l.html               |  2 +-
 docs/help/functions_r.html               |  2 +-
 docs/help/functions_t.html               |  3 +-
 docs/help/functions_u.html               |  2 +-
 docs/help/group__cpu__flags.html         | 69 +++++++++++++++++++-----
 docs/help/group__matrix.html             |  4 +-
 docs/help/group__thread.html             |  7 +--
 docs/help/struct_simd_1_1_detection.html |  6 +--
 docs/help/struct_simd_1_1_frame.html     | 49 +++++++++++++++++
 docs/help/struct_simd_1_1_view.html      | 46 ++++++++++++++++
 16 files changed, 178 insertions(+), 33 deletions(-)

diff --git a/docs/2026.html b/docs/2026.html
index cf0499a982..cb35619654 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -67,6 +67,12 @@ <h5>Improving</h5>
  <li>Description of function SimdAlign.</li>
  <li>Description of function SimdAlignment.</li>
  <li>Description of function SimdRelease.</li>
+ <li>Description of function SimdGetThreadNumber.</li>
+ <li>Description of function SimdSetThreadNumber.</li>
+ <li>Description of function SimdEmpty.</li>
+ <li>Description of function SimdGetFastMode.</li>
+ <li>Description of function SimdSetFastMode.</li>
+ <li>Description of function SimdSetAmxFull.</li>
 </ul>
 
 <a href="#HOME">Home</a>
diff --git a/docs/help/functions_b.html b/docs/help/functions_b.html
index 1be8318359..3c8eadd3f7 100644
--- a/docs/help/functions_b.html
+++ b/docs/help/functions_b.html
@@ -78,8 +78,8 @@ <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
 <li>BorderConstant&#160;:&#160;<a class="el" href="class_simd_1_1_warp_affine_flags.html#a0a80ede35a765cfff820946b7a9c81e8">WarpAffineFlags</a></li>
 <li>BorderMask&#160;:&#160;<a class="el" href="class_simd_1_1_warp_affine_flags.html#ae2d7b4217d6956355c274d438156a700">WarpAffineFlags</a></li>
 <li>BorderTransparent&#160;:&#160;<a class="el" href="class_simd_1_1_warp_affine_flags.html#a9cb2599ecb9795184d0abb307cb75a6b">WarpAffineFlags</a></li>
-<li>bottom&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#adacd636ce133312813693b22f6672aa1">Rectangle&lt; T &gt;</a></li>
 <li>Bottom()&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#a12ac0e6198707a383f4dfa8a324b79e9">Rectangle&lt; T &gt;</a></li>
+<li>bottom&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#adacd636ce133312813693b22f6672aa1">Rectangle&lt; T &gt;</a></li>
 <li>BottomCenter&#160;:&#160;<a class="el" href="class_simd_1_1_position.html#aa7840fae6c0e8c1c66a9d17371f084f3">Position</a>, <a class="el" href="struct_simd_1_1_view.html#ab91b34ae619fcdfcba4522b4f335bf83a74bb4e63036551780b66cf8beac32a44">View&lt; A &gt;</a></li>
 <li>BottomLeft&#160;:&#160;<a class="el" href="class_simd_1_1_position.html#a3a273a1ffc84209870abf07ec0b5b561">Position</a>, <a class="el" href="struct_simd_1_1_rectangle.html#a0f245a29ed3eaabde25c8cefa6305a49">Rectangle&lt; T &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#ab91b34ae619fcdfcba4522b4f335bf83ae61b9b6ea2fa75ca500d5bb1eaf6f6fc">View&lt; A &gt;</a></li>
 <li>BottomRight&#160;:&#160;<a class="el" href="class_simd_1_1_position.html#a4933837db1b048061f079b1c473e5034">Position</a>, <a class="el" href="struct_simd_1_1_rectangle.html#afd8180f206f74b4ce203b4c61c6ccef5">Rectangle&lt; T &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#ab91b34ae619fcdfcba4522b4f335bf83a1640f649d644701a2f4633e6bd88b20c">View&lt; A &gt;</a></li>
diff --git a/docs/help/functions_c.html b/docs/help/functions_c.html
index 9c0dc19362..348ae5e3f6 100644
--- a/docs/help/functions_c.html
+++ b/docs/help/functions_c.html
@@ -66,7 +66,7 @@ <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
 <li>Converted()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#abe0f8052422a9ae8e345047cf8b3f507">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#aa16370f5c133aaea1b97d0d2f4bbff27">ImageFrame</a></li>
 <li>Convolutional&#160;:&#160;<a class="el" href="class_simd_1_1_neural_1_1_layer.html#a1d1cfd8ffb84e947f82999c682b666a7aaa989c15b3dc3297b193838d82a8e78a">Layer</a></li>
 <li>ConvolutionalLayer()&#160;:&#160;<a class="el" href="class_simd_1_1_neural_1_1_convolutional_layer.html#aba0a65497596dca8988b52afa2631302">ConvolutionalLayer</a></li>
-<li>Copy()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#ad858a665ec039a7244000fe205b20a38">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#ad858a665ec039a7244000fe205b20a38">ImageFrame</a>, <a class="el" href="class_simd_1_1_lib.html#a0ad025ad583c0ed2d10d4018120a0ae0">Lib</a></li>
+<li>Copy()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#ad858a665ec039a7244000fe205b20a38">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#ad858a665ec039a7244000fe205b20a38">ImageFrame</a>, <a class="el" href="class_simd_1_1_lib.html#a0ad025ad583c0ed2d10d4018120a0ae0">Lib</a>, <a class="el" href="struct_simd_1_1_frame.html#a71bdb44bdab2a0046236a5960ebb6f57">Frame&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#abdf8e3c2428064c7e53f33276573fe28">View&lt; A &gt;</a></li>
 <li>CopyToNumpyArray()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#a4308ab7670c78de9a4d082e2fa6445a5">Image</a></li>
 <li>Cores&#160;:&#160;<a class="el" href="class_simd_1_1_cpu_info.html#a1216f52ab7c9fbabfbf329bbdd0b77a2">CpuInfo</a></li>
 <li>Correlation()&#160;:&#160;<a class="el" href="struct_simd_1_1_shift_detector.html#a32605901972ea028983560327eba39df">ShiftDetector&lt; A &gt;</a></li>
diff --git a/docs/help/functions_f.html b/docs/help/functions_f.html
index db319c096d..9ca25f630f 100644
--- a/docs/help/functions_f.html
+++ b/docs/help/functions_f.html
@@ -46,9 +46,8 @@ <h3><a id="index_f" name="index_f"></a>- f -</h3><ul>
 <li>Fill()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#a1e5c43add1928245417c2fe1612840d2">Image</a></li>
 <li>FillPixel()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#ac00ff54734904ddd7fabc0d654a8cb34">Lib</a></li>
 <li>Find()&#160;:&#160;<a class="el" href="struct_simd_1_1_image_matcher.html#adceb82495ae6f02380aa5ae23985962f">ImageMatcher&lt; Tag, Allocator &gt;</a></li>
-<li>Flipped()&#160;:&#160;<a class="el" href="struct_simd_1_1_frame.html#a61e18602057b6f1ec5db7c50474fe750">Frame&lt; A &gt;</a></li>
 <li>flipped&#160;:&#160;<a class="el" href="struct_simd_1_1_frame.html#afad0f359fd4d0fafd6cc51d58dc53ba6">Frame&lt; A &gt;</a></li>
-<li>Flipped()&#160;:&#160;<a class="el" href="struct_simd_1_1_view.html#aaa5a2e80819f2a24a7fd16b27ba0375b">View&lt; A &gt;</a></li>
+<li>Flipped()&#160;:&#160;<a class="el" href="struct_simd_1_1_frame.html#a61e18602057b6f1ec5db7c50474fe750">Frame&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#aaa5a2e80819f2a24a7fd16b27ba0375b">View&lt; A &gt;</a></li>
 <li>Float&#160;:&#160;<a class="el" href="class_simd_1_1_pixel_format.html#ac95aad531a0cbeb740b911227f8a7922">PixelFormat</a>, <a class="el" href="class_simd_1_1_resize_channel.html#ac95aad531a0cbeb740b911227f8a7922">ResizeChannel</a>, <a class="el" href="struct_simd_1_1_view.html#ab4e88c89b3b7ea1735996cc4def22d58ad67b0ee7230dcecb610254e4e5e589cd">View&lt; A &gt;</a></li>
 <li>Font()&#160;:&#160;<a class="el" href="class_simd_1_1_font.html#a95a3325298a1f2d1fb86dbf7e82de886">Font</a></li>
 <li>FontDraw()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a457c5d70a59283e56534fcf06ed7b69c">Lib</a></li>
@@ -56,9 +55,9 @@ <h3><a id="index_f" name="index_f"></a>- f -</h3><ul>
 <li>FontInit()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a3abf48e85d41d09b658ea991b00bd549">Lib</a></li>
 <li>FontMeasure()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#ac0e93effaa2174914f3d21f060d4e6ea">Lib</a></li>
 <li>FontResize()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#ac4dbe9aa171d1b34020fd88997eed239">Lib</a></li>
-<li>Format()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#a6d54ca9316b15cfeea34ee4cc5abcacd">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#a9cdc2ac011f752e929d4fbee711eeb52">ImageFrame</a></li>
+<li>Format()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#a6d54ca9316b15cfeea34ee4cc5abcacd">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#a9cdc2ac011f752e929d4fbee711eeb52">ImageFrame</a>, <a class="el" href="struct_simd_1_1_frame.html#ab4e88c89b3b7ea1735996cc4def22d58">Frame&lt; A &gt;</a></li>
 <li>format&#160;:&#160;<a class="el" href="struct_simd_1_1_frame.html#a7cf7e7bed8ea97ad270803076c14f83f">Frame&lt; A &gt;</a></li>
-<li>Format&#160;:&#160;<a class="el" href="struct_simd_1_1_frame.html#ab4e88c89b3b7ea1735996cc4def22d58">Frame&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#ab4e88c89b3b7ea1735996cc4def22d58">View&lt; A &gt;</a></li>
+<li>Format&#160;:&#160;<a class="el" href="struct_simd_1_1_view.html#ab4e88c89b3b7ea1735996cc4def22d58">View&lt; A &gt;</a></li>
 <li>format&#160;:&#160;<a class="el" href="struct_simd_1_1_view.html#a7cf7e7bed8ea97ad270803076c14f83f">View&lt; A &gt;</a></li>
 <li>FP16&#160;:&#160;<a class="el" href="class_simd_1_1_tensor_data.html#a6dbe46c241ac492ca3b41f55a9d32929">TensorData</a></li>
 <li>FP32&#160;:&#160;<a class="el" href="class_simd_1_1_tensor_data.html#a129c9696cdaa67046efc1c1f84216faa">TensorData</a></li>
diff --git a/docs/help/functions_func_c.html b/docs/help/functions_func_c.html
index 3f8922bfce..c674e4eb40 100644
--- a/docs/help/functions_func_c.html
+++ b/docs/help/functions_func_c.html
@@ -54,7 +54,7 @@ <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
 <li>Convert()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#af1a4caea357e7762aa28aa394c072229">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#af1a4caea357e7762aa28aa394c072229">ImageFrame</a>, <a class="el" href="class_simd_1_1_neural_1_1_network.html#aa5e72de9b473a7f84b163facdfee7eaa">Network</a></li>
 <li>Converted()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#abe0f8052422a9ae8e345047cf8b3f507">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#aa16370f5c133aaea1b97d0d2f4bbff27">ImageFrame</a></li>
 <li>ConvolutionalLayer()&#160;:&#160;<a class="el" href="class_simd_1_1_neural_1_1_convolutional_layer.html#aba0a65497596dca8988b52afa2631302">ConvolutionalLayer</a></li>
-<li>Copy()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#ad858a665ec039a7244000fe205b20a38">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#ad858a665ec039a7244000fe205b20a38">ImageFrame</a>, <a class="el" href="class_simd_1_1_lib.html#a0ad025ad583c0ed2d10d4018120a0ae0">Lib</a></li>
+<li>Copy()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#ad858a665ec039a7244000fe205b20a38">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#ad858a665ec039a7244000fe205b20a38">ImageFrame</a>, <a class="el" href="class_simd_1_1_lib.html#a0ad025ad583c0ed2d10d4018120a0ae0">Lib</a>, <a class="el" href="struct_simd_1_1_frame.html#a71bdb44bdab2a0046236a5960ebb6f57">Frame&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#abdf8e3c2428064c7e53f33276573fe28">View&lt; A &gt;</a></li>
 <li>CopyToNumpyArray()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#a4308ab7670c78de9a4d082e2fa6445a5">Image</a></li>
 <li>Correlation()&#160;:&#160;<a class="el" href="struct_simd_1_1_shift_detector.html#a32605901972ea028983560327eba39df">ShiftDetector&lt; A &gt;</a></li>
 <li>CpuDesc()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a6960a531e693889e878108cf5c838f1f">Lib</a></li>
diff --git a/docs/help/functions_i.html b/docs/help/functions_i.html
index 142d087858..af6514f879 100644
--- a/docs/help/functions_i.html
+++ b/docs/help/functions_i.html
@@ -49,8 +49,8 @@ <h3><a id="index_i" name="index_i"></a>- i -</h3><ul>
 <li>Index()&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_index.html#aea3bf3bb46b822f1538ce92e9701092c">Index</a></li>
 <li>Init()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a4f64f4c09c7a335104ecb50805b2acd0">Lib</a>, <a class="el" href="struct_simd_1_1_contour_detector.html#ae8b46bd2d30fbbd3c537e83769d21650">ContourDetector&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_detection.html#ac14257967700e9bd098079663e296b06">Detection&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_image_matcher.html#af86c6e012c0e859f91562693eca2a802">ImageMatcher&lt; Tag, Allocator &gt;</a></li>
 <li>InitBuffers()&#160;:&#160;<a class="el" href="struct_simd_1_1_shift_detector.html#a3d7a25cb6a9f5e4e593fd4dd2abacaef">ShiftDetector&lt; A &gt;</a></li>
-<li>initType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a2da3b34550c0f22ad4d32f9ed7ccb0dd">TrainOptions</a></li>
 <li>InitType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a8b039e6ef27959d7fc6119161905f257">TrainOptions</a></li>
+<li>initType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a2da3b34550c0f22ad4d32f9ed7ccb0dd">TrainOptions</a></li>
 <li>Input&#160;:&#160;<a class="el" href="class_simd_1_1_neural_1_1_layer.html#a1d1cfd8ffb84e947f82999c682b666a7abd62348391d75c7fc8c130ea346cba29">Layer</a></li>
 <li>InputIndex()&#160;:&#160;<a class="el" href="class_simd_1_1_neural_1_1_network.html#a46f1cafdcb8a99fb523f0ebda58eab2d">Network</a></li>
 <li>Int16&#160;:&#160;<a class="el" href="class_simd_1_1_pixel_format.html#adbb09bb692b7a29edd1bffbe1a7f26e8">PixelFormat</a>, <a class="el" href="struct_simd_1_1_view.html#ab4e88c89b3b7ea1735996cc4def22d58a150558aeaa819431aeb9729d26b2ac9f">View&lt; A &gt;</a></li>
diff --git a/docs/help/functions_l.html b/docs/help/functions_l.html
index f376887b85..54562c9b02 100644
--- a/docs/help/functions_l.html
+++ b/docs/help/functions_l.html
@@ -49,8 +49,8 @@ <h3><a id="index_l" name="index_l"></a>- l -</h3><ul>
 <li>lightness&#160;:&#160;<a class="el" href="struct_simd_1_1_pixel_1_1_hsl24.html#a43f5faf8e90c54f765dece4a5bd94fa5">Hsl24</a></li>
 <li>Load()&#160;:&#160;<a class="el" href="class_simd_1_1_image.html#a7bd94609598d2e38586a07aaa5b0cf82">Image</a>, <a class="el" href="class_simd_1_1_image_frame.html#a2d018f553d4fda60eef55c5aa1ca76ae">ImageFrame</a>, <a class="el" href="struct_simd_1_1_detection.html#a07d0f60286f2f852eba0d42093437603">Detection&lt; A &gt;</a>, <a class="el" href="class_simd_1_1_neural_1_1_network.html#abab7d7ec110a04877974ed615a2bdc0c">Network</a>, <a class="el" href="struct_simd_1_1_view.html#ad598aa205cbea4bf98500b13a3fffb91">View&lt; A &gt;</a></li>
 <li>LoadStringXml()&#160;:&#160;<a class="el" href="struct_simd_1_1_detection.html#a7d150b1064d2fe666fb227d3871c99bc">Detection&lt; A &gt;</a></li>
-<li>lossType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a0aeec20c2050f38ae0fd632d1c514320">TrainOptions</a></li>
 <li>LossType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a771478c1684e7f5452b99079005e68c2">TrainOptions</a></li>
+<li>lossType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a0aeec20c2050f38ae0fd632d1c514320">TrainOptions</a></li>
 </ul>
 </div><!-- contents -->
 <div id="nav-path" class="navpath">
diff --git a/docs/help/functions_r.html b/docs/help/functions_r.html
index 87a2c5aa26..316add1203 100644
--- a/docs/help/functions_r.html
+++ b/docs/help/functions_r.html
@@ -69,8 +69,8 @@ <h3><a id="index_r" name="index_r"></a>- r -</h3><ul>
 <li>RgbaToGray()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a5b351988ae369913802a4714f8724c90">Lib</a></li>
 <li>RgbToBgra()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a27357bc1b0d8924d5ac380199dd485c0">Lib</a></li>
 <li>RgbToGray()&#160;:&#160;<a class="el" href="class_simd_1_1_lib.html#a7c30e29b63fb61b5aac0681d28c78d7f">Lib</a></li>
-<li>right&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#ab57f2e6bd99f83c84663dec0b05d5e7d">Rectangle&lt; T &gt;</a></li>
 <li>Right()&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#a31c73d5c402c97324953b0019fa5c886">Rectangle&lt; T &gt;</a></li>
+<li>right&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#ab57f2e6bd99f83c84663dec0b05d5e7d">Rectangle&lt; T &gt;</a></li>
 <li>roi&#160;:&#160;<a class="el" href="struct_simd_1_1_motion_1_1_model.html#a437c586204cccc6b1d47d68dd12b5356">Model</a></li>
 <li>Row()&#160;:&#160;<a class="el" href="struct_simd_1_1_view.html#aec6a25e5d2ceaf42ef200850ff0fb8d0">View&lt; A &gt;</a></li>
 </ul>
diff --git a/docs/help/functions_t.html b/docs/help/functions_t.html
index 45fe30af28..089b6a23da 100644
--- a/docs/help/functions_t.html
+++ b/docs/help/functions_t.html
@@ -56,9 +56,8 @@ <h3><a id="index_t" name="index_t"></a>- t -</h3><ul>
 <li>Timestamp()&#160;:&#160;<a class="el" href="class_simd_1_1_image_frame.html#ab1381992b0e0ae3309b5f4c78313a44e">ImageFrame</a></li>
 <li>timestamp&#160;:&#160;<a class="el" href="struct_simd_1_1_frame.html#a2c17dfa2b3239f2312e94b9de57a7999">Frame&lt; A &gt;</a></li>
 <li>ToOcv()&#160;:&#160;<a class="el" href="struct_simd_1_1_view.html#aab00e01e72c1a8a04f7ef4944447bac7">View&lt; A &gt;</a></li>
-<li>Top()&#160;:&#160;<a class="el" href="struct_simd_1_1_pyramid.html#a3ab3ba0bf6587e2cace0ed3bc28ce5af">Pyramid&lt; A &gt;</a></li>
+<li>Top()&#160;:&#160;<a class="el" href="struct_simd_1_1_pyramid.html#a3ab3ba0bf6587e2cace0ed3bc28ce5af">Pyramid&lt; A &gt;</a>, <a class="el" href="struct_simd_1_1_rectangle.html#abaaf7f51d822e95a56294af0c27658aa">Rectangle&lt; T &gt;</a></li>
 <li>top&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#a840a6e82bf789f4846374af9e5578332">Rectangle&lt; T &gt;</a></li>
-<li>Top()&#160;:&#160;<a class="el" href="struct_simd_1_1_rectangle.html#abaaf7f51d822e95a56294af0c27658aa">Rectangle&lt; T &gt;</a></li>
 <li>TopCenter&#160;:&#160;<a class="el" href="class_simd_1_1_position.html#aa94ab0066182fefd066f47a4b5e15ec2">Position</a>, <a class="el" href="struct_simd_1_1_view.html#ab91b34ae619fcdfcba4522b4f335bf83a2d14352c6744d2f724c5e82f7657259f">View&lt; A &gt;</a></li>
 <li>TopLeft&#160;:&#160;<a class="el" href="class_simd_1_1_position.html#a8bcfa20ce6896edfe82ea54a9066d80e">Position</a>, <a class="el" href="struct_simd_1_1_rectangle.html#a9e5e62cba793931779090ba4298bbd29">Rectangle&lt; T &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#ab91b34ae619fcdfcba4522b4f335bf83a61f66ddc6702462a94d3e231f02b9017">View&lt; A &gt;</a></li>
 <li>TopRight&#160;:&#160;<a class="el" href="class_simd_1_1_position.html#a89cd4d2f7982a85d52f31a6843bf435b">Position</a>, <a class="el" href="struct_simd_1_1_rectangle.html#a44b95b02402e924e587f009e1096eff9">Rectangle&lt; T &gt;</a>, <a class="el" href="struct_simd_1_1_view.html#ab91b34ae619fcdfcba4522b4f335bf83a7e42a96f07eab63a8c9fa8a0526f34f4">View&lt; A &gt;</a></li>
diff --git a/docs/help/functions_u.html b/docs/help/functions_u.html
index aa8a91ce15..ed9721eb9e 100644
--- a/docs/help/functions_u.html
+++ b/docs/help/functions_u.html
@@ -47,8 +47,8 @@ <h3><a id="index_u" name="index_u"></a>- u -</h3><ul>
 <li>Undefined&#160;:&#160;<a class="el" href="class_simd_1_1_image_file.html#a49e9acb48005930ddcd635c93765874f">ImageFile</a></li>
 <li>UNDEFINED_OBJECT_TAG&#160;:&#160;<a class="el" href="struct_simd_1_1_detection.html#a396b8bd09ae7cf22fcf825c96b8a0000">Detection&lt; A &gt;</a></li>
 <li>Unknown&#160;:&#160;<a class="el" href="class_simd_1_1_tensor_data.html#af88d424125c9ab94ab8b125a61ca552c">TensorData</a>, <a class="el" href="class_simd_1_1_tensor_format.html#af88d424125c9ab94ab8b125a61ca552c">TensorFormat</a>, <a class="el" href="class_simd_1_1_yuv_type.html#af88d424125c9ab94ab8b125a61ca552c">YuvType</a></li>
-<li>UpdateType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a0957e0df001dd796895b75f947daddb2">TrainOptions</a></li>
 <li>updateType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a83a9cdb86da648dadae6a4e2ed51d505">TrainOptions</a></li>
+<li>UpdateType&#160;:&#160;<a class="el" href="struct_simd_1_1_neural_1_1_train_options.html#a0957e0df001dd796895b75f947daddb2">TrainOptions</a></li>
 <li>Uv16&#160;:&#160;<a class="el" href="class_simd_1_1_pixel_format.html#aa2f1abcba57123bc2a714dc23e42df52">PixelFormat</a>, <a class="el" href="struct_simd_1_1_view.html#ab4e88c89b3b7ea1735996cc4def22d58aeaf015fa66bcf10b637a1c67680d024f">View&lt; A &gt;</a></li>
 <li>Uyvy16&#160;:&#160;<a class="el" href="class_simd_1_1_pixel_format.html#a1a0c1cbdeb4a3b7d281026973b4b667f">PixelFormat</a>, <a class="el" href="struct_simd_1_1_view.html#ab4e88c89b3b7ea1735996cc4def22d58ae28b7f6eca80c0965f9be5e44ac085a5">View&lt; A &gt;</a></li>
 </ul>
diff --git a/docs/help/group__cpu__flags.html b/docs/help/group__cpu__flags.html
index fbcf148ab4..ed73bf0726 100644
--- a/docs/help/group__cpu__flags.html
+++ b/docs/help/group__cpu__flags.html
@@ -50,24 +50,43 @@ <h1>Simd Library Documentation.</h1>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:ga622a23654c42d4453321dce478406455"><td class="memItemLeft" align="right" valign="top"><a id="ga622a23654c42d4453321dce478406455" name="ga622a23654c42d4453321dce478406455"></a>
-SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><b>SimdEmpty</b> (void)</td></tr>
-<tr class="memdesc:ga622a23654c42d4453321dce478406455"><td class="mdescLeft">&#160;</td><td class="mdescRight">Clears MMX registers (runs EMMS instruction). It is x86 specific functionality. <br /></td></tr>
+<tr class="memitem:ga622a23654c42d4453321dce478406455"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__cpu__flags.html#ga622a23654c42d4453321dce478406455">SimdEmpty</a> (void)</td></tr>
+<tr class="memdesc:ga622a23654c42d4453321dce478406455"><td class="mdescLeft">&#160;</td><td class="mdescRight">Clears MMX state for x86 SIMD code paths.  <a href="group__cpu__flags.html#ga622a23654c42d4453321dce478406455">More...</a><br /></td></tr>
 <tr class="separator:ga622a23654c42d4453321dce478406455"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gad032143094f20a39d0fa7abb7a3a4fbc"><td class="memItemLeft" align="right" valign="top">SIMD_API <a class="el" href="group__c__types.html#ga128437633efebd89ca4bde565dcf5627">SimdBool</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__cpu__flags.html#gad032143094f20a39d0fa7abb7a3a4fbc">SimdGetFastMode</a> (void)</td></tr>
-<tr class="memdesc:gad032143094f20a39d0fa7abb7a3a4fbc"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets current CPU Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) flags. It is used in order to process subnormal numbers.  <a href="group__cpu__flags.html#gad032143094f20a39d0fa7abb7a3a4fbc">More...</a><br /></td></tr>
+<tr class="memdesc:gad032143094f20a39d0fa7abb7a3a4fbc"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets the current 'fast mode' state for floating-point subnormal handling.  <a href="group__cpu__flags.html#gad032143094f20a39d0fa7abb7a3a4fbc">More...</a><br /></td></tr>
 <tr class="separator:gad032143094f20a39d0fa7abb7a3a4fbc"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gad98c95021e9fcf3a06d59d29ba4906c3"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__cpu__flags.html#gad98c95021e9fcf3a06d59d29ba4906c3">SimdSetFastMode</a> (<a class="el" href="group__c__types.html#ga128437633efebd89ca4bde565dcf5627">SimdBool</a> value)</td></tr>
-<tr class="memdesc:gad98c95021e9fcf3a06d59d29ba4906c3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Sets current CPU Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) flags. It is used in order to process subnormal numbers.  <a href="group__cpu__flags.html#gad98c95021e9fcf3a06d59d29ba4906c3">More...</a><br /></td></tr>
+<tr class="memdesc:gad98c95021e9fcf3a06d59d29ba4906c3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Sets the current thread's 'fast mode' state for floating-point subnormal handling.  <a href="group__cpu__flags.html#gad98c95021e9fcf3a06d59d29ba4906c3">More...</a><br /></td></tr>
 <tr class="separator:gad98c95021e9fcf3a06d59d29ba4906c3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga0308ab59c965b9e264a377866123cfeb"><td class="memItemLeft" align="right" valign="top"><a id="ga0308ab59c965b9e264a377866123cfeb" name="ga0308ab59c965b9e264a377866123cfeb"></a>
-SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><b>SimdSetAmxFull</b> (void)</td></tr>
-<tr class="memdesc:ga0308ab59c965b9e264a377866123cfeb"><td class="mdescLeft">&#160;</td><td class="mdescRight">Set configuration of AMX registers to maximum size. It is x86 specific functionality. Affect only on CPU with AMX support. <br /></td></tr>
+<tr class="memitem:ga0308ab59c965b9e264a377866123cfeb"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__cpu__flags.html#ga0308ab59c965b9e264a377866123cfeb">SimdSetAmxFull</a> (void)</td></tr>
+<tr class="memdesc:ga0308ab59c965b9e264a377866123cfeb"><td class="mdescLeft">&#160;</td><td class="mdescRight">Loads the full AMX tile configuration for the current thread.  <a href="group__cpu__flags.html#ga0308ab59c965b9e264a377866123cfeb">More...</a><br /></td></tr>
 <tr class="separator:ga0308ab59c965b9e264a377866123cfeb"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
 <p >Functions for CPU flags management. </p>
 <h2 class="groupheader">Function Documentation</h2>
+<a id="ga622a23654c42d4453321dce478406455" name="ga622a23654c42d4453321dce478406455"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ga622a23654c42d4453321dce478406455">&#9670;&#160;</a></span>SimdEmpty()</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">void SimdEmpty </td>
+          <td>(</td>
+          <td class="paramtype">void&#160;</td>
+          <td class="paramname"></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Clears MMX state for x86 SIMD code paths. </p>
+<p >On supported x86 builds this function executes <code>EMMS</code> (via <code>_mm_empty()</code>) when the SSE4.1 backend is enabled at runtime. In other configurations it does nothing. </p>
+
+</div>
+</div>
 <a id="gad032143094f20a39d0fa7abb7a3a4fbc" name="gad032143094f20a39d0fa7abb7a3a4fbc"></a>
 <h2 class="memtitle"><span class="permalink"><a href="#gad032143094f20a39d0fa7abb7a3a4fbc">&#9670;&#160;</a></span>SimdGetFastMode()</h2>
 
@@ -84,8 +103,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad032143094f20a39d0fa7ab
       </table>
 </div><div class="memdoc">
 
-<p>Gets current CPU Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) flags. It is used in order to process subnormal numbers. </p>
-<dl class="section return"><dt>Returns</dt><dd>current 'fast' mode. </dd></dl>
+<p>Gets the current 'fast mode' state for floating-point subnormal handling. </p>
+<p >When 'fast mode' is active, subnormal (denormalized) floating-point values are flushed to zero by the hardware rather than being processed normally, which avoids the significant performance penalty of software-assisted denormal handling.</p>
+<p >On x86 platforms with SSE4.1 support, reads the MXCSR register and returns <code>SimdTrue</code> when either the Flush-To-Zero (FTZ, bit 15) or the Denormals-Are-Zero (DAZ, bit 6) bit is set. On ARM platforms with Neon support, reads the FPSCR (AArch32) or FPCR (AArch64) register and returns <code>SimdTrue</code> when the Flush-To-Zero (FTZ, bit 24) bit is set. On platforms without hardware support for this feature, always returns <code>SimdFalse</code>.</p>
+<dl class="section return"><dt>Returns</dt><dd><code>SimdTrue</code> if fast mode is currently enabled, <code>SimdFalse</code> otherwise. </dd></dl>
 
 </div>
 </div>
@@ -105,14 +126,38 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad98c95021e9fcf3a06d59d2
       </table>
 </div><div class="memdoc">
 
-<p>Sets current CPU Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) flags. It is used in order to process subnormal numbers. </p>
+<p>Sets the current thread's 'fast mode' state for floating-point subnormal handling. </p>
+<p >When 'fast mode' is enabled, subnormal (denormalized) floating-point values are flushed to zero by the hardware rather than being processed normally, which avoids the significant performance penalty of software-assisted denormal handling.</p>
+<p >On x86 platforms with SSE4.1 support, sets or clears both the Flush-To-Zero (FTZ, bit 15) and the Denormals-Are-Zero (DAZ, bit 6) bits in the current thread's MXCSR register. On ARM platforms with Neon support, sets or clears the Flush-To-Zero (FTZ, bit 24) bit in the current thread's FPSCR (AArch32) or FPCR (AArch64) register. Has no effect when this feature is not supported or the corresponding SIMD backend is not enabled at runtime.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">value</td><td>- a value of 'fast' mode. </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">value</td><td>- <code>SimdTrue</code> to enable fast mode, <code>SimdFalse</code> to disable it. </td></tr>
   </table>
   </dd>
 </dl>
 
+</div>
+</div>
+<a id="ga0308ab59c965b9e264a377866123cfeb" name="ga0308ab59c965b9e264a377866123cfeb"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ga0308ab59c965b9e264a377866123cfeb">&#9670;&#160;</a></span>SimdSetAmxFull()</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">void SimdSetAmxFull </td>
+          <td>(</td>
+          <td class="paramtype">void&#160;</td>
+          <td class="paramname"></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Loads the full AMX tile configuration for the current thread. </p>
+<p >On x86 platforms with AMX-BF16 support, this function forces loading of a predefined full AMX tile configuration into the current thread by calling the AMX tile configuration instruction. It is intended for code paths that need the full tile layout used by the library.</p>
+<p >Has no effect on platforms without AMX-BF16 support or when the corresponding SIMD backend is not enabled at runtime. </p>
+
 </div>
 </div>
 </div><!-- contents -->
diff --git a/docs/help/group__matrix.html b/docs/help/group__matrix.html
index 45382b0d3a..4fb5281f1b 100644
--- a/docs/help/group__matrix.html
+++ b/docs/help/group__matrix.html
@@ -142,7 +142,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga06665ffcdef9dddd71487d5
 
 <p>Performs general matrix multiplication (for 32-bit float numbers). </p>
 <pre class="fragment">C(M, N) = alpha*A(M, K)*B(K, N) + beta*C(M, N);
-</pre><dl class="section note"><dt>Note</dt><dd>This function supports multithreading (See functions <a class="el" href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c" title="Gets number of threads used by Simd Library to parallelize some algorithms.">SimdGetThreadNumber</a> and <a class="el" href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f" title="Sets number of threads used by Simd Library to parallelize some algorithms.">SimdSetThreadNumber</a>).</dd></dl>
+</pre><dl class="section note"><dt>Note</dt><dd>This function supports multithreading (See functions <a class="el" href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c" title="Gets current global thread number configured for Simd Library parallel algorithms.">SimdGetThreadNumber</a> and <a class="el" href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f" title="Sets number of threads used by Simd Library to parallelize some algorithms.">SimdSetThreadNumber</a>).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">M</td><td>- a height of A and height of C matrices. </td></tr>
@@ -244,7 +244,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga9c756433a5ff9d740a88135
 
 <p>Performs general matrix multiplication (for 32-bit float numbers). </p>
 <pre class="fragment">C(M, N) = alpha*A(M, K)*Trans(B(N, K)) + beta*C(M, N);
-</pre><dl class="section note"><dt>Note</dt><dd>This function supports multithreading (See functions <a class="el" href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c" title="Gets number of threads used by Simd Library to parallelize some algorithms.">SimdGetThreadNumber</a> and <a class="el" href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f" title="Sets number of threads used by Simd Library to parallelize some algorithms.">SimdSetThreadNumber</a>).</dd></dl>
+</pre><dl class="section note"><dt>Note</dt><dd>This function supports multithreading (See functions <a class="el" href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c" title="Gets current global thread number configured for Simd Library parallel algorithms.">SimdGetThreadNumber</a> and <a class="el" href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f" title="Sets number of threads used by Simd Library to parallelize some algorithms.">SimdSetThreadNumber</a>).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">M</td><td>- a height of A and height of C matrices. </td></tr>
diff --git a/docs/help/group__thread.html b/docs/help/group__thread.html
index 51e7a79cfa..b9d0ddb3e8 100644
--- a/docs/help/group__thread.html
+++ b/docs/help/group__thread.html
@@ -51,7 +51,7 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:ga19459720cf01b21698a32dcc8a55eb6c"><td class="memItemLeft" align="right" valign="top">SIMD_API size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c">SimdGetThreadNumber</a> (void)</td></tr>
-<tr class="memdesc:ga19459720cf01b21698a32dcc8a55eb6c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets number of threads used by <a class="el" href="namespace_simd.html">Simd</a> Library to parallelize some algorithms.  <a href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c">More...</a><br /></td></tr>
+<tr class="memdesc:ga19459720cf01b21698a32dcc8a55eb6c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets current global thread number configured for <a class="el" href="namespace_simd.html">Simd</a> Library parallel algorithms.  <a href="group__thread.html#ga19459720cf01b21698a32dcc8a55eb6c">More...</a><br /></td></tr>
 <tr class="separator:ga19459720cf01b21698a32dcc8a55eb6c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga604c2f8bafd54c63bf7734c6d3da085f"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f">SimdSetThreadNumber</a> (size_t threadNumber)</td></tr>
 <tr class="memdesc:ga604c2f8bafd54c63bf7734c6d3da085f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Sets number of threads used by <a class="el" href="namespace_simd.html">Simd</a> Library to parallelize some algorithms.  <a href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f">More...</a><br /></td></tr>
@@ -76,8 +76,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga19459720cf01b21698a32dc
       </table>
 </div><div class="memdoc">
 
-<p>Gets number of threads used by <a class="el" href="namespace_simd.html">Simd</a> Library to parallelize some algorithms. </p>
-<dl class="section return"><dt>Returns</dt><dd>current thread number. </dd></dl>
+<p>Gets current global thread number configured for <a class="el" href="namespace_simd.html">Simd</a> Library parallel algorithms. </p>
+<p >Returns the value set by <a class="el" href="group__thread.html#ga604c2f8bafd54c63bf7734c6d3da085f" title="Sets number of threads used by Simd Library to parallelize some algorithms.">SimdSetThreadNumber</a>. By default this value is <code>1</code>. When set, it is restricted to the range <code></code>[1, std::thread::hardware_concurrency()].</p>
+<dl class="section return"><dt>Returns</dt><dd>current configured thread number. </dd></dl>
 
 </div>
 </div>
diff --git a/docs/help/struct_simd_1_1_detection.html b/docs/help/struct_simd_1_1_detection.html
index e0f42170b5..c24c6897af 100644
--- a/docs/help/struct_simd_1_1_detection.html
+++ b/docs/help/struct_simd_1_1_detection.html
@@ -139,9 +139,9 @@ <h1>Simd Library Documentation.</h1>
 <div class="ttc" id="astruct_simd_1_1_detection_html_a5a9fe5380e859581fe00c9952f4c540a"><div class="ttname"><a href="struct_simd_1_1_detection.html#a5a9fe5380e859581fe00c9952f4c540a">Simd::Detection::Detection</a></div><div class="ttdeci">Detection()</div><div class="ttdef"><b>Definition:</b> SimdDetection.hpp:211</div></div>
 <div class="ttc" id="astruct_simd_1_1_detection_html_ac14257967700e9bd098079663e296b06"><div class="ttname"><a href="struct_simd_1_1_detection.html#ac14257967700e9bd098079663e296b06">Simd::Detection::Init</a></div><div class="ttdeci">bool Init(const Size &amp;imageSize, double scaleFactor=1.1, const Size &amp;sizeMin=Size(0, 0), const Size &amp;sizeMax=Size(INT_MAX, INT_MAX), const View &amp;roi=View(), ptrdiff_t threadNumber=-1)</div><div class="ttdef"><b>Definition:</b> SimdDetection.hpp:290</div></div>
 <div class="ttc" id="astruct_simd_1_1_view_html"><div class="ttname"><a href="struct_simd_1_1_view.html">Simd::View</a></div><div class="ttdoc">The View structure provides storage and manipulation of images.</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:70</div></div>
-<div class="ttc" id="astruct_simd_1_1_view_html_abcf72a236be325ea849b737d50ebaa57"><div class="ttname"><a href="struct_simd_1_1_view.html#abcf72a236be325ea849b737d50ebaa57">Simd::View::Size</a></div><div class="ttdeci">Point&lt; ptrdiff_t &gt; Size() const</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:1076</div></div>
-<div class="ttc" id="astruct_simd_1_1_view_html_ad598aa205cbea4bf98500b13a3fffb91"><div class="ttname"><a href="struct_simd_1_1_view.html#ad598aa205cbea4bf98500b13a3fffb91">Simd::View::Load</a></div><div class="ttdeci">bool Load(const std::string &amp;path, Format format=None)</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:1283</div></div>
-<div class="ttc" id="astruct_simd_1_1_view_html_af67086a33d893d3b1b944ae821a61a8c"><div class="ttname"><a href="struct_simd_1_1_view.html#af67086a33d893d3b1b944ae821a61a8c">Simd::View::Save</a></div><div class="ttdeci">bool Save(const std::string &amp;path, SimdImageFileType type=SimdImageFileUndefined, int quality=100) const</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:1307</div></div>
+<div class="ttc" id="astruct_simd_1_1_view_html_abcf72a236be325ea849b737d50ebaa57"><div class="ttname"><a href="struct_simd_1_1_view.html#abcf72a236be325ea849b737d50ebaa57">Simd::View::Size</a></div><div class="ttdeci">Point&lt; ptrdiff_t &gt; Size() const</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:1105</div></div>
+<div class="ttc" id="astruct_simd_1_1_view_html_ad598aa205cbea4bf98500b13a3fffb91"><div class="ttname"><a href="struct_simd_1_1_view.html#ad598aa205cbea4bf98500b13a3fffb91">Simd::View::Load</a></div><div class="ttdeci">bool Load(const std::string &amp;path, Format format=None)</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:1312</div></div>
+<div class="ttc" id="astruct_simd_1_1_view_html_af67086a33d893d3b1b944ae821a61a8c"><div class="ttname"><a href="struct_simd_1_1_view.html#af67086a33d893d3b1b944ae821a61a8c">Simd::View::Save</a></div><div class="ttdeci">bool Save(const std::string &amp;path, SimdImageFileType type=SimdImageFileUndefined, int quality=100) const</div><div class="ttdef"><b>Definition:</b> SimdView.hpp:1336</div></div>
 </div><!-- fragment --><p >Using example (face detection in the video captured by OpenCV): </p><div class="fragment"><div class="line"><span class="preprocessor">#include &lt;iostream&gt;</span></div>
 <div class="line"><span class="preprocessor">#include &lt;string&gt;</span></div>
 <div class="line"> </div>
diff --git a/docs/help/struct_simd_1_1_frame.html b/docs/help/struct_simd_1_1_frame.html
index 0c56a0aa63..cd195bfb3a 100644
--- a/docs/help/struct_simd_1_1_frame.html
+++ b/docs/help/struct_simd_1_1_frame.html
@@ -114,6 +114,10 @@ <h1>Simd Library Documentation.</h1>
 <tr class="separator:ab290a93e205a0a559587a8ad71448313"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aee79dc91ecfac4ab1e1698d2c9860375"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_frame.html">Frame</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_frame.html#aee79dc91ecfac4ab1e1698d2c9860375">Clone</a> (<a class="el" href="struct_simd_1_1_frame.html">Frame</a> &amp;buffer) const</td></tr>
 <tr class="separator:aee79dc91ecfac4ab1e1698d2c9860375"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a71bdb44bdab2a0046236a5960ebb6f57"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_frame.html">Frame</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_frame.html#a71bdb44bdab2a0046236a5960ebb6f57">Copy</a> () const</td></tr>
+<tr class="separator:a71bdb44bdab2a0046236a5960ebb6f57"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a59d62b2aa8db42f1dcf63697e40a1c7e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_frame.html">Frame</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_frame.html#a59d62b2aa8db42f1dcf63697e40a1c7e">Copy</a> (const <a class="el" href="struct_simd_1_1_rectangle.html">Rectangle</a>&lt; ptrdiff_t &gt; &amp;rect) const</td></tr>
+<tr class="separator:a59d62b2aa8db42f1dcf63697e40a1c7e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aeb0f5ceffbb62f990b89b200d6ce728c"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_frame.html">Frame</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_frame.html#aeb0f5ceffbb62f990b89b200d6ce728c">operator=</a> (const <a class="el" href="struct_simd_1_1_frame.html">Frame</a> &amp;frame)</td></tr>
 <tr class="separator:aeb0f5ceffbb62f990b89b200d6ce728c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a5a1fc672b62cee95f6bff6854c4aa01a"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_frame.html">Frame</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_frame.html#a5a1fc672b62cee95f6bff6854c4aa01a">operator=</a> (<a class="el" href="struct_simd_1_1_frame.html">Frame</a> &amp;&amp;frame)</td></tr>
@@ -754,6 +758,51 @@ <h2 class="memtitle"><span class="permalink"><a href="#aee79dc91ecfac4ab1e1698d2
 </dl>
 <dl class="section return"><dt>Returns</dt><dd>a pointer to the new <a class="el" href="struct_simd_1_1_frame.html" title="The Frame structure provides storage and manipulation of frames (multiplanar images).">Frame</a> structure (not owner). The user must free this pointer after usage. </dd></dl>
 
+</div>
+</div>
+<a id="a71bdb44bdab2a0046236a5960ebb6f57" name="a71bdb44bdab2a0046236a5960ebb6f57"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a71bdb44bdab2a0046236a5960ebb6f57">&#9670;&#160;</a></span>Copy() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="struct_simd_1_1_frame.html">Frame</a> Copy </td>
+          <td>(</td>
+          <td class="paramname"></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+<p >Gets a copy of current frame by value.</p>
+<dl class="section return"><dt>Returns</dt><dd>a new <a class="el" href="struct_simd_1_1_frame.html" title="The Frame structure provides storage and manipulation of frames (multiplanar images).">Frame</a> structure containing a copy of the frame. </dd></dl>
+
+</div>
+</div>
+<a id="a59d62b2aa8db42f1dcf63697e40a1c7e" name="a59d62b2aa8db42f1dcf63697e40a1c7e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a59d62b2aa8db42f1dcf63697e40a1c7e">&#9670;&#160;</a></span>Copy() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="struct_simd_1_1_frame.html">Frame</a> Copy </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="struct_simd_1_1_rectangle.html">Rectangle</a>&lt; ptrdiff_t &gt; &amp;&#160;</td>
+          <td class="paramname"><em>rect</em></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+<p >Gets a copy of region of current frame bounded by the rectangle with specified coordinates, by value.</p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramdir">[in]</td><td class="paramname">rect</td><td>- a rectangle which bounds the region. </td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>a new <a class="el" href="struct_simd_1_1_frame.html" title="The Frame structure provides storage and manipulation of frames (multiplanar images).">Frame</a> structure containing a copy of the region. </dd></dl>
+
 </div>
 </div>
 <a id="aeb0f5ceffbb62f990b89b200d6ce728c" name="aeb0f5ceffbb62f990b89b200d6ce728c"></a>
diff --git a/docs/help/struct_simd_1_1_view.html b/docs/help/struct_simd_1_1_view.html
index 2762b3820a..b5fba9c9be 100644
--- a/docs/help/struct_simd_1_1_view.html
+++ b/docs/help/struct_simd_1_1_view.html
@@ -158,6 +158,10 @@ <h1>Simd Library Documentation.</h1>
 <tr class="separator:a43a05f488e79e8cee04e7edd082b9545"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a1a1f08b1e7b639b93f9fe4cd9ec9a343"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_view.html">View</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_view.html#a1a1f08b1e7b639b93f9fe4cd9ec9a343">Clone</a> (<a class="el" href="struct_simd_1_1_view.html">View</a> &amp;buffer) const</td></tr>
 <tr class="separator:a1a1f08b1e7b639b93f9fe4cd9ec9a343"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abdf8e3c2428064c7e53f33276573fe28"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_view.html">View</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_view.html#abdf8e3c2428064c7e53f33276573fe28">Copy</a> () const</td></tr>
+<tr class="separator:abdf8e3c2428064c7e53f33276573fe28"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4c621c43751a78fb444a105a47b11420"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_view.html">View</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_view.html#a4c621c43751a78fb444a105a47b11420">Copy</a> (const <a class="el" href="struct_simd_1_1_rectangle.html">Rectangle</a>&lt; ptrdiff_t &gt; &amp;rect) const</td></tr>
+<tr class="separator:a4c621c43751a78fb444a105a47b11420"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a40a4f2d75dc5eef553b39bf929cbb4e9"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_view.html">View</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_view.html#a40a4f2d75dc5eef553b39bf929cbb4e9">operator=</a> (const <a class="el" href="struct_simd_1_1_view.html">View</a> &amp;view)</td></tr>
 <tr class="separator:a40a4f2d75dc5eef553b39bf929cbb4e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ac3e5c0345b395a3935b20f03b2ea6174"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_simd_1_1_view.html">View</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_simd_1_1_view.html#ac3e5c0345b395a3935b20f03b2ea6174">operator=</a> (<a class="el" href="struct_simd_1_1_view.html">View</a> &amp;&amp;view)</td></tr>
@@ -822,6 +826,48 @@ <h2 class="memtitle"><span class="permalink"><a href="#a1a1f08b1e7b639b93f9fe4cd
 </dl>
 <dl class="section return"><dt>Returns</dt><dd>a pointer to the new <a class="el" href="struct_simd_1_1_view.html" title="The View structure provides storage and manipulation of images.">View</a> structure (not owner). The user must free this pointer after usage. </dd></dl>
 
+</div>
+</div>
+<a id="abdf8e3c2428064c7e53f33276573fe28" name="abdf8e3c2428064c7e53f33276573fe28"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abdf8e3c2428064c7e53f33276573fe28">&#9670;&#160;</a></span>Copy() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">SIMD_INLINE <a class="el" href="struct_simd_1_1_view.html">View</a>&lt; A &gt; Copy</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+<p >Gets a copy of current image view by value.</p>
+<dl class="section return"><dt>Returns</dt><dd>a new <a class="el" href="struct_simd_1_1_view.html" title="The View structure provides storage and manipulation of images.">View</a> structure containing a copy of the image. </dd></dl>
+
+</div>
+</div>
+<a id="a4c621c43751a78fb444a105a47b11420" name="a4c621c43751a78fb444a105a47b11420"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4c621c43751a78fb444a105a47b11420">&#9670;&#160;</a></span>Copy() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">SIMD_INLINE <a class="el" href="struct_simd_1_1_view.html">View</a>&lt; A &gt; Copy </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="struct_simd_1_1_rectangle.html">Rectangle</a>&lt; ptrdiff_t &gt; &amp;&#160;</td>
+          <td class="paramname"><em>rect</em></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+<p >Gets a copy of region of current image view bounded by the rectangle with specified coordinates, by value.</p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramdir">[in]</td><td class="paramname">rect</td><td>- a rectangle which bounds the region. </td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>a new <a class="el" href="struct_simd_1_1_view.html" title="The View structure provides storage and manipulation of images.">View</a> structure containing a copy of the region. </dd></dl>
+
 </div>
 </div>
 <a id="a40a4f2d75dc5eef553b39bf929cbb4e9" name="a40a4f2d75dc5eef553b39bf929cbb4e9"></a>

From 03546b65f125d36fa8783a855edd26f08e160ffd Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 25 May 2026 16:43:14 +0300
Subject: [PATCH 26/32] +add NEON optimizations of function Crc32c.

---
 docs/2026.html                  |  1 +
 docs/index.html                 |  1 +
 prj/cmake/arm.cmake             |  6 +++-
 prj/vs2022/Neon.vcxproj         |  1 +
 prj/vs2022/Neon.vcxproj.filters |  3 ++
 src/Simd/SimdLib.cpp            |  5 +++
 src/Simd/SimdNeon.h             |  4 +++
 src/Simd/SimdNeonCrc32.cpp      | 62 +++++++++++++++++++++++++++++++++
 src/Test/TestCrc32.cpp          |  5 +++
 9 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 src/Simd/SimdNeonCrc32.cpp

diff --git a/docs/2026.html b/docs/2026.html
index cb35619654..b46393b7cb 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -50,6 +50,7 @@ <h5>New features</h5>
  <li>Method View::Copy.</li>
  <li>Method Frame::Copy.</li>
  <li>Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.</li>
+ <li>NEON optimizations of function Crc32c.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
diff --git a/docs/index.html b/docs/index.html
index 0104f85d91..483b97dd6d 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -83,6 +83,7 @@ <h3>Contributors</h3>
   <p> 2026-2026 <a href="http://github.com/yuchangminghit">Yu Changming</a>,</p>
   <p> 2026-2026 <a href="http://github.com/Centimo">Evgeniy Efimov</a>.</p>
   <p> 2026-2026 <a href="http://github.com/androm3da">Brian Cain</a>.</p>
+  <p> 2026-2026 <a href="http://github.com/metsw24-max">metsw24-max</a>.</p>
  </td>
 </tr> </table>
 
diff --git a/prj/cmake/arm.cmake b/prj/cmake/arm.cmake
index a95bfa7531..5b93fd2b88 100644
--- a/prj/cmake/arm.cmake
+++ b/prj/cmake/arm.cmake
@@ -2,7 +2,7 @@ if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") AND (NOT(CMAKE_CXX_COMPILER_VERSION VER
 	set(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} -Wno-psabi")
 endif()
 
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     if( NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")))
 	  set(CXX_NEON_FLAG "-mfpu=neon -mfpu=neon-fp16")
 	endif()
@@ -13,6 +13,10 @@ else()
 	set(CXX_NEON_FLAG "")
 endif()
 
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+	set(CXX_NEON_FLAG "${CXX_NEON_FLAG} -march=armv8-a+crc")
+endif()
+
 if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER MATCHES "clang")  OR (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"))
 	add_definitions(-DSIMD_NEON_FP16_DISABLE)
 endif()
diff --git a/prj/vs2022/Neon.vcxproj b/prj/vs2022/Neon.vcxproj
index eec5e3984a..d355f0900c 100644
--- a/prj/vs2022/Neon.vcxproj
+++ b/prj/vs2022/Neon.vcxproj
@@ -46,6 +46,7 @@
     <ClCompile Include="..\..\src\Simd\SimdNeonBinarization.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdNeonConditional.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdNeonCpu.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdNeonCrc32.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdNeonDeinterleave.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdNeonDescrInt.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdNeonDescrIntCdd.cpp" />
diff --git a/prj/vs2022/Neon.vcxproj.filters b/prj/vs2022/Neon.vcxproj.filters
index 5f5e92078f..597fbb5761 100644
--- a/prj/vs2022/Neon.vcxproj.filters
+++ b/prj/vs2022/Neon.vcxproj.filters
@@ -361,6 +361,9 @@
     <ClCompile Include="..\..\src\Simd\SimdNeonAbsGradientSaturatedSum.cpp">
       <Filter>Neon\Filter</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdNeonCrc32.cpp">
+      <Filter>Neon\System</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <Filter Include="Neon">
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 38e12f405d..282024f180 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -273,6 +273,11 @@ SIMD_API uint32_t SimdCrc32c(const void * src, size_t size)
     if(Sse41::Enable)
         return Sse41::Crc32c(src, size);
     else
+#endif
+#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
+    if (Neon::Enable)
+        return Neon::Crc32c(src, size);
+    else
 #endif
         return Base::Crc32c(src, size);
 }
diff --git a/src/Simd/SimdNeon.h b/src/Simd/SimdNeon.h
index 2359e7c66e..30bda51a1b 100644
--- a/src/Simd/SimdNeon.h
+++ b/src/Simd/SimdNeon.h
@@ -33,6 +33,10 @@ namespace Simd
 #ifdef SIMD_NEON_ENABLE
     namespace Neon
     {
+#if defined(SIMD_ARM64_ENABLE)
+        uint32_t Crc32c(const void* src, size_t size);
+#endif
+
         void AbsDifference(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, uint8_t* c, size_t cStride,
             size_t width, size_t height);
 
diff --git a/src/Simd/SimdNeonCrc32.cpp b/src/Simd/SimdNeonCrc32.cpp
new file mode 100644
index 0000000000..ab5956c547
--- /dev/null
+++ b/src/Simd/SimdNeonCrc32.cpp
@@ -0,0 +1,62 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2017 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
+#include <arm_acle.h>
+#endif
+
+namespace Simd
+{
+#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
+    namespace Neon
+    {
+        SIMD_INLINE void Crc32c(uint32_t& crc, const uint64_t* p, const uint64_t* end)
+        {
+            while (p < end)
+                crc = __crc32cd(crc, *p++);
+        }
+
+        SIMD_INLINE void Crc32c(uint32_t& crc, const uint8_t* p, const uint8_t* end)
+        {
+            while (p < end)
+                crc = __crc32cb(crc, *p++);
+        }
+
+        uint32_t Crc32c(const void* src, size_t size)
+        {
+            uint8_t* nose = (uint8_t*)src;
+            uint64_t* body = (uint64_t*)AlignHi(nose, sizeof(uint64_t));
+            uint64_t* tail = (uint64_t*)AlignLo(nose + size, sizeof(uint64_t));
+
+            uint32_t crc = 0xFFFFFFFF;
+            Crc32c(crc, nose, (uint8_t*)body);
+            Crc32c(crc, body, tail);
+            Crc32c(crc, (uint8_t*)tail, nose + size);
+            return ~crc;
+        }
+    }
+#endif
+}
diff --git a/src/Test/TestCrc32.cpp b/src/Test/TestCrc32.cpp
index 6c029fde74..38260a1f4f 100644
--- a/src/Test/TestCrc32.cpp
+++ b/src/Test/TestCrc32.cpp
@@ -109,6 +109,11 @@ namespace Test
             result = result && Crc32AutoTest(FUNC(Simd::Sse41::Crc32c), FUNC(SimdCrc32c));
 #endif 
 
+#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
+        if (Simd::Neon::Enable && TestNeon(options))
+            result = result && Crc32AutoTest(FUNC(Simd::Neon::Crc32c), FUNC(SimdCrc32c));
+#endif
+
         return result;
     }
 }

From 4027cee11dae4b20dddd5d4a3ce7359cc7264feb Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Mon, 25 May 2026 16:57:05 +0300
Subject: [PATCH 27/32] +add NEON optimizations of function Crc32.

---
 docs/2026.html             |  1 +
 src/Simd/SimdLib.cpp       |  8 ++++++--
 src/Simd/SimdNeon.h        |  2 ++
 src/Simd/SimdNeonCrc32.cpp | 27 +++++++++++++++++++++++++++
 src/Test/TestCrc32.cpp     |  5 +++++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/docs/2026.html b/docs/2026.html
index b46393b7cb..1d572bf4cd 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -51,6 +51,7 @@ <h5>New features</h5>
  <li>Method Frame::Copy.</li>
  <li>Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.</li>
  <li>NEON optimizations of function Crc32c.</li>
+ <li>NEON optimizations of function Crc32.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 282024f180..06792097e7 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -260,10 +260,14 @@ SIMD_API void SimdSetAmxFull()
 #endif
 }
 
-
 SIMD_API uint32_t SimdCrc32(const void* src, size_t size)
 {
-    return Base::Crc32(src, size);
+#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
+    if (Neon::Enable)
+        return Neon::Crc32(src, size);
+    else
+#endif
+        return Base::Crc32(src, size);
 }
 
 SIMD_API uint32_t SimdCrc32c(const void * src, size_t size)
diff --git a/src/Simd/SimdNeon.h b/src/Simd/SimdNeon.h
index 30bda51a1b..16e68b1625 100644
--- a/src/Simd/SimdNeon.h
+++ b/src/Simd/SimdNeon.h
@@ -34,6 +34,8 @@ namespace Simd
     namespace Neon
     {
 #if defined(SIMD_ARM64_ENABLE)
+        uint32_t Crc32(const void* src, size_t size);
+
         uint32_t Crc32c(const void* src, size_t size);
 #endif
 
diff --git a/src/Simd/SimdNeonCrc32.cpp b/src/Simd/SimdNeonCrc32.cpp
index ab5956c547..b7026b476e 100644
--- a/src/Simd/SimdNeonCrc32.cpp
+++ b/src/Simd/SimdNeonCrc32.cpp
@@ -33,6 +33,33 @@ namespace Simd
 #if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
     namespace Neon
     {
+        SIMD_INLINE void Crc32(uint32_t& crc, const uint64_t* p, const uint64_t* end)
+        {
+            while (p < end)
+                crc = __crc32d(crc, *p++);
+        }
+
+        SIMD_INLINE void Crc32(uint32_t& crc, const uint8_t* p, const uint8_t* end)
+        {
+            while (p < end)
+                crc = __crc32b(crc, *p++);
+        }
+
+        uint32_t Crc32(const void* src, size_t size)
+        {
+            uint8_t* nose = (uint8_t*)src;
+            uint64_t* body = (uint64_t*)AlignHi(nose, sizeof(uint64_t));
+            uint64_t* tail = (uint64_t*)AlignLo(nose + size, sizeof(uint64_t));
+
+            uint32_t crc = 0xFFFFFFFF;
+            Crc32(crc, nose, (uint8_t*)body);
+            Crc32(crc, body, tail);
+            Crc32(crc, (uint8_t*)tail, nose + size);
+            return ~crc;
+        }
+
+        //--------------------------------------------------------------------------------------------------
+
         SIMD_INLINE void Crc32c(uint32_t& crc, const uint64_t* p, const uint64_t* end)
         {
             while (p < end)
diff --git a/src/Test/TestCrc32.cpp b/src/Test/TestCrc32.cpp
index 38260a1f4f..9d9ccb99d2 100644
--- a/src/Test/TestCrc32.cpp
+++ b/src/Test/TestCrc32.cpp
@@ -94,6 +94,11 @@ namespace Test
         if (TestBase(options))
             result = result && Crc32AutoTest(FUNC(Simd::Base::Crc32), FUNC(SimdCrc32));
 
+#if defined(SIMD_NEON_ENABLE) && defined(SIMD_ARM64_ENABLE)
+        if (Simd::Neon::Enable && TestNeon(options))
+            result = result && Crc32AutoTest(FUNC(Simd::Neon::Crc32), FUNC(SimdCrc32));
+#endif
+
         return result;
     }
 

From 0d40c7b1e8451012002e209aa5b0557a6adff573 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 26 May 2026 10:21:15 +0300
Subject: [PATCH 28/32] +add SVE optimizations of function GetStatistics.

---
 docs/2026.html                  | 22 ++++++++++
 prj/vs2022/Sve1.vcxproj         |  1 +
 prj/vs2022/Sve1.vcxproj.filters |  3 ++
 src/Simd/SimdLib.cpp            |  6 +++
 src/Simd/SimdSve1.h             |  2 +
 src/Simd/SimdSve1Statistic.cpp  | 74 +++++++++++++++++++++++++++++++++
 src/Test/TestStatistic.cpp      |  5 +++
 7 files changed, 113 insertions(+)
 create mode 100644 src/Simd/SimdSve1Statistic.cpp

diff --git a/docs/2026.html b/docs/2026.html
index 1d572bf4cd..633d86ff9c 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -52,10 +52,16 @@ <h5>New features</h5>
  <li>Base implementation, AMX-BF16 optimizations of class SynetConvolution16bNhwcSpecV3.</li>
  <li>NEON optimizations of function Crc32c.</li>
  <li>NEON optimizations of function Crc32.</li>
+ <li>Support of 8-bit BMP in function ImageLoadBmp.</li>
+ <li>SVE optimizations of function GetStatistics.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>
  <li>Error in function SimdAlignment for SVE (ARM).</li>
+ <li>Integer overflow in Base implementation of function JpegProcessFrameHeader.</li>
+ <li>Checking of correctness in Base::JpegHuffman::Build.</li>
+ <li>Checking of correctness in Base::JpegToRgba.</li>
+ <li>Error in ImagePngLoader::ReadTransparency.</li>
 </ul>
 
 <h4>Documentation</h4>
@@ -75,6 +81,22 @@ <h5>Improving</h5>
  <li>Description of function SimdGetFastMode.</li>
  <li>Description of function SimdSetFastMode.</li>
  <li>Description of function SimdSetAmxFull.</li>
+ <li>Description of function SimdCrc32.</li>
+ <li>Description of function SimdCrc32c.</li>
+ <li>Description of function SimdAbsDifference.</li>
+ <li>Description of function SimdAbsDifferenceSum.</li>
+ <li>Description of function SimdAbsDifferenceSumMasked.</li>
+ <li>Description of function SimdAbsDifferenceSums3x3.</li>
+ <li>Description of function SimdAbsDifferenceSums3x3Masked.</li>
+ <li>Description of function SimdAbsGradientSaturatedSum.</li>
+ <li>Description of function SimdAddFeatureDifference.</li>
+ <li>Description of function SimdAlphaBlending.</li>
+ <li>Description of function SimdAlphaBlending2x.</li>
+ <li>Description of function SimdAlphaBlendingBgraToYuv420p.</li>
+ <li>Description of function SimdAlphaBlendingUniform.</li>
+ <li>Description of function SimdAlphaFilling.</li>
+ <li>Description of function SimdAlphaPremultiply.</li>
+ <li>Description of function SimdAlphaUnpremultiply.</li>
 </ul>
 
 <a href="#HOME">Home</a>
diff --git a/prj/vs2022/Sve1.vcxproj b/prj/vs2022/Sve1.vcxproj
index a960799e61..8dd4b60bf8 100644
--- a/prj/vs2022/Sve1.vcxproj
+++ b/prj/vs2022/Sve1.vcxproj
@@ -30,6 +30,7 @@
     <ClCompile Include="..\..\src\Simd\SimdSve1Deinterleave.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1Interleave.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdSve1Operation.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdSve1Statistic.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\Simd\SimdSve1.h" />
diff --git a/prj/vs2022/Sve1.vcxproj.filters b/prj/vs2022/Sve1.vcxproj.filters
index 87fc1e9732..6ccbb1ff48 100644
--- a/prj/vs2022/Sve1.vcxproj.filters
+++ b/prj/vs2022/Sve1.vcxproj.filters
@@ -59,5 +59,8 @@
     <ClCompile Include="..\..\src\Simd\SimdSve1Background.cpp">
       <Filter>Sve1\Motion</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdSve1Statistic.cpp">
+      <Filter>Sve1\Statistics</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 06792097e7..115af6c345 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -252,6 +252,7 @@ SIMD_API void SimdEmpty()
         Sse41::Empty();
 #endif
 }
+
 SIMD_API void SimdSetAmxFull()
 {
 #ifdef SIMD_AMXBF16_ENABLE
@@ -4954,6 +4955,11 @@ SIMD_API void SimdGetStatistic(const uint8_t * src, size_t stride, size_t width,
         Sse41::GetStatistic(src, stride, width, height, min, max, average);
     else
 #endif
+#ifdef SIMD_SVE_ENABLE
+    if (Sve::Enable)
+        Sve::GetStatistic(src, stride, width, height, min, max, average);
+    else
+#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
         Neon::GetStatistic(src, stride, width, height, min, max, average);
diff --git a/src/Simd/SimdSve1.h b/src/Simd/SimdSve1.h
index 853f9b4897..1e7d5274fd 100644
--- a/src/Simd/SimdSve1.h
+++ b/src/Simd/SimdSve1.h
@@ -75,6 +75,8 @@ namespace Simd
         void OperationBinary8u(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, size_t channelCount, uint8_t* dst, size_t dstStride, SimdOperationBinary8uType type);
 
         void OperationBinary16i(const uint8_t* a, size_t aStride, const uint8_t* b, size_t bStride, size_t width, size_t height, uint8_t* dst, size_t dstStride, SimdOperationBinary16iType type);
+
+        void GetStatistic(const uint8_t* src, size_t stride, size_t width, size_t height, uint8_t* min, uint8_t* max, uint8_t* average);
     }
 #endif
 }
diff --git a/src/Simd/SimdSve1Statistic.cpp b/src/Simd/SimdSve1Statistic.cpp
new file mode 100644
index 0000000000..f9d1e58c02
--- /dev/null
+++ b/src/Simd/SimdSve1Statistic.cpp
@@ -0,0 +1,74 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2023 Yermalayeu Ihar,
+*               2018-2018 Radchenko Andrey.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+#include "Simd/SimdExtract.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+#ifdef SIMD_SVE_ENABLE    
+    namespace Sve
+    {
+        SIMD_INLINE void UpdateStatistic(const uint8_t* src, svbool_t mask, svuint8_t _1, svuint8_t& min, svuint8_t& max, svuint32_t& sum)
+        {
+            svuint8_t val = svld1_u8(mask, src);
+            min = svmin_u8_m(mask, min, val);
+            max = svmax_u8_m(mask, max, val);
+            sum = svdot_u32(sum, val, _1);
+        }
+
+        void GetStatistic(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t * min, uint8_t * max, uint8_t * average)
+        {
+            assert(width*height);
+
+            size_t A = svlen(svuint8_t());
+            size_t widthA = AlignLo(width, A);
+            const svbool_t body = svptrue_b32();
+            const svbool_t tail = svwhilelt_b8(widthA, width);
+
+            svuint8_t _1 = svdup_n_u8(1);
+            svuint8_t _min = svdup_n_u8(255);
+            svuint8_t _max = svdup_n_u8(0);
+            uint64_t sum = 0;
+            for (size_t row = 0; row < height; ++row)
+            {
+                size_t col = 0;
+                svuint32_t _sum = svdup_n_u32(0);
+                for (; col < widthA; col += A)
+                    UpdateStatistic(src + col, body, _1, _min, _max, _sum);
+                if (widthA < width)
+                    UpdateStatistic(src + col, tail, _1, _min, _max, _sum);
+                sum += svaddv_u32(svptrue_b32(), _sum);
+                src += stride;
+            }
+
+            *min = svminv_u8(svptrue_b32(), _min);
+            *max = svmaxv_u8(svptrue_b32(), _max);
+            *average = (uint8_t)((sum + width*height / 2) / (width*height));
+        }
+    }
+#endif
+}
diff --git a/src/Test/TestStatistic.cpp b/src/Test/TestStatistic.cpp
index db00029a42..6426752a11 100644
--- a/src/Test/TestStatistic.cpp
+++ b/src/Test/TestStatistic.cpp
@@ -113,6 +113,11 @@ namespace Test
             result = result && GetStatisticAutoTest(FUNC1(Simd::Neon::GetStatistic), FUNC1(SimdGetStatistic));
 #endif
 
+#ifdef SIMD_SVE_ENABLE
+        if (Simd::Sve::Enable && TestSve(options))
+            result = result && GetStatisticAutoTest(FUNC1(Simd::Sve::GetStatistic), FUNC1(SimdGetStatistic));
+#endif
+
 #ifdef SIMD_HVX_ENABLE
         if (Simd::Hvx::Enable && TestHvx(options) && W >= Simd::Hvx::A)
         {

From 7aca8cee313160b681d9c07c464e34e9b3fdba8c Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 26 May 2026 10:32:27 +0300
Subject: [PATCH 29/32] *fix error int GetStatisticAutoTest.

---
 docs/2026.html             | 6 ++++++
 src/Test/TestCompare.h     | 9 ++++++++-
 src/Test/TestStatistic.cpp | 6 +++---
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/docs/2026.html b/docs/2026.html
index 633d86ff9c..38614265fe 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -64,6 +64,12 @@ <h5>Bug fixing</h5>
  <li>Error in ImagePngLoader::ReadTransparency.</li>
 </ul>
 
+<h4>Test framework</h4>
+<h5>Bug fixing</h5>
+<ul>
+ <li>Error in test GetStatisticAutoTest.</li>
+</ul>
+
 <h4>Documentation</h4>
 <h5>Improving</h5>
 <ul>
diff --git a/src/Test/TestCompare.h b/src/Test/TestCompare.h
index 36518e697e..394ab08d35 100644
--- a/src/Test/TestCompare.h
+++ b/src/Test/TestCompare.h
@@ -72,4 +72,11 @@ namespace Test
         return false; \
     } 
 
-#endif//__TestCompare_h__
+#define TEST_CHECK_VALUE_AS_INT(name) \
+    if(name##1 != name##2) \
+    { \
+        TEST_LOG_SS(Error, "Error " << #name << ": (" << (int)name##1  << " != " << (int)name##2 << ")! "); \
+        return false; \
+    }
+
+#endif
diff --git a/src/Test/TestStatistic.cpp b/src/Test/TestStatistic.cpp
index 6426752a11..cd07117171 100644
--- a/src/Test/TestStatistic.cpp
+++ b/src/Test/TestStatistic.cpp
@@ -68,9 +68,9 @@ namespace Test
 
         TEST_EXECUTE_AT_LEAST_MIN_TIME(f2.Call(src, &min2, &max2, &average2));
 
-        TEST_CHECK_VALUE(min);
-        TEST_CHECK_VALUE(max);
-        TEST_CHECK_VALUE(average);
+        TEST_CHECK_VALUE_AS_INT(min);
+        TEST_CHECK_VALUE_AS_INT(max);
+        TEST_CHECK_VALUE_AS_INT(average);
 
         return result;
     }

From f24ffaed25ddc699283e07e83dc69d4dbbe3b72d Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 26 May 2026 10:42:20 +0300
Subject: [PATCH 30/32] *fix bug: wrong body mask.

---
 src/Simd/SimdSve1Statistic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Simd/SimdSve1Statistic.cpp b/src/Simd/SimdSve1Statistic.cpp
index f9d1e58c02..cf076d0d95 100644
--- a/src/Simd/SimdSve1Statistic.cpp
+++ b/src/Simd/SimdSve1Statistic.cpp
@@ -46,7 +46,7 @@ namespace Simd
 
             size_t A = svlen(svuint8_t());
             size_t widthA = AlignLo(width, A);
-            const svbool_t body = svptrue_b32();
+            const svbool_t body = svptrue_b8();
             const svbool_t tail = svwhilelt_b8(widthA, width);
 
             svuint8_t _1 = svdup_n_u8(1);

From 0ed97c00fcdc7ff3596cb1fb7ca7ab3f69acbe39 Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar <ermig@tut.by>
Date: Tue, 26 May 2026 11:12:21 +0300
Subject: [PATCH 31/32] *update help.

---
 docs/help/group__correlation.html            | 47 ++++++------
 docs/help/group__difference__estimation.html | 10 +--
 docs/help/group__drawing.html                | 75 ++++++++++----------
 docs/help/group__hash.html                   | 16 ++---
 docs/help/group__other__filter.html          |  8 +--
 5 files changed, 81 insertions(+), 75 deletions(-)

diff --git a/docs/help/group__correlation.html b/docs/help/group__correlation.html
index e711716c11..a4a9b982fc 100644
--- a/docs/help/group__correlation.html
+++ b/docs/help/group__correlation.html
@@ -51,19 +51,19 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:gadea7d6e612c1947395623569b4e5145d"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__correlation.html#gadea7d6e612c1947395623569b4e5145d">SimdAbsDifference</a> (const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, uint8_t *c, size_t cStride, size_t width, size_t height)</td></tr>
-<tr class="memdesc:gadea7d6e612c1947395623569b4e5145d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets absolute difference of two gray 8-bit images, pixel by pixel.  <a href="group__correlation.html#gadea7d6e612c1947395623569b4e5145d">More...</a><br /></td></tr>
+<tr class="memdesc:gadea7d6e612c1947395623569b4e5145d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates per-pixel absolute difference of two gray 8-bit images.  <a href="group__correlation.html#gadea7d6e612c1947395623569b4e5145d">More...</a><br /></td></tr>
 <tr class="separator:gadea7d6e612c1947395623569b4e5145d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gad6b60bbf1e7ac97fe80019e8edeb468c"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__correlation.html#gad6b60bbf1e7ac97fe80019e8edeb468c">SimdAbsDifferenceSum</a> (const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t *sum)</td></tr>
-<tr class="memdesc:gad6b60bbf1e7ac97fe80019e8edeb468c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets sum of absolute difference of two gray 8-bit images.  <a href="group__correlation.html#gad6b60bbf1e7ac97fe80019e8edeb468c">More...</a><br /></td></tr>
+<tr class="memdesc:gad6b60bbf1e7ac97fe80019e8edeb468c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates sum of absolute differences (SAD) of two gray 8-bit images.  <a href="group__correlation.html#gad6b60bbf1e7ac97fe80019e8edeb468c">More...</a><br /></td></tr>
 <tr class="separator:gad6b60bbf1e7ac97fe80019e8edeb468c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga82d2e3f956cacecd84fa26b54203078b"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__correlation.html#ga82d2e3f956cacecd84fa26b54203078b">SimdAbsDifferenceSumMasked</a> (const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t *sum)</td></tr>
-<tr class="memdesc:ga82d2e3f956cacecd84fa26b54203078b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets sum of absolute difference of two gray 8-bit images based on gray 8-bit mask.  <a href="group__correlation.html#ga82d2e3f956cacecd84fa26b54203078b">More...</a><br /></td></tr>
+<tr class="memdesc:ga82d2e3f956cacecd84fa26b54203078b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates masked sum of absolute differences (SAD) of two gray 8-bit images.  <a href="group__correlation.html#ga82d2e3f956cacecd84fa26b54203078b">More...</a><br /></td></tr>
 <tr class="separator:ga82d2e3f956cacecd84fa26b54203078b"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gac3d0d14be2615a2f5f8204454cc42296"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__correlation.html#gac3d0d14be2615a2f5f8204454cc42296">SimdAbsDifferenceSums3x3</a> (const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, size_t width, size_t height, uint64_t *sums)</td></tr>
-<tr class="memdesc:gac3d0d14be2615a2f5f8204454cc42296"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3.  <a href="group__correlation.html#gac3d0d14be2615a2f5f8204454cc42296">More...</a><br /></td></tr>
+<tr class="memdesc:gac3d0d14be2615a2f5f8204454cc42296"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates 9 sums of absolute differences for all shifts in 3x3 neighborhood.  <a href="group__correlation.html#gac3d0d14be2615a2f5f8204454cc42296">More...</a><br /></td></tr>
 <tr class="separator:gac3d0d14be2615a2f5f8204454cc42296"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga6063997a055e15012b2912af8dc57e20"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__correlation.html#ga6063997a055e15012b2912af8dc57e20">SimdAbsDifferenceSums3x3Masked</a> (const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t *sums)</td></tr>
-<tr class="memdesc:ga6063997a055e15012b2912af8dc57e20"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3 based on gray 8-bit mask.  <a href="group__correlation.html#ga6063997a055e15012b2912af8dc57e20">More...</a><br /></td></tr>
+<tr class="memdesc:ga6063997a055e15012b2912af8dc57e20"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates 9 masked sums of absolute differences for all shifts in 3x3 neighborhood.  <a href="group__correlation.html#ga6063997a055e15012b2912af8dc57e20">More...</a><br /></td></tr>
 <tr class="separator:ga6063997a055e15012b2912af8dc57e20"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga39c2cab462a380fb5044106be786a1bf"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__correlation.html#ga39c2cab462a380fb5044106be786a1bf">SimdCosineDistance32f</a> (const float *a, const float *b, size_t size, float *distance)</td></tr>
 <tr class="memdesc:ga39c2cab462a380fb5044106be786a1bf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates cosine distance of two 32-bit float arrays.  <a href="group__correlation.html#ga39c2cab462a380fb5044106be786a1bf">More...</a><br /></td></tr>
@@ -174,8 +174,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadea7d6e612c194739562356
       </table>
 </div><div class="memdoc">
 
-<p>Gets absolute difference of two gray 8-bit images, pixel by pixel. </p>
-<p >The three images must have the same width and height.</p>
+<p>Calculates per-pixel absolute difference of two gray 8-bit images. </p>
+<p >The destination pixel values are computed as: </p><pre class="fragment">c[x, y] = abs(a[x, y] - b[x, y]).
+</pre><p> All three images must have the same width and height.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AbsDifference(const View&lt;A&gt; &amp; a, const View&lt;A&gt; &amp; b, View&lt;A&gt; &amp; c).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -249,8 +250,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad6b60bbf1e7ac97fe80019e
       </table>
 </div><div class="memdoc">
 
-<p>Gets sum of absolute difference of two gray 8-bit images. </p>
-<p >Both images must have the same width and height.</p>
+<p>Calculates sum of absolute differences (SAD) of two gray 8-bit images. </p>
+<p >The result value is computed as: </p><pre class="fragment">sum = Σ abs(a[x, y] - b[x, y]).
+</pre><p> Both images must have the same width and height.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AbsDifferenceSum(const View&lt;A&gt; &amp; a, const View&lt;A&gt; &amp; b, uint64_t &amp; sum).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -341,8 +343,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga82d2e3f956cacecd84fa26b
       </table>
 </div><div class="memdoc">
 
-<p>Gets sum of absolute difference of two gray 8-bit images based on gray 8-bit mask. </p>
-<p >Gets the absolute difference sum for all points when mask[i] == index. Both images and mask must have the same width and height.</p>
+<p>Calculates masked sum of absolute differences (SAD) of two gray 8-bit images. </p>
+<p >The result value is computed for points where mask[x, y] equals index: </p><pre class="fragment">sum = Σ abs(a[x, y] - b[x, y]), for all (x, y) where mask[x, y] == index.
+</pre><p> Both images and mask must have the same width and height.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AbsDifferenceSum(const View&lt;A&gt;&amp; a, const View&lt;A&gt;&amp; b, const View&lt;A&gt;&amp; mask, uint8_t index, uint64_t &amp; sum).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -418,8 +421,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac3d0d14be2615a2f5f82044
       </table>
 </div><div class="memdoc">
 
-<p>Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3. </p>
-<p >Both images must have the same width and height. The image height and width must be equal or greater 3. The sums are calculated with central part (indent width = 1) of current image and with part of background image with corresponding shift. The shifts are lain in the range [-1, 1] for axis x and y.</p>
+<p>Calculates 9 sums of absolute differences for all shifts in 3x3 neighborhood. </p>
+<p >Both images must have the same width and height. The image height and width must be equal or greater 3. The sums are computed for the central part of current image (without one-pixel border) and for background image shifted by dx and dy in range [-1, 1]: </p><pre class="fragment">sums[(dy + 1)*3 + (dx + 1)] = Σ abs(current[x, y] - background[x + dx, y + dy]),
+                               x = 1..width-2, y = 1..height-2.
+</pre><p> Output order is: (-1,-1), (0,-1), (1,-1), (-1,0), (0,0), (1,0), (-1,1), (0,1), (1,1).</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AbsDifferenceSums3x3(const View&lt;A&gt;&amp; current, const View&lt;A&gt;&amp; background, uint64_t * sums).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -510,8 +515,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6063997a055e15012b2912a
       </table>
 </div><div class="memdoc">
 
-<p>Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3 based on gray 8-bit mask. </p>
-<p >Gets the absolute difference sums for all points when mask[i] == index. Both images and mask must have the same width and height. The image height and width must be equal or greater 3. The sums are calculated with central part (indent width = 1) of current image and with part of background image with corresponding shift. The shifts are lain in the range [-1, 1] for axis x and y.</p>
+<p>Calculates 9 masked sums of absolute differences for all shifts in 3x3 neighborhood. </p>
+<p >The sums are computed only for points where mask[x, y] equals index: </p><pre class="fragment">sums[(dy + 1)*3 + (dx + 1)] = Σ abs(current[x, y] - background[x + dx, y + dy]),
+                               for x = 1..width-2, y = 1..height-2 and mask[x, y] == index.
+</pre><p> Both images and mask must have the same width and height. The image height and width must be equal or greater 3. Output order is: (-1,-1), (0,-1), (1,-1), (-1,0), (0,0), (1,0), (-1,1), (0,1), (1,1).</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AbsDifferenceSums3x3(const View&lt;A&gt;&amp; current, const View&lt;A&gt;&amp; background, const View&lt;A&gt;&amp; mask, uint8_t index, uint64_t * sums).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -903,7 +910,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga56e481cefc391af9f067198
 
 <p>Gets absolute difference of two gray 8-bit images, pixel by pixel. </p>
 <p >Both images must have the same width and height.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#gadea7d6e612c1947395623569b4e5145d" title="Gets absolute difference of two gray 8-bit images, pixel by pixel.">SimdAbsDifference</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#gadea7d6e612c1947395623569b4e5145d" title="Calculates per-pixel absolute difference of two gray 8-bit images.">SimdAbsDifference</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a first image. </td></tr>
@@ -949,7 +956,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga720da808dbb58782a893f7b
 
 <p>Gets sum of absolute difference of two gray 8-bit images. </p>
 <p >Both images must have the same width and height.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#gad6b60bbf1e7ac97fe80019e8edeb468c" title="Gets sum of absolute difference of two gray 8-bit images.">SimdAbsDifferenceSum</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#gad6b60bbf1e7ac97fe80019e8edeb468c" title="Calculates sum of absolute differences (SAD) of two gray 8-bit images.">SimdAbsDifferenceSum</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a first image. </td></tr>
@@ -1007,7 +1014,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadff8180338c5947e067e42d
 
 <p>Gets sum of absolute difference of two gray 8-bit images based on gray 8-bit mask. </p>
 <p >Gets the absolute difference sum for all points when mask[i] == index. Both images and mask must have the same width and height.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#ga82d2e3f956cacecd84fa26b54203078b" title="Gets sum of absolute difference of two gray 8-bit images based on gray 8-bit mask.">SimdAbsDifferenceSumMasked</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#ga82d2e3f956cacecd84fa26b54203078b" title="Calculates masked sum of absolute differences (SAD) of two gray 8-bit images.">SimdAbsDifferenceSumMasked</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">a</td><td>- a first image. </td></tr>
@@ -1055,7 +1062,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga847802e057685326367a37b
 
 <p>Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3. </p>
 <p >Both images must have the same width and height. The image height and width must be equal or greater 3. The sums are calculated with central part (indent width = 1) of current image and with part of background image with corresponding shift. The shifts are lain in the range [-1, 1] for axis x and y.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#gac3d0d14be2615a2f5f8204454cc42296" title="Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborh...">SimdAbsDifferenceSums3x3</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#gac3d0d14be2615a2f5f8204454cc42296" title="Calculates 9 sums of absolute differences for all shifts in 3x3 neighborhood.">SimdAbsDifferenceSums3x3</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">current</td><td>- a current image. </td></tr>
@@ -1113,7 +1120,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga6587f50ce3d76a5bfcf33c1
 
 <p>Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborhood 3x3 based on gray 8-bit mask. </p>
 <p >Gets the absolute difference sums for all points when mask[i] == index. Both images and mask must have the same width and height. The image height and width must be equal or greater 3. The sums are calculated with central part (indent width = 1) of current image and with part of background image with corresponding shift. The shifts are lain in the range [-1, 1] for axis x and y.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#ga6063997a055e15012b2912af8dc57e20" title="Gets 9 sums of absolute difference of two gray 8-bit images with various relative shifts in neighborh...">SimdAbsDifferenceSums3x3Masked</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__correlation.html#ga6063997a055e15012b2912af8dc57e20" title="Calculates 9 masked sums of absolute differences for all shifts in 3x3 neighborhood.">SimdAbsDifferenceSums3x3Masked</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">current</td><td>- a current image. </td></tr>
diff --git a/docs/help/group__difference__estimation.html b/docs/help/group__difference__estimation.html
index be77948901..5dbf1cdfee 100644
--- a/docs/help/group__difference__estimation.html
+++ b/docs/help/group__difference__estimation.html
@@ -51,7 +51,7 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:ga1cbf6f1a311067e08b83ab4c29c35395"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__difference__estimation.html#ga1cbf6f1a311067e08b83ab4c29c35395">SimdAddFeatureDifference</a> (const uint8_t *value, size_t valueStride, size_t width, size_t height, const uint8_t *lo, size_t loStride, const uint8_t *hi, size_t hiStride, uint16_t weight, uint8_t *difference, size_t differenceStride)</td></tr>
-<tr class="memdesc:ga1cbf6f1a311067e08b83ab4c29c35395"><td class="mdescLeft">&#160;</td><td class="mdescRight">Adds feature difference to common difference sum.  <a href="group__difference__estimation.html#ga1cbf6f1a311067e08b83ab4c29c35395">More...</a><br /></td></tr>
+<tr class="memdesc:ga1cbf6f1a311067e08b83ab4c29c35395"><td class="mdescLeft">&#160;</td><td class="mdescRight">Accumulates weighted feature difference into 8-bit difference map.  <a href="group__difference__estimation.html#ga1cbf6f1a311067e08b83ab4c29c35395">More...</a><br /></td></tr>
 <tr class="separator:ga1cbf6f1a311067e08b83ab4c29c35395"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gadf18540d615e3a16a6c8b2d12632af2a"><td class="memTemplParams" colspan="2">template&lt;template&lt; class &gt; class A&gt; </td></tr>
 <tr class="memitem:gadf18540d615e3a16a6c8b2d12632af2a"><td class="memTemplItemLeft" align="right" valign="top">SIMD_INLINE void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="group__difference__estimation.html#gadf18540d615e3a16a6c8b2d12632af2a">AddFeatureDifference</a> (const <a class="el" href="struct_simd_1_1_view.html">View</a>&lt; A &gt; &amp;value, const <a class="el" href="struct_simd_1_1_view.html">View</a>&lt; A &gt; &amp;lo, const <a class="el" href="struct_simd_1_1_view.html">View</a>&lt; A &gt; &amp;hi, uint16_t weight, <a class="el" href="struct_simd_1_1_view.html">View</a>&lt; A &gt; &amp;difference)</td></tr>
@@ -141,10 +141,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga1cbf6f1a311067e08b83ab4
       </table>
 </div><div class="memdoc">
 
-<p>Adds feature difference to common difference sum. </p>
+<p>Accumulates weighted feature difference into 8-bit difference map. </p>
 <p >All images must have the same width, height and format (8-bit gray).</p>
-<p >For every point: </p><pre class="fragment">excess = max(lo[i] - value[i], 0) + max(value[i] - hi[i], 0);
-difference[i] += (weight * excess*excess) &gt;&gt; 16;
+<p >For every point: </p><pre class="fragment">excess = max(max(value[i] - hi[i], lo[i] - value[i]), 0);
+difference[i] = min(difference[i] + ((weight * excess * excess) &gt;&gt; 16), 255);
 </pre><p >This function is used for difference estimation in algorithm of motion detection.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AddFeatureDifference(const View&lt;A&gt;&amp; value, const View&lt;A&gt;&amp; lo, const View&lt;A&gt;&amp; hi, uint16_t weight, View&lt;A&gt;&amp; difference).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
@@ -215,7 +215,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadf18540d615e3a16a6c8b2d
 <p >For every point: </p><pre class="fragment">excess = max(lo[i] - value[i], 0) + max(value[i] - hi[i], 0);
 difference[i] += (weight * excess*excess) &gt;&gt; 16;
 </pre><p >This function is used for difference estimation in algorithm of motion detection.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__difference__estimation.html#ga1cbf6f1a311067e08b83ab4c29c35395" title="Adds feature difference to common difference sum.">SimdAddFeatureDifference</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__difference__estimation.html#ga1cbf6f1a311067e08b83ab4c29c35395" title="Accumulates weighted feature difference into 8-bit difference map.">SimdAddFeatureDifference</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">value</td><td>- a current feature value. </td></tr>
diff --git a/docs/help/group__drawing.html b/docs/help/group__drawing.html
index d881a6ea7d..9e37118b46 100644
--- a/docs/help/group__drawing.html
+++ b/docs/help/group__drawing.html
@@ -51,25 +51,25 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:ga24392fe6c365dbb60678c9540c860e83"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#ga24392fe6c365dbb60678c9540c860e83">SimdAlphaBlending</a> (const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, const uint8_t *alpha, size_t alphaStride, uint8_t *dst, size_t dstStride)</td></tr>
-<tr class="memdesc:ga24392fe6c365dbb60678c9540c860e83"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs alpha blending operation.  <a href="group__drawing.html#ga24392fe6c365dbb60678c9540c860e83">More...</a><br /></td></tr>
+<tr class="memdesc:ga24392fe6c365dbb60678c9540c860e83"><td class="mdescLeft">&#160;</td><td class="mdescRight">Blends source image over destination image using per-pixel 8-bit alpha mask.  <a href="group__drawing.html#ga24392fe6c365dbb60678c9540c860e83">More...</a><br /></td></tr>
 <tr class="separator:ga24392fe6c365dbb60678c9540c860e83"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaab78a5cb7b7cf946aa38664b6d1efbe9"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#gaab78a5cb7b7cf946aa38664b6d1efbe9">SimdAlphaBlending2x</a> (const uint8_t *src0, size_t src0Stride, const uint8_t *alpha0, size_t alpha0Stride, const uint8_t *src1, size_t src1Stride, const uint8_t *alpha1, size_t alpha1Stride, size_t width, size_t height, size_t channelCount, uint8_t *dst, size_t dstStride)</td></tr>
-<tr class="memdesc:gaab78a5cb7b7cf946aa38664b6d1efbe9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs double alpha blending operation.  <a href="group__drawing.html#gaab78a5cb7b7cf946aa38664b6d1efbe9">More...</a><br /></td></tr>
+<tr class="memdesc:gaab78a5cb7b7cf946aa38664b6d1efbe9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs two sequential alpha blendings of source images over destination image.  <a href="group__drawing.html#gaab78a5cb7b7cf946aa38664b6d1efbe9">More...</a><br /></td></tr>
 <tr class="separator:gaab78a5cb7b7cf946aa38664b6d1efbe9"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gac766d3d07d13efab36a347b595424db1"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#gac766d3d07d13efab36a347b595424db1">SimdAlphaBlendingBgraToYuv420p</a> (const uint8_t *bgra, size_t bgraStride, size_t width, size_t height, uint8_t *y, size_t yStride, uint8_t *u, size_t uStride, uint8_t *v, size_t vStride, <a class="el" href="group__yuv__conversion.html#gad247235213be1a24330e4ad8616d1224">SimdYuvType</a> yuvType)</td></tr>
-<tr class="memdesc:gac766d3d07d13efab36a347b595424db1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs alpha blending of BGRA image to YUV420P.  <a href="group__drawing.html#gac766d3d07d13efab36a347b595424db1">More...</a><br /></td></tr>
+<tr class="memdesc:gac766d3d07d13efab36a347b595424db1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Converts BGRA to YUV420P and alpha-blends it with destination Y, U and V planes.  <a href="group__drawing.html#gac766d3d07d13efab36a347b595424db1">More...</a><br /></td></tr>
 <tr class="separator:gac766d3d07d13efab36a347b595424db1"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gac93b9719fbb80c729042c6cf8a737970"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#gac93b9719fbb80c729042c6cf8a737970">SimdAlphaBlendingUniform</a> (const uint8_t *src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t alpha, uint8_t *dst, size_t dstStride)</td></tr>
-<tr class="memdesc:gac93b9719fbb80c729042c6cf8a737970"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs uniform alpha blending operation.  <a href="group__drawing.html#gac93b9719fbb80c729042c6cf8a737970">More...</a><br /></td></tr>
+<tr class="memdesc:gac93b9719fbb80c729042c6cf8a737970"><td class="mdescLeft">&#160;</td><td class="mdescRight">Blends source image over destination image with the same alpha value for all pixels.  <a href="group__drawing.html#gac93b9719fbb80c729042c6cf8a737970">More...</a><br /></td></tr>
 <tr class="separator:gac93b9719fbb80c729042c6cf8a737970"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga8452528e17e594a4b12d82715425ae89"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#ga8452528e17e594a4b12d82715425ae89">SimdAlphaFilling</a> (uint8_t *dst, size_t dstStride, size_t width, size_t height, const uint8_t *channel, size_t channelCount, const uint8_t *alpha, size_t alphaStride)</td></tr>
-<tr class="memdesc:ga8452528e17e594a4b12d82715425ae89"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs alpha filling operation.  <a href="group__drawing.html#ga8452528e17e594a4b12d82715425ae89">More...</a><br /></td></tr>
+<tr class="memdesc:ga8452528e17e594a4b12d82715425ae89"><td class="mdescLeft">&#160;</td><td class="mdescRight">Blends constant pixel value into destination image using per-pixel 8-bit alpha mask.  <a href="group__drawing.html#ga8452528e17e594a4b12d82715425ae89">More...</a><br /></td></tr>
 <tr class="separator:ga8452528e17e594a4b12d82715425ae89"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga90a02f756b5bfca963d531127195bbce"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#ga90a02f756b5bfca963d531127195bbce">SimdAlphaPremultiply</a> (const uint8_t *src, size_t srcStride, size_t width, size_t height, uint8_t *dst, size_t dstStride, <a class="el" href="group__c__types.html#ga128437633efebd89ca4bde565dcf5627">SimdBool</a> argb)</td></tr>
-<tr class="memdesc:ga90a02f756b5bfca963d531127195bbce"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs premultiply operation.  <a href="group__drawing.html#ga90a02f756b5bfca963d531127195bbce">More...</a><br /></td></tr>
+<tr class="memdesc:ga90a02f756b5bfca963d531127195bbce"><td class="mdescLeft">&#160;</td><td class="mdescRight">Converts straight-alpha 4-channel image to premultiplied-alpha representation.  <a href="group__drawing.html#ga90a02f756b5bfca963d531127195bbce">More...</a><br /></td></tr>
 <tr class="separator:ga90a02f756b5bfca963d531127195bbce"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga9f2d3f5d8e87939f998b5b712f244b4d"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#ga9f2d3f5d8e87939f998b5b712f244b4d">SimdAlphaUnpremultiply</a> (const uint8_t *src, size_t srcStride, size_t width, size_t height, uint8_t *dst, size_t dstStride, <a class="el" href="group__c__types.html#ga128437633efebd89ca4bde565dcf5627">SimdBool</a> argb)</td></tr>
-<tr class="memdesc:ga9f2d3f5d8e87939f998b5b712f244b4d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Performs unpremultiply operation.  <a href="group__drawing.html#ga9f2d3f5d8e87939f998b5b712f244b4d">More...</a><br /></td></tr>
+<tr class="memdesc:ga9f2d3f5d8e87939f998b5b712f244b4d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Converts premultiplied-alpha 4-channel image to straight-alpha representation.  <a href="group__drawing.html#ga9f2d3f5d8e87939f998b5b712f244b4d">More...</a><br /></td></tr>
 <tr class="separator:ga9f2d3f5d8e87939f998b5b712f244b4d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga23c105b8f21e71d15a8b98d3b9983622"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__drawing.html#ga23c105b8f21e71d15a8b98d3b9983622">SimdDrawLine</a> (uint8_t *canvas, size_t stride, size_t width, size_t height, size_t channels, ptrdiff_t x1, ptrdiff_t y1, ptrdiff_t x2, ptrdiff_t y2, const uint8_t *color, size_t lineWidth)</td></tr>
 <tr class="memdesc:ga23c105b8f21e71d15a8b98d3b9983622"><td class="mdescLeft">&#160;</td><td class="mdescRight">Draws a line at the image.  <a href="group__drawing.html#ga23c105b8f21e71d15a8b98d3b9983622">More...</a><br /></td></tr>
@@ -192,10 +192,11 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga24392fe6c365dbb60678c95
       </table>
 </div><div class="memdoc">
 
-<p>Performs alpha blending operation. </p>
+<p>Blends source image over destination image using per-pixel 8-bit alpha mask. </p>
 <p >All images must have the same width and height. Source and destination images must have the same format (8 bit per channel, for example GRAY8, UV16, BGR24 or BGRA32). Alpha must be 8-bit gray image.</p>
-<p >For every point: </p><pre class="fragment">dst[x, y, c] = (src[x, y, c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
-</pre><p >This function is used for image drawing.</p>
+<p >For every point and channel: </p><pre class="fragment">dst[x, y, c] = DivideBy255(src[x, y, c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]));
+</pre><p> where DivideBy255(v) = (v + 1 + (v &gt;&gt; 8)) &gt;&gt; 8.</p>
+<p >This function is used for image drawing.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AlphaBlending(const View&lt;A&gt;&amp; src, const View&lt;A&gt;&amp; alpha, View&lt;A&gt;&amp; dst).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -306,10 +307,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaab78a5cb7b7cf946aa38664
       </table>
 </div><div class="memdoc">
 
-<p>Performs double alpha blending operation. </p>
+<p>Performs two sequential alpha blendings of source images over destination image. </p>
 <p >All images must have the same width and height. Source and destination images must have the same format (8 bit per channel, for example GRAY8, UV16, BGR24 or BGRA32). Alphas must be 8-bit gray image.</p>
-<p >For every point: </p><pre class="fragment">tmp = (src0[x, y, c]*alpha0[x, y] + dst[x, y, c]*(255 - alpha0[x, y]))/255;
-dst[x, y, c] = (src1[x, y, c]*alpha1[x, y] + tmp*(255 - alpha1[x, y]))/255;
+<p >For every point and channel: </p><pre class="fragment">tmp = DivideBy255(src0[x, y, c]*alpha0[x, y] + dst[x, y, c]*(255 - alpha0[x, y]));
+dst[x, y, c] = DivideBy255(src1[x, y, c]*alpha1[x, y] + tmp*(255 - alpha1[x, y]));
 </pre><p >This function is used for image drawing.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AlphaBlending(const View&lt;A&gt;&amp; src0, const View&lt;A&gt;&amp; alpha0, const View&lt;A&gt;&amp; src1, const View&lt;A&gt;&amp; alpha1, View&lt;A&gt;&amp; dst).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
@@ -413,8 +414,8 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac766d3d07d13efab36a347b
       </table>
 </div><div class="memdoc">
 
-<p>Performs alpha blending of BGRA image to YUV420P. </p>
-<p >This function is used for image drawing. The input BGRA and output Y images must have the same width and height. The output U and V images must have the same width and height (half size relative to Y component).</p>
+<p>Converts BGRA to YUV420P and alpha-blends it with destination Y, U and V planes. </p>
+<p >For every BGRA pixel, Y is computed from BGR and blended with corresponding destination Y using this pixel alpha. For every 2x2 BGRA block, U and V are computed from averaged B, G, R values and blended with destination U and V using average alpha of this 2x2 block. The input BGRA and output Y images must have the same width and height. The output U and V images must have half width and half height relative to Y component.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper <a class="el" href="group__drawing.html#gadcba495fb909153866dc329a2c63a7ee" title="Performs alpha blending of BGRA image to YUV420P.">Simd::AlphaBlendingBgraToYuv420p</a>(const View&amp; bgra, View&amp; y, View&amp; u, View&amp; v, SimdYuvType yuvType = SimdYuvBt601).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -497,9 +498,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#gac93b9719fbb80c729042c6c
       </table>
 </div><div class="memdoc">
 
-<p>Performs uniform alpha blending operation. </p>
+<p>Blends source image over destination image with the same alpha value for all pixels. </p>
 <p >All images must have the same width and height. Source and destination images must have the same format (8 bit per channel, for example GRAY8, UV16, BGR24 or BGRA32).</p>
-<p >For every point: </p><pre class="fragment">dst[x, y, c] = (src[x, y, c]*alpha + dst[x, y, c]*(255 - alpha))/255;
+<p >For every point and channel: </p><pre class="fragment">dst[x, y, c] = DivideBy255(src[x, y, c]*alpha + dst[x, y, c]*(255 - alpha));
 </pre><p >This function is used for image drawing.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AlphaBlending(const View&lt;A&gt;&amp; src, uint8_t alpha, View&lt;A&gt;&amp; dst).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
@@ -580,9 +581,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga8452528e17e594a4b12d827
       </table>
 </div><div class="memdoc">
 
-<p>Performs alpha filling operation. </p>
+<p>Blends constant pixel value into destination image using per-pixel 8-bit alpha mask. </p>
 <p >All images must have the same width and height. Destination images must have 8 bit per channel (for example GRAY8, BGR24 or BGRA32). Alpha must be 8-bit gray image.</p>
-<p >For every point: </p><pre class="fragment">dst[x, y, c] = (channel[c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
+<p >For every point and channel: </p><pre class="fragment">dst[x, y, c] = DivideBy255(channel[c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]));
 </pre><p >This function is used for image drawing.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AlphaFilling(View&lt;A&gt; &amp; dst, const Pixel &amp; pixel, const View&lt;A&gt; &amp; alpha).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
@@ -657,13 +658,12 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga90a02f756b5bfca963d5311
       </table>
 </div><div class="memdoc">
 
-<p>Performs premultiply operation. </p>
+<p>Converts straight-alpha 4-channel image to premultiplied-alpha representation. </p>
 <p >All images must have the same width, height and format (BGRA32, RGBA32, ARGB32).</p>
-<p >For every point (sample for BGRA32): </p><pre class="fragment"> dst[x, y, 0] = src[x, y, 0] * src[x, y, 3] / 255;
- dst[x, y, 1] = src[x, y, 1] * src[x, y, 3] / 255;
- dst[x, y, 2] = src[x, y, 2] * src[x, y, 3] / 255;
- dst[x, y, 3] = src[x, y, 3];
-</pre><p >This function is used for image drawing as a part of alpha blending operation.</p>
+<p >For every point: </p><pre class="fragment"> color = DivideBy255(color * alpha);
+ alpha is copied unchanged.
+</pre><p> If argb == SimdFalse then alpha channel index is 3 (BGRA32/RGBA32 layout). If argb == SimdTrue then alpha channel index is 0 (ARGB32 layout).</p>
+<p >This function is used for image drawing as a part of alpha blending operation.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AlphaPremultiply(const View&lt;A&gt;&amp; src, View&lt;A&gt;&amp; dst).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -736,13 +736,12 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga9f2d3f5d8e87939f998b5b7
       </table>
 </div><div class="memdoc">
 
-<p>Performs unpremultiply operation. </p>
+<p>Converts premultiplied-alpha 4-channel image to straight-alpha representation. </p>
 <p >All images must have the same width, height and format (BGRA32, RGBA32, ARGB32).</p>
-<p >For every point (sample for BGRA32): </p><pre class="fragment"> dst[x, y, 0] = src[x, y, 0] / src[x, y, 3] * 255;
- dst[x, y, 1] = src[x, y, 1] / src[x, y, 3] * 255;
- dst[x, y, 2] = src[x, y, 2] / src[x, y, 3] * 255;
- dst[x, y, 3] = src[x, y, 3];
-</pre><p >This function is used for image drawing as a part of alpha blending operation.</p>
+<p >For every point: </p><pre class="fragment"> color = clamp(int(color * (alpha ? 255.00001f/alpha : 0.0f)), 0, 255);
+ alpha is copied unchanged.
+</pre><p> If argb == SimdFalse then alpha channel index is 3 (BGRA32/RGBA32 layout). If argb == SimdTrue then alpha channel index is 0 (ARGB32 layout).</p>
+<p >This function is used for image drawing as a part of alpha blending operation.</p>
 <dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AlphaUnpremultiply(const View&lt;A&gt;&amp; src, View&lt;A&gt;&amp; dst).</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
@@ -1226,7 +1225,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gad15eb53b97ba36c679da57c
 <p >All images must have the same width and height. Source and destination images must have the same format (8 bit per channel, for example GRAY8, UV16, BGR24 or BGRA32). Alpha must be 8-bit gray image.</p>
 <p >For every point: </p><pre class="fragment">dst[x, y, c] = (src[x, y, c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
 </pre><p >This function is used for image drawing.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga24392fe6c365dbb60678c9540c860e83" title="Performs alpha blending operation.">SimdAlphaBlending</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga24392fe6c365dbb60678c9540c860e83" title="Blends source image over destination image using per-pixel 8-bit alpha mask.">SimdAlphaBlending</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a foreground image. </td></tr>
@@ -1287,7 +1286,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga96b177f21682b935bd7db05
 <p >For every point: </p><pre class="fragment">tmp = (src0[x, y, c]*alpha0[x, y] + dst[x, y, c]*(255 - alpha0[x, y]))/255;
 dst[x, y, c] = (src1[x, y, c]*alpha1[x, y] + tmp*(255 - alpha1[x, y]))/255;
 </pre><p >This function is used for image drawing.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#gaab78a5cb7b7cf946aa38664b6d1efbe9" title="Performs double alpha blending operation.">SimdAlphaBlending2x</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#gaab78a5cb7b7cf946aa38664b6d1efbe9" title="Performs two sequential alpha blendings of source images over destination image.">SimdAlphaBlending2x</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src0</td><td>- the first foreground image. </td></tr>
@@ -1347,7 +1346,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gadcba495fb909153866dc329
 
 <p>Performs alpha blending of BGRA image to YUV420P. </p>
 <p >This function is used for image drawing. The input BGRA and output Y images must have the same width and height. The output U and V images must have the same width and height (half size relative to Y component).</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#gac766d3d07d13efab36a347b595424db1" title="Performs alpha blending of BGRA image to YUV420P.">SimdAlphaBlendingBgraToYuv420p</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#gac766d3d07d13efab36a347b595424db1" title="Converts BGRA to YUV420P and alpha-blends it with destination Y, U and V planes.">SimdAlphaBlendingBgraToYuv420p</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">bgra</td><td>- a foreground BGRA-32 image. </td></tr>
@@ -1397,7 +1396,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga9ef2c822405d741c6d0bb93
 <p >All images must have the same width and height. Source and destination images must have the same format (8 bit per channel, for example GRAY8, UV16, BGR24 or BGRA32).</p>
 <p >For every point: </p><pre class="fragment">dst[x, y, c] = (src[x, y, c]*alpha + dst[x, y, c]*(255 - alpha))/255;
 </pre><p >This function is used for image drawing.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#gac93b9719fbb80c729042c6cf8a737970" title="Performs uniform alpha blending operation.">SimdAlphaBlendingUniform</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#gac93b9719fbb80c729042c6cf8a737970" title="Blends source image over destination image with the same alpha value for all pixels.">SimdAlphaBlendingUniform</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a foreground image. </td></tr>
@@ -1445,7 +1444,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gafe81bb20ad2ddd1f1b53307
 <p >All images must have the same width and height. Destination images must have 8 bit per channel (for example GRAY8, BGR24 or BGRA32). Alpha must be 8-bit gray image.</p>
 <p >For every point: </p><pre class="fragment">dst[x, y, c] = (pixel[c]*alpha[x, y] + dst[x, y, c]*(255 - alpha[x, y]))/255;
 </pre><p >This function is used for image drawing.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga8452528e17e594a4b12d82715425ae89" title="Performs alpha filling operation.">SimdAlphaFilling</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga8452528e17e594a4b12d82715425ae89" title="Blends constant pixel value into destination image using per-pixel 8-bit alpha mask.">SimdAlphaFilling</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in,out]</td><td class="paramname">dst</td><td>- a background image. </td></tr>
@@ -1490,7 +1489,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf3e23da64d8cd43df7f1d09
  dst[x, y, 2] = src[x, y, 2] * src[x, y, 3] / 255;
  dst[x, y, 3] = src[x, y, 3];
 </pre><p >This function is used for image drawing as a part of alpha blending operation.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga90a02f756b5bfca963d531127195bbce" title="Performs premultiply operation.">SimdAlphaPremultiply</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga90a02f756b5bfca963d531127195bbce" title="Converts straight-alpha 4-channel image to premultiplied-alpha representation.">SimdAlphaPremultiply</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- an input image. </td></tr>
@@ -1534,7 +1533,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gaf368927da76f48bfdb141f1
  dst[x, y, 2] = src[x, y, 2] / src[x, y, 3] * 255;
  dst[x, y, 3] = src[x, y, 3];
 </pre><p >This function is used for image drawing as a part of alpha blending operation.</p>
-<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga9f2d3f5d8e87939f998b5b712f244b4d" title="Performs unpremultiply operation.">SimdAlphaUnpremultiply</a>.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__drawing.html#ga9f2d3f5d8e87939f998b5b712f244b4d" title="Converts premultiplied-alpha 4-channel image to straight-alpha representation.">SimdAlphaUnpremultiply</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- an input image. </td></tr>
diff --git a/docs/help/group__hash.html b/docs/help/group__hash.html
index 3e8bac2355..f0a73d428e 100644
--- a/docs/help/group__hash.html
+++ b/docs/help/group__hash.html
@@ -51,10 +51,10 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:ga780733c618846d58525d9cb2a9adf13b"><td class="memItemLeft" align="right" valign="top">SIMD_API uint32_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__hash.html#ga780733c618846d58525d9cb2a9adf13b">SimdCrc32</a> (const void *src, size_t size)</td></tr>
-<tr class="memdesc:ga780733c618846d58525d9cb2a9adf13b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets 32-bit cyclic redundancy check (CRC32) for current data.  <a href="group__hash.html#ga780733c618846d58525d9cb2a9adf13b">More...</a><br /></td></tr>
+<tr class="memdesc:ga780733c618846d58525d9cb2a9adf13b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates 32-bit cyclic redundancy check (CRC-32) for input data.  <a href="group__hash.html#ga780733c618846d58525d9cb2a9adf13b">More...</a><br /></td></tr>
 <tr class="separator:ga780733c618846d58525d9cb2a9adf13b"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gacefa4dd81a5c0c5cfa54e5a2e5c59a68"><td class="memItemLeft" align="right" valign="top">SIMD_API uint32_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__hash.html#gacefa4dd81a5c0c5cfa54e5a2e5c59a68">SimdCrc32c</a> (const void *src, size_t size)</td></tr>
-<tr class="memdesc:gacefa4dd81a5c0c5cfa54e5a2e5c59a68"><td class="mdescLeft">&#160;</td><td class="mdescRight">Gets 32-bit cyclic redundancy check (CRC32c) for current data.  <a href="group__hash.html#gacefa4dd81a5c0c5cfa54e5a2e5c59a68">More...</a><br /></td></tr>
+<tr class="memdesc:gacefa4dd81a5c0c5cfa54e5a2e5c59a68"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates 32-bit cyclic redundancy check (CRC-32C, Castagnoli) for input data.  <a href="group__hash.html#gacefa4dd81a5c0c5cfa54e5a2e5c59a68">More...</a><br /></td></tr>
 <tr class="separator:gacefa4dd81a5c0c5cfa54e5a2e5c59a68"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
@@ -86,8 +86,8 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga780733c618846d58525d9cb
       </table>
 </div><div class="memdoc">
 
-<p>Gets 32-bit cyclic redundancy check (CRC32) for current data. </p>
-<p >Calculation is performed for polynomial 0xEDB88320.</p>
+<p>Calculates 32-bit cyclic redundancy check (CRC-32) for input data. </p>
+<p >The function uses reflected polynomial 0xEDB88320, initial value 0xFFFFFFFF and final bitwise inversion.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to data. </td></tr>
@@ -95,7 +95,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga780733c618846d58525d9cb
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>32-bit cyclic redundancy check (CRC32). </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>32-bit cyclic redundancy check (CRC-32) of the input buffer. </dd></dl>
 
 </div>
 </div>
@@ -125,8 +125,8 @@ <h2 class="memtitle"><span class="permalink"><a href="#gacefa4dd81a5c0c5cfa54e5a
       </table>
 </div><div class="memdoc">
 
-<p>Gets 32-bit cyclic redundancy check (CRC32c) for current data. </p>
-<p >Calculation is performed for polynomial 0x1EDC6F41 (Castagnoli-crc).</p>
+<p>Calculates 32-bit cyclic redundancy check (CRC-32C, Castagnoli) for input data. </p>
+<p >The function uses Castagnoli polynomial (reflected form 0x82F63B78, normal form 0x1EDC6F41), initial value 0xFFFFFFFF and final bitwise inversion.</p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a pointer to data. </td></tr>
@@ -134,7 +134,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#gacefa4dd81a5c0c5cfa54e5a
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>32-bit cyclic redundancy check (CRC32c). </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>32-bit cyclic redundancy check (CRC-32C) of the input buffer. </dd></dl>
 
 </div>
 </div>
diff --git a/docs/help/group__other__filter.html b/docs/help/group__other__filter.html
index 5fc7802a10..3a37ac7822 100644
--- a/docs/help/group__other__filter.html
+++ b/docs/help/group__other__filter.html
@@ -51,7 +51,7 @@ <h1>Simd Library Documentation.</h1>
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
 <tr class="memitem:ga173589ebbd59bf2d214c8a82cefe1487"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__other__filter.html#ga173589ebbd59bf2d214c8a82cefe1487">SimdAbsGradientSaturatedSum</a> (const uint8_t *src, size_t srcStride, size_t width, size_t height, uint8_t *dst, size_t dstStride)</td></tr>
-<tr class="memdesc:ga173589ebbd59bf2d214c8a82cefe1487"><td class="mdescLeft">&#160;</td><td class="mdescRight">Puts to destination 8-bit gray image saturated sum of absolute gradient for every point of source 8-bit gray image.  <a href="group__other__filter.html#ga173589ebbd59bf2d214c8a82cefe1487">More...</a><br /></td></tr>
+<tr class="memdesc:ga173589ebbd59bf2d214c8a82cefe1487"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates saturated sum of horizontal and vertical absolute gradients for each pixel of 8-bit gray image.  <a href="group__other__filter.html#ga173589ebbd59bf2d214c8a82cefe1487">More...</a><br /></td></tr>
 <tr class="separator:ga173589ebbd59bf2d214c8a82cefe1487"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gaaa72e337646111ae500ae87621912f2a"><td class="memItemLeft" align="right" valign="top">SIMD_API void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__other__filter.html#gaaa72e337646111ae500ae87621912f2a">SimdLbpEstimate</a> (const uint8_t *src, size_t srcStride, size_t width, size_t height, uint8_t *dst, size_t dstStride)</td></tr>
 <tr class="memdesc:gaaa72e337646111ae500ae87621912f2a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Calculates LBP (Local Binary Patterns) for 8-bit gray image.  <a href="group__other__filter.html#gaaa72e337646111ae500ae87621912f2a">More...</a><br /></td></tr>
@@ -129,10 +129,10 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga173589ebbd59bf2d214c8a8
       </table>
 </div><div class="memdoc">
 
-<p>Puts to destination 8-bit gray image saturated sum of absolute gradient for every point of source 8-bit gray image. </p>
+<p>Calculates saturated sum of horizontal and vertical absolute gradients for each pixel of 8-bit gray image. </p>
 <p >Both images must have the same width and height.</p>
 <p >For border pixels: </p><pre class="fragment">dst[x, y] = 0;
-</pre><p >For other pixels: </p><pre class="fragment">dx = abs(src[x + 1, y] - src[x - 1, y]);
+</pre><p >For non-border pixels: </p><pre class="fragment">dx = abs(src[x + 1, y] - src[x - 1, y]);
 dy = abs(src[x, y + 1] - src[x, y - 1]);
 dst[x, y] = min(dx + dy, 255);
 </pre><dl class="section note"><dt>Note</dt><dd>This function has a C++ wrapper Simd::AbsGradientSaturatedSum(const View&lt;A&gt;&amp; src, View&lt;A&gt;&amp; dst).</dd></dl>
@@ -326,7 +326,7 @@ <h2 class="memtitle"><span class="permalink"><a href="#ga9cc37f1a90fb1a8eb17f6fc
 </pre><p >For other pixels: </p><pre class="fragment">dx = abs(src[x + 1, y] - src[x - 1, y]);
 dy = abs(src[x, y + 1] - src[x, y - 1]);
 dst[x, y] = min(dx + dy, 255);
-</pre><dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__other__filter.html#ga173589ebbd59bf2d214c8a82cefe1487" title="Puts to destination 8-bit gray image saturated sum of absolute gradient for every point of source 8-b...">SimdAbsGradientSaturatedSum</a>.</dd></dl>
+</pre><dl class="section note"><dt>Note</dt><dd>This function is a C++ wrapper for function <a class="el" href="group__other__filter.html#ga173589ebbd59bf2d214c8a82cefe1487" title="Calculates saturated sum of horizontal and vertical absolute gradients for each pixel of 8-bit gray i...">SimdAbsGradientSaturatedSum</a>.</dd></dl>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
     <tr><td class="paramdir">[in]</td><td class="paramname">src</td><td>- a source 8-bit gray image. </td></tr>

From 46b4d0ebbafab9e8420476c4d0f036bc96582740 Mon Sep 17 00:00:00 2001
From: Ihar Yermalayeu <ermig@tut.by>
Date: Tue, 26 May 2026 11:31:45 +0300
Subject: [PATCH 32/32] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 docs/2026.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/2026.html b/docs/2026.html
index 38614265fe..9c88f424aa 100644
--- a/docs/2026.html
+++ b/docs/2026.html
@@ -53,7 +53,7 @@ <h5>New features</h5>
  <li>NEON optimizations of function Crc32c.</li>
  <li>NEON optimizations of function Crc32.</li>
  <li>Support of 8-bit BMP in function ImageLoadBmp.</li>
- <li>SVE optimizations of function GetStatistics.</li>
+ <li>SVE optimizations of function GetStatistic.</li>
 </ul>
 <h5>Bug fixing</h5>
 <ul>

[in]	context	- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
[in]	context	- a pointer to Integer Descriptor Engine context. It must be created by function SimdDescrIntInit and released by function SimdRelease.
[in]	context	- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
[in]	context	- a font context. It must be created by function SimdFontInit and released by function SimdRelease.
[in]	height	- a new height of font.
[in]	filter	- a filter context. It must be created by function SimdGaussianBlurInit and released by function SimdRelease.
[in]	filter	- a filter context. It must be created by function SimdGaussianBlurInit and released by function SimdRelease.
[in]	src	- a pointer to pixels data of the original input image.
[in]	srcStride	- a row size (in bytes) of the input image.
[out]	dst	- a pointer to pixels data of the filtered output image.
	Gets version of Simd Library. More...

SIMD_API const char *	SimdCpuDesc (SimdCpuDescType type)
	Gets description of CPU and Simd Library. More...
	Gets a text description of the CPU. More...

SIMD_API uint64_t	SimdCpuInfo (SimdCpuInfoType type)
	Gets information about CPU and Simd Library. More...
[in]	type	- a type of required description.
[in]	type	- a type of required description. See SimdCpuDescType.
[in]	size	- a size of memory block.
[in]	align	- a required alignment of memory block.
[in]	size	- the number of bytes to allocate. Must be greater than zero.
[in]	align	- the required alignment of the allocated block in bytes. Must be a power of two. Use SimdAlignment to obtain the optimal alignment for the current platform.
[in]	ptr	- a pointer to the memory to be deleted.
[in]	ptr	- a pointer to the memory block to free. Must have been returned by SimdAllocate, or `NULL` (in which case the call has no effect).
[in]	size	- an original size.
[in]	align	- a required alignment.
[in]	size	- the original size in bytes (or elements) to be aligned.
[in]	align	- the required alignment in bytes. Must be a positive power of two. Use SimdAlignment to obtain the optimal alignment for the current platform.
[in]	context	- a context to be released.
[in]	context	- a pointer to the context to be released, or `NULL`.
[in]	resizer	- a resize context. It must be created by function SimdResizerInit and released by function SimdRelease.
[in]	resizer	- a resize context. It must be created by function SimdResizerInit and released by function SimdRelease.
[in]	src	- a pointer to pixels data of the original input image.
[in]	srcStride	- a row size (in bytes) of the input image.
[out]	dst	- a pointer to pixels data of the resized output image.
[in]	context	- a shift detector context. It must be created by function SimdShiftDetectorInitBuffers and released by function SimdRelease.
[in]	context	- a shift detector context. It must be created by function SimdShiftDetectorInitBuffers and released by function SimdRelease.
[in]	bkg	- a pointer to pixels data of background image.
[in]	bkgStride	- a row size of the background image.
[in]	makeCopy	- if true, copy of the background will be created.