hw-native-sys · mouliangyu · Jul 1, 2026 · gemini-code-assist · Jul 1, 2026
diff --git a/docs/isa/micro-isa/10-reduction-ops.md b/docs/isa/micro-isa/10-reduction-ops.md
@@ -194,7 +194,10 @@ for (int i = 0; i < N; i++) {
 
 ## Per-VLane (Group) Reductions
 
-The vector register is organized as **8 VLanes** of 32 bytes each. Group reductions operate within each VLane independently.
+The vector register is organized as **8 VLanes** of 32 bytes each. Group
+reductions operate within each VLane independently and produce one result per
+VLane. The 8 VLane results are written contiguously to the low elements of the
+destination vector; all remaining destination elements are zero.
 
 ```
 vreg layout (f32 example, 64 elements total):
@@ -206,79 +209,101 @@ VLane 4: [32..39] VLane 5: [40..47] VLane 6: [48..55] VLane 7: [56..63]
 
 - **syntax:** `%result = pto.vcgadd %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
 - **A5 types:** i16-i32, f16, f32
-- **semantics:** Sum within each VLane. 8 results at indices 0, 8, 16, 24, 32, 40, 48, 56 (for f32).
+- **semantics:** Sum active elements within each 32-byte VLane. The 8 VLane
+  sums are written to result elements `0..7`; all other result elements are
+  zero.
 
 ```c
-int K = N / 8;  // elements per VLane
+int groups = 8;
+int K = 32 / sizeof(T);  // elements per 32-byte VLane
-for (int g = 0; g < 8; g++) {
+for (int g = 0; g < groups; g++) {
     T sum = 0;
     for (int i = 0; i < K; i++)
-        sum += src[g*K + i];
-    dst[g*K] = sum;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            sum += src[g*K + i];
+    dst[g] = sum;
 }
-// For f32: results at dst[0], dst[8], dst[16], dst[24], dst[32], dst[40], dst[48], dst[56]
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 - **inputs:** `%input` is the source vector and `%mask` selects participating
   lanes.
 - **outputs:** `%result` contains one sum per 32-byte VLane group, written
-  contiguously into the low slot of each group.
+  contiguously to the low elements of the result vector.
 - **constraints and limitations:** This is a per-32-byte VLane-group reduction.
-  Inactive lanes are treated as zero.
+  Inactive lanes are treated as zero. If all lanes in a VLane are inactive, the
+  corresponding result element is `0` (`+0` for floating-point types).
 
 ---
 
 ### `pto.vcgmax`
 
 - **syntax:** `%result = pto.vcgmax %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
 - **A5 types:** i16-i32, f16, f32
-- **semantics:** Max within each VLane.
+- **semantics:** Find the maximum active element within each 32-byte VLane. The
+  8 VLane maxima are written to result elements `0..7`; all other result
+  elements are zero.
 
 ```c
-int K = N / 8;
+int groups = 8;
+int K = 32 / sizeof(T);
-for (int g = 0; g < 8; g++) {
+for (int g = 0; g < groups; g++) {
-    T mx = -INF;
+    T mx = max_identity_for_T;  // -INF for float, minimum value for integer
     for (int i = 0; i < K; i++)
-        if (src[g*K + i] > mx) mx = src[g*K + i];
-    dst[g*K] = mx;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            mx = max(mx, src[g*K + i]);
+    dst[g] = mx;
 }
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 - **inputs:** `%input` is the source vector and `%mask` selects participating
   lanes.
-- **outputs:** `%result` contains one maximum per 32-byte VLane group.
+- **outputs:** `%result` contains one maximum per 32-byte VLane group, written
+  contiguously to the low elements of the result vector.
 - **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by
-  arbitrary software subvector.
+  arbitrary software subvector. Inactive floating-point lanes are treated as
+  `-INF`; inactive integer lanes are treated as the element type's minimum
+  value. If all lanes in a VLane are inactive, that neutral value is written for
+  the corresponding VLane result. For floating-point values, `max(+0, -0)`
+  returns `+0`.
 
 ---
 
 ### `pto.vcgmin`
 
 - **syntax:** `%result = pto.vcgmin %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
 - **A5 types:** i16-i32, f16, f32
-- **semantics:** Min within each VLane.
+- **semantics:** Find the minimum active element within each 32-byte VLane. The
+  8 VLane minima are written to result elements `0..7`; all other result
+  elements are zero.
 
 ```c
-int K = N / 8;
+int groups = 8;
+int K = 32 / sizeof(T);
-for (int g = 0; g < 8; g++) {
+for (int g = 0; g < groups; g++) {
-    T mn = INF;
+    T mn = min_identity_for_T;  // +INF for float, maximum value for integer
     for (int i = 0; i < K; i++)
-        if (src[g*K + i] < mn) mn = src[g*K + i];
-    dst[g*K] = mn;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            mn = min(mn, src[g*K + i]);
+    dst[g] = mn;
 }
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 - **inputs:** `%input` is the source vector and `%mask` selects participating
   lanes.
-- **outputs:** `%result` contains one minimum per 32-byte VLane group.
+- **outputs:** `%result` contains one minimum per 32-byte VLane group, written
+  contiguously to the low elements of the result vector.
 - **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by
-  arbitrary software subvector.
+  arbitrary software subvector. Inactive floating-point lanes are treated as
+  `+INF`; inactive integer lanes are treated as the element type's maximum
+  value. If all lanes in a VLane are inactive, that neutral value is written for
+  the corresponding VLane result. For floating-point values, `min(-0, +0)`
+  returns `-0`.
 
 ---
 
@@ -318,9 +343,9 @@ for (int i = 1; i < N; i++)
 // max is in lane 0, broadcast it
 %max_broadcast = pto.vlds %ub_tmp[%c0] {dist = "BRC_B32"} : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>
 
-// Row-wise sum using vcgadd (for 8-row tile)
+// Per-VLane sums using vcgadd
 %row_sums = pto.vcgadd %tile, %mask : !pto.vreg<64xf32>, !pto.mask<G> -> !pto.vreg<64xf32>
-// Results at indices 0, 8, 16, 24, 32, 40, 48, 56
+// Results at indices 0..7; remaining elements are zero
 
 // Full vector sum for normalization
 %total = pto.vcadd %values, %mask : !pto.vreg<64xf32>, !pto.mask<G> -> !pto.vreg<64xf32>

diff --git a/docs/vpto-spec.md b/docs/vpto-spec.md
@@ -1337,15 +1337,17 @@ for (int i = 0; i < N; i++)
 **Example — pto.vcgadd (group reduction per VLane) semantics:**
 
 ```c
-int K = N / 8;  // elements per VLane
+int groups = 8;
+int K = 32 / sizeof(T);  // elements per 32-byte VLane
-for (int g = 0; g < 8; g++) {
+for (int g = 0; g < groups; g++) {
     T sum = 0;
     for (int i = 0; i < K; i++)
-        sum += src[g*K + i];
-    dst[g*K] = sum;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            sum += src[g*K + i];
+    dst[g] = sum;
 }
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 For A5 reduction result types:

diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/compare.py b/test/vpto/cases/micro-op/reduction/vcg-group/compare.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import os
+import sys
+
+import numpy as np
+
+
+def compare_bin(golden_path, output_path, eps):
+    if not os.path.exists(golden_path):
+        print(f"[ERROR] Golden missing: {golden_path}")
+        return False
+    if not os.path.exists(output_path):
+        print(f"[ERROR] Output missing: {output_path}")
+        return False
+
+    golden = np.fromfile(golden_path, dtype=np.float32)
+    output = np.fromfile(output_path, dtype=np.float32)
+    if golden.shape != output.shape:
+        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
+        return False
+    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
+        diff = np.abs(golden.astype(np.float64) - output.astype(np.float64))
+        idx = int(np.argmax(diff))
+        print(
+            f"[ERROR] Mismatch: {golden_path} vs {output_path}, "
+            f"idx={idx}, golden={golden[idx]}, output={output[idx]}, max_diff={diff[idx]}"
+        )
+        return False
+    return True
+
+
+def main():
+    strict = os.getenv("COMPARE_STRICT", "1") != "0"
+    checks = [
+        ("golden_add.bin", "out_add.bin", "vcgadd"),
+        ("golden_max.bin", "out_max.bin", "vcgmax"),
+        ("golden_min.bin", "out_min.bin", "vcgmin"),
+    ]
+    failed = []
+    for golden, output, label in checks:
+        if not compare_bin(golden, output, 1e-4):
+            failed.append(label)
+            print(f"[ERROR] compare failed: {label}")
+    if failed:
+        if strict:
+            print(f"[ERROR] {len(failed)} check(s) failed: {', '.join(failed)}")
+            sys.exit(2)
+        print(f"[WARN] {len(failed)} check(s) failed (non-gating): {', '.join(failed)}")
+        return
+    print("[INFO] compare passed")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/golden.py b/test/vpto/cases/micro-op/reduction/vcg-group/golden.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+
+
+GROUPS = 8
+ELEMS_PER_GROUP = 8
+LANES = GROUPS * ELEMS_PER_GROUP
+
+
+def generate(output_dir: Path) -> None:
+    src = np.array(
+        [
+            -7.0, 1.0, 3.5, -2.0, 9.0, -4.5, 6.0, 0.5,
+            8.0, -1.0, -3.0, 4.0, 2.0, -6.0, 5.5, 7.0,
+            -0.0, 0.0, -5.0, 5.0, 11.0, -12.0, 13.0, -14.0,
+            1.25, 2.25, 3.25, 4.25, -8.0, -9.0, 10.0, -10.0,
+            15.0, 14.0, 13.0, 12.0, -1.5, -2.5, -3.5, -4.5,
+            -20.0, -19.0, -18.0, -17.0, 16.0, 15.5, 14.5, 13.5,
+            0.25, -0.75, 1.5, -2.25, 3.0, -3.75, 4.5, -5.25,
+            31.0, -32.0, 33.0, -34.0, 35.0, -36.0, 37.0, -38.0,
+        ],
+        dtype=np.float32,
+    )
+    groups = src.reshape(GROUPS, ELEMS_PER_GROUP)
+
+    golden_add = np.zeros(LANES, dtype=np.float32)
+    golden_max = np.zeros(LANES, dtype=np.float32)
+    golden_min = np.zeros(LANES, dtype=np.float32)
+    golden_add[:GROUPS] = np.sum(groups, axis=1, dtype=np.float32)
+    golden_max[:GROUPS] = np.max(groups, axis=1)
+    golden_min[:GROUPS] = np.min(groups, axis=1)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    src.tofile(output_dir / "src.bin")
+    golden_add.tofile(output_dir / "golden_add.bin")
+    golden_max.tofile(output_dir / "golden_max.bin")
+    golden_min.tofile(output_dir / "golden_min.bin")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output-dir", type=Path, default=Path("."))
+    args = parser.parse_args()
+    generate(args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto b/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @vcg_group_kernel(%src: !pto.ptr<f32, gm>,
+                              %dst_add: !pto.ptr<f32, gm>,
+                              %dst_max: !pto.ptr<f32, gm>,
+                              %dst_min: !pto.ptr<f32, gm>) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c0_i64 = arith.constant 0 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c256_i64 = arith.constant 256 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c8192_i64 = arith.constant 8192 : i64
+    %c12288_i64 = arith.constant 12288 : i64
+
+    %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr<f32, ub>
+    %ub_add = pto.castptr %c4096_i64 : i64 -> !pto.ptr<f32, ub>
+    %ub_max = pto.castptr %c8192_i64 : i64 -> !pto.ptr<f32, ub>
+    %ub_min = pto.castptr %c12288_i64 : i64 -> !pto.ptr<f32, ub>
+
+    pto.mte_gm_ub %src, %ub_src, %c0_i64, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, gm>, !pto.ptr<f32, ub>, i64, i64, i64, i64, i64
+
+    pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"]
+    pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"]
+
+    pto.vecscope {
+      %mask = pto.pset_b32 "PAT_ALL" : !pto.mask<b32>
+      %vec = pto.vlds %ub_src[%c0] : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>
+      %add = pto.vcgadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
+      %max = pto.vcgmax %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
+      %min = pto.vcgmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
+      pto.vsts %add, %ub_add[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
+      pto.vsts %max, %ub_max[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
+      pto.vsts %min, %ub_min[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
+    }
+
+    pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"]
+    pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"]
+
+    pto.mte_ub_gm %ub_add, %dst_add, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
+    pto.mte_ub_gm %ub_max, %dst_max, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
+    pto.mte_ub_gm %ub_min, %dst_min, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
+    pto.barrier #pto.pipe<PIPE_ALL>
+    return
+  }
+}