Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 55 additions & 30 deletions docs/isa/micro-isa/10-reduction-ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,10 @@ for (int i = 0; i < N; i++) {

## Per-VLane (Group) Reductions

The vector register is organized as **8 VLanes** of 32 bytes each. Group reductions operate within each VLane independently.
The vector register is organized as **8 VLanes** of 32 bytes each. Group
reductions operate within each VLane independently and produce one result per
VLane. The 8 VLane results are written contiguously to destination elements
`0..7`; all remaining destination elements (`8..N-1`) are zero.
Comment on lines +197 to +200

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The updated documentation states that group reductions produce one result per VLane and write them contiguously to the low elements of the destination vector. However, the documentation should explicitly clarify the behavior when the vector register is not fully populated (e.g., when the element type has fewer than 64 elements, such as i64 which has 32 elements). Please clarify if the remaining elements are always zero-filled up to the full vector register size N regardless of the element type.


```
vreg layout (f32 example, 64 elements total):
Expand All @@ -206,79 +209,101 @@ VLane 4: [32..39] VLane 5: [40..47] VLane 6: [48..55] VLane 7: [56..63]

- **syntax:** `%result = pto.vcgadd %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
- **A5 types:** i16-i32, f16, f32
- **semantics:** Sum within each VLane. 8 results at indices 0, 8, 16, 24, 32, 40, 48, 56 (for f32).
- **semantics:** Sum active elements within each 32-byte VLane. The 8 VLane
sums are written to result elements `0..7`; all other result elements are
zero.

```c
int K = N / 8; // elements per VLane
int groups = 8;
int K = 32 / sizeof(T); // elements per 32-byte VLane
for (int g = 0; g < 8; g++) {
T sum = 0;
for (int i = 0; i < K; i++)
sum += src[g*K + i];
dst[g*K] = sum;
for (int i = 1; i < K; i++)
dst[g*K + i] = 0;
if (mask[g*K + i])
sum += src[g*K + i];
dst[g] = sum;
}
// For f32: results at dst[0], dst[8], dst[16], dst[24], dst[32], dst[40], dst[48], dst[56]
for (int i = groups; i < N; i++)
dst[i] = 0;
```

- **inputs:** `%input` is the source vector and `%mask` selects participating
lanes.
- **outputs:** `%result` contains one sum per 32-byte VLane group, written
contiguously into the low slot of each group.
contiguously to the low elements of the result vector.
- **constraints and limitations:** This is a per-32-byte VLane-group reduction.
Inactive lanes are treated as zero.
Inactive lanes are treated as zero. If all lanes in a VLane are inactive, the
corresponding result element is `0` (`+0` for floating-point types).

---

### `pto.vcgmax`

- **syntax:** `%result = pto.vcgmax %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
- **A5 types:** i16-i32, f16, f32
- **semantics:** Max within each VLane.
- **semantics:** Find the maximum active element within each 32-byte VLane. The
8 VLane maxima are written to result elements `0..7`; all other result
elements are zero.

```c
int K = N / 8;
int groups = 8;
int K = 32 / sizeof(T);
for (int g = 0; g < 8; g++) {
T mx = -INF;
T mx = max_identity_for_T; // -INF for float, minimum value for integer
for (int i = 0; i < K; i++)
if (src[g*K + i] > mx) mx = src[g*K + i];
dst[g*K] = mx;
for (int i = 1; i < K; i++)
dst[g*K + i] = 0;
if (mask[g*K + i])
mx = max(mx, src[g*K + i]);
dst[g] = mx;
}
for (int i = groups; i < N; i++)
dst[i] = 0;
```

- **inputs:** `%input` is the source vector and `%mask` selects participating
lanes.
- **outputs:** `%result` contains one maximum per 32-byte VLane group.
- **outputs:** `%result` contains one maximum per 32-byte VLane group, written
contiguously to the low elements of the result vector.
- **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by
arbitrary software subvector.
arbitrary software subvector. Inactive floating-point lanes are treated as
`-INF`; inactive integer lanes are treated as the element type's minimum
value. If all lanes in a VLane are inactive, that neutral value is written for
the corresponding VLane result. For floating-point values, `max(+0, -0)`
returns `+0`.

---

### `pto.vcgmin`

- **syntax:** `%result = pto.vcgmin %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
- **A5 types:** i16-i32, f16, f32
- **semantics:** Min within each VLane.
- **semantics:** Find the minimum active element within each 32-byte VLane. The
8 VLane minima are written to result elements `0..7`; all other result
elements are zero.

```c
int K = N / 8;
int groups = 8;
int K = 32 / sizeof(T);
for (int g = 0; g < 8; g++) {
T mn = INF;
T mn = min_identity_for_T; // +INF for float, maximum value for integer
for (int i = 0; i < K; i++)
if (src[g*K + i] < mn) mn = src[g*K + i];
dst[g*K] = mn;
for (int i = 1; i < K; i++)
dst[g*K + i] = 0;
if (mask[g*K + i])
mn = min(mn, src[g*K + i]);
dst[g] = mn;
}
for (int i = groups; i < N; i++)
dst[i] = 0;
```

- **inputs:** `%input` is the source vector and `%mask` selects participating
lanes.
- **outputs:** `%result` contains one minimum per 32-byte VLane group.
- **outputs:** `%result` contains one minimum per 32-byte VLane group, written
contiguously to the low elements of the result vector.
- **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by
arbitrary software subvector.
arbitrary software subvector. Inactive floating-point lanes are treated as
`+INF`; inactive integer lanes are treated as the element type's maximum
value. If all lanes in a VLane are inactive, that neutral value is written for
the corresponding VLane result. For floating-point values, `min(-0, +0)`
returns `-0`.

---

Expand Down Expand Up @@ -318,9 +343,9 @@ for (int i = 1; i < N; i++)
// max is in lane 0, broadcast it
%max_broadcast = pto.vlds %ub_tmp[%c0] {dist = "BRC_B32"} : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>

// Row-wise sum using vcgadd (for 8-row tile)
// Per-VLane sums using vcgadd
%row_sums = pto.vcgadd %tile, %mask : !pto.vreg<64xf32>, !pto.mask<G> -> !pto.vreg<64xf32>
// Results at indices 0, 8, 16, 24, 32, 40, 48, 56
// Results at indices 0..7; remaining elements are zero

// Full vector sum for normalization
%total = pto.vcadd %values, %mask : !pto.vreg<64xf32>, !pto.mask<G> -> !pto.vreg<64xf32>
Expand Down
12 changes: 7 additions & 5 deletions docs/vpto-spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -1337,15 +1337,17 @@ for (int i = 0; i < N; i++)
**Example — pto.vcgadd (group reduction per VLane) semantics:**

```c
int K = N / 8; // elements per VLane
int groups = 8;
int K = 32 / sizeof(T); // elements per 32-byte VLane
for (int g = 0; g < 8; g++) {
T sum = 0;
for (int i = 0; i < K; i++)
sum += src[g*K + i];
dst[g*K] = sum;
for (int i = 1; i < K; i++)
dst[g*K + i] = 0;
if (mask[g*K + i])
sum += src[g*K + i];
dst[g] = sum;
}
for (int i = groups; i < N; i++)
dst[i] = 0;
```

For A5 reduction result types:
Expand Down
62 changes: 62 additions & 0 deletions test/vpto/cases/micro-op/reduction/vcg-group/compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
# Copyright (c) 2026 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

import os
import sys

import numpy as np


def compare_bin(golden_path, output_path, eps):
    """Compare two raw float32 binary files element-wise within a tolerance.

    Args:
        golden_path: Path of the reference file, read as a flat float32 array.
        output_path: Path of the file under test, read the same way.
        eps: Tolerance, used as both the absolute and the relative tolerance.

    Returns:
        True when both files exist, hold the same number of elements and every
        element pair is close (NaN is considered equal to NaN). False
        otherwise; each failure prints a single ``[ERROR]`` line.
    """
    if not os.path.exists(golden_path):
        print(f"[ERROR] Golden missing: {golden_path}")
        return False
    if not os.path.exists(output_path):
        print(f"[ERROR] Output missing: {output_path}")
        return False

    golden = np.fromfile(golden_path, dtype=np.float32)
    output = np.fromfile(output_path, dtype=np.float32)
    if golden.shape != output.shape:
        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
        return False

    # Per-element form of the np.allclose predicate, kept so the report below
    # can be restricted to elements that really failed the check.
    close = np.isclose(golden, output, atol=eps, rtol=eps, equal_nan=True)
    if not close.all():
        # NaN-NaN and inf-inf subtract to NaN; that is expected here, so the
        # "invalid value" floating-point warning is suppressed.
        with np.errstate(invalid="ignore"):
            diff = np.abs(golden.astype(np.float64) - output.astype(np.float64))
        # argmax returns the first NaN it sees, which could be a matching
        # NaN/NaN or inf/inf pair. Rank only genuine mismatches: matching
        # elements sort last, and a NaN difference on a mismatching element
        # (NaN vs. number, +inf vs. -inf) counts as the worst possible.
        ranked = np.where(np.isnan(diff), np.inf, diff)
        ranked[close] = -1.0
        idx = int(np.argmax(ranked))
        print(
            f"[ERROR] Mismatch: {golden_path} vs {output_path}, "
            f"idx={idx}, golden={golden[idx]}, output={output[idx]}, max_diff={diff[idx]}"
        )
        return False
    return True


def main():
    """Compare every vcg* output file against its golden file and report.

    Files are looked up by relative name in the current working directory.
    When the COMPARE_STRICT environment variable is "0", failures are only
    reported as a warning; otherwise any failure exits with status 2.
    """
    gating = os.getenv("COMPARE_STRICT", "1") != "0"
    # op label -> (golden file, produced file); insertion order is check order.
    cases = {
        "vcgadd": ("golden_add.bin", "out_add.bin"),
        "vcgmax": ("golden_max.bin", "out_max.bin"),
        "vcgmin": ("golden_min.bin", "out_min.bin"),
    }

    failed = []
    for label, (golden, output) in cases.items():
        if compare_bin(golden, output, 1e-4):
            continue
        failed.append(label)
        print(f"[ERROR] compare failed: {label}")

    if not failed:
        print("[INFO] compare passed")
        return
    if not gating:
        print(f"[WARN] {len(failed)} check(s) failed (non-gating): {', '.join(failed)}")
        return
    print(f"[ERROR] {len(failed)} check(s) failed: {', '.join(failed)}")
    sys.exit(2)


if __name__ == "__main__":
    main()
59 changes: 59 additions & 0 deletions test/vpto/cases/micro-op/reduction/vcg-group/golden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# Copyright (c) 2026 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

import argparse
from pathlib import Path

import numpy as np


# Geometry of the test vector: 8 groups of 8 float32 elements (64 lanes).
GROUPS = 8
ELEMS_PER_GROUP = 8
LANES = GROUPS * ELEMS_PER_GROUP


def generate(output_dir: Path) -> None:
    """Write the input vector and the golden results for the group reductions.

    Produces ``src.bin``, ``golden_add.bin``, ``golden_max.bin`` and
    ``golden_min.bin`` in ``output_dir`` (created if missing), each holding
    LANES raw float32 values. Every golden vector carries the per-group
    sum / max / min in its first GROUPS elements and zeros everywhere else.
    """
    values = [
        -7.0, 1.0, 3.5, -2.0, 9.0, -4.5, 6.0, 0.5,
        8.0, -1.0, -3.0, 4.0, 2.0, -6.0, 5.5, 7.0,
        -0.0, 0.0, -5.0, 5.0, 11.0, -12.0, 13.0, -14.0,
        1.25, 2.25, 3.25, 4.25, -8.0, -9.0, 10.0, -10.0,
        15.0, 14.0, 13.0, 12.0, -1.5, -2.5, -3.5, -4.5,
        -20.0, -19.0, -18.0, -17.0, 16.0, 15.5, 14.5, 13.5,
        0.25, -0.75, 1.5, -2.25, 3.0, -3.75, 4.5, -5.25,
        31.0, -32.0, 33.0, -34.0, 35.0, -36.0, 37.0, -38.0,
    ]
    src = np.array(values, dtype=np.float32)
    per_group = src.reshape(GROUPS, ELEMS_PER_GROUP)

    # One reduced value per group, keyed by the golden file it belongs to.
    reductions = {
        "golden_add.bin": per_group.sum(axis=1, dtype=np.float32),
        "golden_max.bin": per_group.max(axis=1),
        "golden_min.bin": per_group.min(axis=1),
    }

    output_dir.mkdir(parents=True, exist_ok=True)
    src.tofile(output_dir / "src.bin")
    for file_name, reduced in reductions.items():
        # Results occupy the low GROUPS elements; the tail stays zero.
        expected = np.zeros(LANES, dtype=np.float32)
        expected[:GROUPS] = reduced
        expected.tofile(output_dir / file_name)


def main() -> None:
    """Command-line entry point: generate the data files into --output-dir.

    The directory defaults to the current working directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--output-dir", type=Path, default=Path("."))
    options = cli.parse_args()
    generate(options.output_dir)


if __name__ == "__main__":
    main()
60 changes: 60 additions & 0 deletions test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright (c) 2026 Huawei Technologies Co., Ltd.
// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
// CANN Open Software License Agreement Version 2.0 (the "License").
// Please refer to the License for details. You may not use this file except in compliance with the License.
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
// See LICENSE in the root of the software repository for the full text of the License.

// Test kernel for the per-VLane group reductions: loads one 64xf32 vector from
// %src, applies pto.vcgadd / pto.vcgmax / pto.vcgmin to it, and writes the
// three result vectors to %dst_add / %dst_max / %dst_min.
// The module is tagged for the "a5" target arch as a vector kernel.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
// Arguments: one input and three output f32 pointers, all in the gm space.
func.func @vcg_group_kernel(%src: !pto.ptr<f32, gm>,
%dst_add: !pto.ptr<f32, gm>,
%dst_max: !pto.ptr<f32, gm>,
%dst_min: !pto.ptr<f32, gm>) attributes {pto.kernel} {
// Index constant used as the element offset for every vlds/vsts below.
%c0 = arith.constant 0 : index
// i64 constants used as ub addresses and as transfer parameters.
// NOTE(review): 256 presumably is a byte count (64 x f32) — confirm against
// the pto.mte_gm_ub / pto.mte_ub_gm operand definitions.
%c0_i64 = arith.constant 0 : i64
%c1_i64 = arith.constant 1 : i64
%c256_i64 = arith.constant 256 : i64
%c4096_i64 = arith.constant 4096 : i64
%c8192_i64 = arith.constant 8192 : i64
%c12288_i64 = arith.constant 12288 : i64

// Four ub buffers created by casting fixed integer addresses: the source at 0
// and one buffer per result at 4096, 8192 and 12288.
// NOTE(review): the addresses look like byte offsets into ub — confirm.
%ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr<f32, ub>
%ub_add = pto.castptr %c4096_i64 : i64 -> !pto.ptr<f32, ub>
%ub_max = pto.castptr %c8192_i64 : i64 -> !pto.ptr<f32, ub>
%ub_min = pto.castptr %c12288_i64 : i64 -> !pto.ptr<f32, ub>

// Copy the input from gm into the ub source buffer (a single burst, per the
// nburst count of 1).
pto.mte_gm_ub %src, %ub_src, %c0_i64, %c256_i64
nburst(%c1_i64, %c256_i64, %c256_i64)
: !pto.ptr<f32, gm>, !pto.ptr<f32, ub>, i64, i64, i64, i64, i64

// PIPE_MTE2 -> PIPE_V event; presumably makes the vector code wait until the
// inbound copy has finished — confirm against the set_flag/wait_flag spec.
pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"]
pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"]

pto.vecscope {
// "PAT_ALL" b32 mask; presumably enables every lane — confirm.
%mask = pto.pset_b32 "PAT_ALL" : !pto.mask<b32>
// Load the 64xf32 source vector once; all three reductions consume it.
%vec = pto.vlds %ub_src[%c0] : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>
%add = pto.vcgadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
%max = pto.vcgmax %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
%min = pto.vcgmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
// Store each full result vector to its own ub buffer under the same mask.
pto.vsts %add, %ub_add[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
pto.vsts %max, %ub_max[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
pto.vsts %min, %ub_min[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
}

// PIPE_V -> PIPE_MTE3 event; presumably makes the outbound copies wait for the
// vector stores above — confirm.
pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"]
pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"]

// Copy the three result buffers from ub back to their gm destinations.
pto.mte_ub_gm %ub_add, %dst_add, %c256_i64
nburst(%c1_i64, %c256_i64, %c256_i64)
: !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
pto.mte_ub_gm %ub_max, %dst_max, %c256_i64
nburst(%c1_i64, %c256_i64, %c256_i64)
: !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
pto.mte_ub_gm %ub_min, %dst_min, %c256_i64
nburst(%c1_i64, %c256_i64, %c256_i64)
: !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
// Barrier over PIPE_ALL before returning.
pto.barrier #pto.pipe<PIPE_ALL>
return
}
}
Loading
Loading