From 465dc65d8f36e3c5ac04d0e88b1a33509dce67d5 Mon Sep 17 00:00:00 2001
From: mouliangyu <mouliangyu@huawei.com>
Date: Wed, 1 Jul 2026 15:34:52 +0800
Subject: [PATCH] Fix VCG group reduction semantics

---
 docs/isa/micro-isa/10-reduction-ops.md        |  85 ++++++++-----
 docs/vpto-spec.md                             |  12 +-
 .../micro-op/reduction/vcg-group/compare.py   |  62 ++++++++++
 .../micro-op/reduction/vcg-group/golden.py    |  59 +++++++++
 .../micro-op/reduction/vcg-group/kernel.pto   |  60 +++++++++
 .../micro-op/reduction/vcg-group/launch.cpp   |  47 +++++++
 .../micro-op/reduction/vcg-group/main.cpp     | 117 ++++++++++++++++++
 7 files changed, 407 insertions(+), 35 deletions(-)
 create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/compare.py
 create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/golden.py
 create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/main.cpp
diff --git a/docs/isa/micro-isa/10-reduction-ops.md b/docs/isa/micro-isa/10-reduction-ops.md
index ecae818f2..ebe95a7bf 100644
--- a/docs/isa/micro-isa/10-reduction-ops.md
+++ b/docs/isa/micro-isa/10-reduction-ops.md
@@ -194,7 +194,10 @@ for (int i = 0; i < N; i++) {
 
 ## Per-VLane (Group) Reductions
 
-The vector register is organized as **8 VLanes** of 32 bytes each. Group reductions operate within each VLane independently.
+The vector register is organized as **8 VLanes** of 32 bytes each. Group
+reductions operate within each VLane independently and produce one result per
+VLane. The 8 VLane results are written contiguously to the low elements of the
+destination vector; all remaining destination elements are zero.
 
 ```
 vreg layout (f32 example, 64 elements total):
@@ -206,27 +209,31 @@ VLane 4: [32..39] VLane 5: [40..47] VLane 6: [48..55] VLane 7: [56..63]
 
 - **syntax:** `%result = pto.vcgadd %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
 - **A5 types:** i16-i32, f16, f32
-- **semantics:** Sum within each VLane. 8 results at indices 0, 8, 16, 24, 32, 40, 48, 56 (for f32).
+- **semantics:** Sum active elements within each 32-byte VLane. The 8 VLane
+  sums are written to result elements `0..7`; all other result elements are
+  zero.
 
 ```c
-int K = N / 8;  // elements per VLane
+int groups = 8;
+int K = 32 / sizeof(T);  // elements per 32-byte VLane
 for (int g = 0; g < 8; g++) {
     T sum = 0;
     for (int i = 0; i < K; i++)
-        sum += src[g*K + i];
-    dst[g*K] = sum;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            sum += src[g*K + i];
+    dst[g] = sum;
 }
-// For f32: results at dst[0], dst[8], dst[16], dst[24], dst[32], dst[40], dst[48], dst[56]
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 - **inputs:** `%input` is the source vector and `%mask` selects participating
   lanes.
 - **outputs:** `%result` contains one sum per 32-byte VLane group, written
-  contiguously into the low slot of each group.
+  contiguously to the low elements of the result vector.
 - **constraints and limitations:** This is a per-32-byte VLane-group reduction.
-  Inactive lanes are treated as zero.
+  Inactive lanes are treated as zero. If all lanes in a VLane are inactive, the
+  corresponding result element is `0` (`+0` for floating-point types).
 
 ---
 
@@ -234,25 +241,34 @@ for (int g = 0; g < 8; g++) {
 
 - **syntax:** `%result = pto.vcgmax %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
 - **A5 types:** i16-i32, f16, f32
-- **semantics:** Max within each VLane.
+- **semantics:** Find the maximum active element within each 32-byte VLane. The
+  8 VLane maxima are written to result elements `0..7`; all other result
+  elements are zero.
 
 ```c
-int K = N / 8;
+int groups = 8;
+int K = 32 / sizeof(T);
 for (int g = 0; g < 8; g++) {
-    T mx = -INF;
+    T mx = max_identity_for_T;  // -INF for float, minimum value for integer
     for (int i = 0; i < K; i++)
-        if (src[g*K + i] > mx) mx = src[g*K + i];
-    dst[g*K] = mx;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            mx = max(mx, src[g*K + i]);
+    dst[g] = mx;
 }
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 - **inputs:** `%input` is the source vector and `%mask` selects participating
   lanes.
-- **outputs:** `%result` contains one maximum per 32-byte VLane group.
+- **outputs:** `%result` contains one maximum per 32-byte VLane group, written
+  contiguously to the low elements of the result vector.
 - **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by
-  arbitrary software subvector.
+  arbitrary software subvector. Inactive floating-point lanes are treated as
+  `-INF`; inactive integer lanes are treated as the element type's minimum
+  value. If all lanes in a VLane are inactive, that neutral value is written for
+  the corresponding VLane result. For floating-point values, `max(+0, -0)`
+  returns `+0`.
 
 ---
 
@@ -260,25 +276,34 @@ for (int g = 0; g < 8; g++) {
 
 - **syntax:** `%result = pto.vcgmin %input, %mask : !pto.vreg<NxT>, !pto.mask<G> -> !pto.vreg<NxT>`
 - **A5 types:** i16-i32, f16, f32
-- **semantics:** Min within each VLane.
+- **semantics:** Find the minimum active element within each 32-byte VLane. The
+  8 VLane minima are written to result elements `0..7`; all other result
+  elements are zero.
 
 ```c
-int K = N / 8;
+int groups = 8;
+int K = 32 / sizeof(T);
 for (int g = 0; g < 8; g++) {
-    T mn = INF;
+    T mn = min_identity_for_T;  // +INF for float, maximum value for integer
     for (int i = 0; i < K; i++)
-        if (src[g*K + i] < mn) mn = src[g*K + i];
-    dst[g*K] = mn;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            mn = min(mn, src[g*K + i]);
+    dst[g] = mn;
 }
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 - **inputs:** `%input` is the source vector and `%mask` selects participating
   lanes.
-- **outputs:** `%result` contains one minimum per 32-byte VLane group.
+- **outputs:** `%result` contains one minimum per 32-byte VLane group, written
+  contiguously to the low elements of the result vector.
 - **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by
-  arbitrary software subvector.
+  arbitrary software subvector. Inactive floating-point lanes are treated as
+  `+INF`; inactive integer lanes are treated as the element type's maximum
+  value. If all lanes in a VLane are inactive, that neutral value is written for
+  the corresponding VLane result. For floating-point values, `min(-0, +0)`
+  returns `-0`.
 
 ---
 
@@ -318,9 +343,9 @@ for (int i = 1; i < N; i++)
 // max is in lane 0, broadcast it
 %max_broadcast = pto.vlds %ub_tmp[%c0] {dist = "BRC_B32"} : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>
 
-// Row-wise sum using vcgadd (for 8-row tile)
+// Per-VLane sums using vcgadd
 %row_sums = pto.vcgadd %tile, %mask : !pto.vreg<64xf32>, !pto.mask<G> -> !pto.vreg<64xf32>
-// Results at indices 0, 8, 16, 24, 32, 40, 48, 56
+// Results at indices 0..7; remaining elements are zero
 
 // Full vector sum for normalization
 %total = pto.vcadd %values, %mask : !pto.vreg<64xf32>, !pto.mask<G> -> !pto.vreg<64xf32>
diff --git a/docs/vpto-spec.md b/docs/vpto-spec.md
index 9ebc93e11..a814732f6 100644
--- a/docs/vpto-spec.md
+++ b/docs/vpto-spec.md
@@ -1337,15 +1337,17 @@ for (int i = 0; i < N; i++)
 **Example — pto.vcgadd (group reduction per VLane) semantics:**
 
 ```c
-int K = N / 8;  // elements per VLane
+int groups = 8;
+int K = 32 / sizeof(T);  // elements per 32-byte VLane
 for (int g = 0; g < 8; g++) {
     T sum = 0;
     for (int i = 0; i < K; i++)
-        sum += src[g*K + i];
-    dst[g*K] = sum;
-    for (int i = 1; i < K; i++)
-        dst[g*K + i] = 0;
+        if (mask[g*K + i])
+            sum += src[g*K + i];
+    dst[g] = sum;
 }
+for (int i = groups; i < N; i++)
+    dst[i] = 0;
 ```
 
 For A5 reduction result types:
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/compare.py b/test/vpto/cases/micro-op/reduction/vcg-group/compare.py
new file mode 100644
index 000000000..bc41592a5
--- /dev/null
+++ b/test/vpto/cases/micro-op/reduction/vcg-group/compare.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import os
+import sys
+
+import numpy as np
+
+
+def compare_bin(golden_path, output_path, eps):
+    if not os.path.exists(golden_path):
+        print(f"[ERROR] Golden missing: {golden_path}")
+        return False
+    if not os.path.exists(output_path):
+        print(f"[ERROR] Output missing: {output_path}")
+        return False
+
+    golden = np.fromfile(golden_path, dtype=np.float32)
+    output = np.fromfile(output_path, dtype=np.float32)
+    if golden.shape != output.shape:
+        print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}")
+        return False
+    if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True):
+        diff = np.abs(golden.astype(np.float64) - output.astype(np.float64))
+        idx = int(np.argmax(diff))
+        print(
+            f"[ERROR] Mismatch: {golden_path} vs {output_path}, "
+            f"idx={idx}, golden={golden[idx]}, output={output[idx]}, max_diff={diff[idx]}"
+        )
+        return False
+    return True
+
+
+def main():
+    strict = os.getenv("COMPARE_STRICT", "1") != "0"
+    checks = [
+        ("golden_add.bin", "out_add.bin", "vcgadd"),
+        ("golden_max.bin", "out_max.bin", "vcgmax"),
+        ("golden_min.bin", "out_min.bin", "vcgmin"),
+    ]
+    failed = []
+    for golden, output, label in checks:
+        if not compare_bin(golden, output, 1e-4):
+            failed.append(label)
+            print(f"[ERROR] compare failed: {label}")
+    if failed:
+        if strict:
+            print(f"[ERROR] {len(failed)} check(s) failed: {', '.join(failed)}")
+            sys.exit(2)
+        print(f"[WARN] {len(failed)} check(s) failed (non-gating): {', '.join(failed)}")
+        return
+    print("[INFO] compare passed")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/golden.py b/test/vpto/cases/micro-op/reduction/vcg-group/golden.py
new file mode 100644
index 000000000..bc39ef32b
--- /dev/null
+++ b/test/vpto/cases/micro-op/reduction/vcg-group/golden.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+
+
+GROUPS = 8
+ELEMS_PER_GROUP = 8
+LANES = GROUPS * ELEMS_PER_GROUP
+
+
+def generate(output_dir: Path) -> None:
+    src = np.array(
+        [
+            -7.0, 1.0, 3.5, -2.0, 9.0, -4.5, 6.0, 0.5,
+            8.0, -1.0, -3.0, 4.0, 2.0, -6.0, 5.5, 7.0,
+            -0.0, 0.0, -5.0, 5.0, 11.0, -12.0, 13.0, -14.0,
+            1.25, 2.25, 3.25, 4.25, -8.0, -9.0, 10.0, -10.0,
+            15.0, 14.0, 13.0, 12.0, -1.5, -2.5, -3.5, -4.5,
+            -20.0, -19.0, -18.0, -17.0, 16.0, 15.5, 14.5, 13.5,
+            0.25, -0.75, 1.5, -2.25, 3.0, -3.75, 4.5, -5.25,
+            31.0, -32.0, 33.0, -34.0, 35.0, -36.0, 37.0, -38.0,
+        ],
+        dtype=np.float32,
+    )
+    groups = src.reshape(GROUPS, ELEMS_PER_GROUP)
+
+    golden_add = np.zeros(LANES, dtype=np.float32)
+    golden_max = np.zeros(LANES, dtype=np.float32)
+    golden_min = np.zeros(LANES, dtype=np.float32)
+    golden_add[:GROUPS] = np.sum(groups, axis=1, dtype=np.float32)
+    golden_max[:GROUPS] = np.max(groups, axis=1)
+    golden_min[:GROUPS] = np.min(groups, axis=1)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    src.tofile(output_dir / "src.bin")
+    golden_add.tofile(output_dir / "golden_add.bin")
+    golden_max.tofile(output_dir / "golden_max.bin")
+    golden_min.tofile(output_dir / "golden_min.bin")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output-dir", type=Path, default=Path("."))
+    args = parser.parse_args()
+    generate(args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto b/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto
new file mode 100644
index 000000000..94c8072ee
--- /dev/null
+++ b/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @vcg_group_kernel(%src: !pto.ptr<f32, gm>,
+                              %dst_add: !pto.ptr<f32, gm>,
+                              %dst_max: !pto.ptr<f32, gm>,
+                              %dst_min: !pto.ptr<f32, gm>) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c0_i64 = arith.constant 0 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c256_i64 = arith.constant 256 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c8192_i64 = arith.constant 8192 : i64
+    %c12288_i64 = arith.constant 12288 : i64
+
+    %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr<f32, ub>
+    %ub_add = pto.castptr %c4096_i64 : i64 -> !pto.ptr<f32, ub>
+    %ub_max = pto.castptr %c8192_i64 : i64 -> !pto.ptr<f32, ub>
+    %ub_min = pto.castptr %c12288_i64 : i64 -> !pto.ptr<f32, ub>
+
+    pto.mte_gm_ub %src, %ub_src, %c0_i64, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, gm>, !pto.ptr<f32, ub>, i64, i64, i64, i64, i64
+
+    pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"]
+    pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"]
+
+    pto.vecscope {
+      %mask = pto.pset_b32 "PAT_ALL" : !pto.mask<b32>
+      %vec = pto.vlds %ub_src[%c0] : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>
+      %add = pto.vcgadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
+      %max = pto.vcgmax %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
+      %min = pto.vcgmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask<b32> -> !pto.vreg<64xf32>
+      pto.vsts %add, %ub_add[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
+      pto.vsts %max, %ub_max[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
+      pto.vsts %min, %ub_min[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr<f32, ub>, !pto.mask<b32>
+    }
+
+    pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"]
+    pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"]
+
+    pto.mte_ub_gm %ub_add, %dst_add, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
+    pto.mte_ub_gm %ub_max, %dst_max, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
+    pto.mte_ub_gm %ub_min, %dst_min, %c256_i64
+      nburst(%c1_i64, %c256_i64, %c256_i64)
+      : !pto.ptr<f32, ub>, !pto.ptr<f32, gm>, i64, i64, i64, i64
+    pto.barrier #pto.pipe<PIPE_ALL>
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp b/test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp
new file mode 100644
index 000000000..b41e821cb
--- /dev/null
+++ b/test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+
+#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
+typedef struct { unsigned char v; } hifloat8_t;
+typedef struct { unsigned char v; } float8_e4m3_t;
+typedef struct { unsigned char v; } float8_e5m2_t;
+typedef struct { unsigned char v; } float8_e8m0_t;
+typedef struct { unsigned char v; } float4_e1m2x2_t;
+typedef struct { unsigned char v; } float4_e2m1x2_t;
+#endif
+
+#include <stdint.h>
+
+#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP)
+struct MrgSortExecutedNumList {
+  uint16_t mrgSortList0;
+  uint16_t mrgSortList1;
+  uint16_t mrgSortList2;
+  uint16_t mrgSortList3;
+};
+#endif
+
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+
+extern "C" __global__ [aicore] void
+vcg_group_kernel(__gm__ float *src, __gm__ float *dst_add,
+                 __gm__ float *dst_max, __gm__ float *dst_min);
+
+void LaunchVcgGroup(float *src, float *dst_add, float *dst_max,
+                    float *dst_min, void *stream) {
+  vcg_group_kernel<<<1, nullptr, stream>>>((__gm__ float *)src,
+                                           (__gm__ float *)dst_add,
+                                           (__gm__ float *)dst_max,
+                                           (__gm__ float *)dst_min);
+}
diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/main.cpp b/test/vpto/cases/micro-op/reduction/vcg-group/main.cpp
new file mode 100644
index 000000000..818240fae
--- /dev/null
+++ b/test/vpto/cases/micro-op/reduction/vcg-group/main.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include "acl/acl.h"
+#include "test_common.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr)                                                        \
+  do {                                                                         \
+    const aclError _ret = (expr);                                              \
+    if (_ret != ACL_SUCCESS) {                                                 \
+      std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr,           \
+                   (int)_ret, __FILE__, __LINE__);                             \
+      rc = 1;                                                                  \
+      goto cleanup;                                                            \
+    }                                                                          \
+  } while (0)
+
+void LaunchVcgGroup(float *src, float *dst_add, float *dst_max,
+                    float *dst_min, void *stream);
+
+int main() {
+  constexpr size_t kElems = 64;
+  constexpr size_t kBytes = kElems * sizeof(float);
+
+  float *srcHost = nullptr;
+  float *addHost = nullptr;
+  float *maxHost = nullptr;
+  float *minHost = nullptr;
+  float *srcDevice = nullptr;
+  float *addDevice = nullptr;
+  float *maxDevice = nullptr;
+  float *minDevice = nullptr;
+
+  int rc = 0;
+  bool aclInited = false;
+  bool deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+  size_t fileSize = kBytes;
+
+  ACL_CHECK(aclInit(nullptr));
+  aclInited = true;
+  if (const char *envDevice = std::getenv("ACL_DEVICE_ID"))
+    deviceId = std::atoi(envDevice);
+  ACL_CHECK(aclrtSetDevice(deviceId));
+  deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+
+  ACL_CHECK(aclrtMallocHost((void **)&srcHost, kBytes));
+  ACL_CHECK(aclrtMallocHost((void **)&addHost, kBytes));
+  ACL_CHECK(aclrtMallocHost((void **)&maxHost, kBytes));
+  ACL_CHECK(aclrtMallocHost((void **)&minHost, kBytes));
+  ACL_CHECK(aclrtMalloc((void **)&srcDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST));
+  ACL_CHECK(aclrtMalloc((void **)&addDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST));
+  ACL_CHECK(aclrtMalloc((void **)&maxDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST));
+  ACL_CHECK(aclrtMalloc((void **)&minDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST));
+
+  if (!ReadFile("./src.bin", fileSize, srcHost, kBytes) || fileSize != kBytes) {
+    std::fprintf(stderr, "[ERROR] failed to read src.bin\n");
+    rc = 1;
+    goto cleanup;
+  }
+  std::fill_n(addHost, kElems, 0.0f);
+  std::fill_n(maxHost, kElems, 0.0f);
+  std::fill_n(minHost, kElems, 0.0f);
+
+  ACL_CHECK(aclrtMemcpy(srcDevice, kBytes, srcHost, kBytes,
+                        ACL_MEMCPY_HOST_TO_DEVICE));
+  ACL_CHECK(aclrtMemcpy(addDevice, kBytes, addHost, kBytes,
+                        ACL_MEMCPY_HOST_TO_DEVICE));
+  ACL_CHECK(aclrtMemcpy(maxDevice, kBytes, maxHost, kBytes,
+                        ACL_MEMCPY_HOST_TO_DEVICE));
+  ACL_CHECK(aclrtMemcpy(minDevice, kBytes, minHost, kBytes,
+                        ACL_MEMCPY_HOST_TO_DEVICE));
+
+  LaunchVcgGroup(srcDevice, addDevice, maxDevice, minDevice, stream);
+
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(addHost, kBytes, addDevice, kBytes,
+                        ACL_MEMCPY_DEVICE_TO_HOST));
+  ACL_CHECK(aclrtMemcpy(maxHost, kBytes, maxDevice, kBytes,
+                        ACL_MEMCPY_DEVICE_TO_HOST));
+  ACL_CHECK(aclrtMemcpy(minHost, kBytes, minDevice, kBytes,
+                        ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out_add.bin", addHost, kBytes);
+  WriteFile("./out_max.bin", maxHost, kBytes);
+  WriteFile("./out_min.bin", minHost, kBytes);
+
+cleanup:
+  aclrtFree(srcDevice);
+  aclrtFree(addDevice);
+  aclrtFree(maxDevice);
+  aclrtFree(minDevice);
+  aclrtFreeHost(srcHost);
+  aclrtFreeHost(addHost);
+  aclrtFreeHost(maxHost);
+  aclrtFreeHost(minHost);
+  if (stream != nullptr)
+    aclrtDestroyStream(stream);
+  if (deviceSet)
+    aclrtResetDevice(deviceId);
+  if (aclInited)
+    aclFinalize();
+  return rc;
+}