From 465dc65d8f36e3c5ac04d0e88b1a33509dce67d5 Mon Sep 17 00:00:00 2001 From: mouliangyu Date: Wed, 1 Jul 2026 15:34:52 +0800 Subject: [PATCH] Fix VCG group reduction semantics --- docs/isa/micro-isa/10-reduction-ops.md | 85 ++++++++----- docs/vpto-spec.md | 12 +- .../micro-op/reduction/vcg-group/compare.py | 62 ++++++++++ .../micro-op/reduction/vcg-group/golden.py | 59 +++++++++ .../micro-op/reduction/vcg-group/kernel.pto | 60 +++++++++ .../micro-op/reduction/vcg-group/launch.cpp | 47 +++++++ .../micro-op/reduction/vcg-group/main.cpp | 117 ++++++++++++++++++ 7 files changed, 407 insertions(+), 35 deletions(-) create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/compare.py create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/golden.py create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp create mode 100644 test/vpto/cases/micro-op/reduction/vcg-group/main.cpp diff --git a/docs/isa/micro-isa/10-reduction-ops.md b/docs/isa/micro-isa/10-reduction-ops.md index ecae818f2..ebe95a7bf 100644 --- a/docs/isa/micro-isa/10-reduction-ops.md +++ b/docs/isa/micro-isa/10-reduction-ops.md @@ -194,7 +194,10 @@ for (int i = 0; i < N; i++) { ## Per-VLane (Group) Reductions -The vector register is organized as **8 VLanes** of 32 bytes each. Group reductions operate within each VLane independently. +The vector register is organized as **8 VLanes** of 32 bytes each. Group +reductions operate within each VLane independently and produce one result per +VLane. The 8 VLane results are written contiguously to the low elements of the +destination vector; all remaining destination elements are zero. ``` vreg layout (f32 example, 64 elements total): @@ -206,27 +209,31 @@ VLane 4: [32..39] VLane 5: [40..47] VLane 6: [48..55] VLane 7: [56..63] - **syntax:** `%result = pto.vcgadd %input, %mask : !pto.vreg, !pto.mask -> !pto.vreg` - **A5 types:** i16-i32, f16, f32 -- **semantics:** Sum within each VLane. 8 results at indices 0, 8, 16, 24, 32, 40, 48, 56 (for f32). +- **semantics:** Sum active elements within each 32-byte VLane. The 8 VLane + sums are written to result elements `0..7`; all other result elements are + zero. ```c -int K = N / 8; // elements per VLane +int groups = 8; +int K = 32 / sizeof(T); // elements per 32-byte VLane for (int g = 0; g < 8; g++) { T sum = 0; for (int i = 0; i < K; i++) - sum += src[g*K + i]; - dst[g*K] = sum; - for (int i = 1; i < K; i++) - dst[g*K + i] = 0; + if (mask[g*K + i]) + sum += src[g*K + i]; + dst[g] = sum; } -// For f32: results at dst[0], dst[8], dst[16], dst[24], dst[32], dst[40], dst[48], dst[56] +for (int i = groups; i < N; i++) + dst[i] = 0; ``` - **inputs:** `%input` is the source vector and `%mask` selects participating lanes. - **outputs:** `%result` contains one sum per 32-byte VLane group, written - contiguously into the low slot of each group. + contiguously to the low elements of the result vector. - **constraints and limitations:** This is a per-32-byte VLane-group reduction. - Inactive lanes are treated as zero. + Inactive lanes are treated as zero. If all lanes in a VLane are inactive, the + corresponding result element is `0` (`+0` for floating-point types). --- @@ -234,25 +241,34 @@ for (int g = 0; g < 8; g++) { - **syntax:** `%result = pto.vcgmax %input, %mask : !pto.vreg, !pto.mask -> !pto.vreg` - **A5 types:** i16-i32, f16, f32 -- **semantics:** Max within each VLane. +- **semantics:** Find the maximum active element within each 32-byte VLane. The + 8 VLane maxima are written to result elements `0..7`; all other result + elements are zero. ```c -int K = N / 8; +int groups = 8; +int K = 32 / sizeof(T); for (int g = 0; g < 8; g++) { - T mx = -INF; + T mx = max_identity_for_T; // -INF for float, minimum value for integer for (int i = 0; i < K; i++) - if (src[g*K + i] > mx) mx = src[g*K + i]; - dst[g*K] = mx; - for (int i = 1; i < K; i++) - dst[g*K + i] = 0; + if (mask[g*K + i]) + mx = max(mx, src[g*K + i]); + dst[g] = mx; } +for (int i = groups; i < N; i++) + dst[i] = 0; ``` - **inputs:** `%input` is the source vector and `%mask` selects participating lanes. -- **outputs:** `%result` contains one maximum per 32-byte VLane group. +- **outputs:** `%result` contains one maximum per 32-byte VLane group, written + contiguously to the low elements of the result vector. - **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by - arbitrary software subvector. + arbitrary software subvector. Inactive floating-point lanes are treated as + `-INF`; inactive integer lanes are treated as the element type's minimum + value. If all lanes in a VLane are inactive, that neutral value is written for + the corresponding VLane result. For floating-point values, `max(+0, -0)` + returns `+0`. --- @@ -260,25 +276,34 @@ for (int g = 0; g < 8; g++) { - **syntax:** `%result = pto.vcgmin %input, %mask : !pto.vreg, !pto.mask -> !pto.vreg` - **A5 types:** i16-i32, f16, f32 -- **semantics:** Min within each VLane. +- **semantics:** Find the minimum active element within each 32-byte VLane. The + 8 VLane minima are written to result elements `0..7`; all other result + elements are zero. ```c -int K = N / 8; +int groups = 8; +int K = 32 / sizeof(T); for (int g = 0; g < 8; g++) { - T mn = INF; + T mn = min_identity_for_T; // +INF for float, maximum value for integer for (int i = 0; i < K; i++) - if (src[g*K + i] < mn) mn = src[g*K + i]; - dst[g*K] = mn; - for (int i = 1; i < K; i++) - dst[g*K + i] = 0; + if (mask[g*K + i]) + mn = min(mn, src[g*K + i]); + dst[g] = mn; } +for (int i = groups; i < N; i++) + dst[i] = 0; ``` - **inputs:** `%input` is the source vector and `%mask` selects participating lanes. -- **outputs:** `%result` contains one minimum per 32-byte VLane group. +- **outputs:** `%result` contains one minimum per 32-byte VLane group, written + contiguously to the low elements of the result vector. - **constraints and limitations:** Grouping is by hardware 32-byte VLane, not by - arbitrary software subvector. + arbitrary software subvector. Inactive floating-point lanes are treated as + `+INF`; inactive integer lanes are treated as the element type's maximum + value. If all lanes in a VLane are inactive, that neutral value is written for + the corresponding VLane result. For floating-point values, `min(-0, +0)` + returns `-0`. --- @@ -318,9 +343,9 @@ for (int i = 1; i < N; i++) // max is in lane 0, broadcast it %max_broadcast = pto.vlds %ub_tmp[%c0] {dist = "BRC_B32"} : !pto.ptr -> !pto.vreg<64xf32> -// Row-wise sum using vcgadd (for 8-row tile) +// Per-VLane sums using vcgadd %row_sums = pto.vcgadd %tile, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> -// Results at indices 0, 8, 16, 24, 32, 40, 48, 56 +// Results at indices 0..7; remaining elements are zero // Full vector sum for normalization %total = pto.vcadd %values, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> diff --git a/docs/vpto-spec.md b/docs/vpto-spec.md index 9ebc93e11..a814732f6 100644 --- a/docs/vpto-spec.md +++ b/docs/vpto-spec.md @@ -1337,15 +1337,17 @@ for (int i = 0; i < N; i++) **Example — pto.vcgadd (group reduction per VLane) semantics:** ```c -int K = N / 8; // elements per VLane +int groups = 8; +int K = 32 / sizeof(T); // elements per 32-byte VLane for (int g = 0; g < 8; g++) { T sum = 0; for (int i = 0; i < K; i++) - sum += src[g*K + i]; - dst[g*K] = sum; - for (int i = 1; i < K; i++) - dst[g*K + i] = 0; + if (mask[g*K + i]) + sum += src[g*K + i]; + dst[g] = sum; } +for (int i = groups; i < N; i++) + dst[i] = 0; ``` For A5 reduction result types: diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/compare.py b/test/vpto/cases/micro-op/reduction/vcg-group/compare.py new file mode 100644 index 000000000..bc41592a5 --- /dev/null +++ b/test/vpto/cases/micro-op/reduction/vcg-group/compare.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import os +import sys + +import numpy as np + + +def compare_bin(golden_path, output_path, eps): + if not os.path.exists(golden_path): + print(f"[ERROR] Golden missing: {golden_path}") + return False + if not os.path.exists(output_path): + print(f"[ERROR] Output missing: {output_path}") + return False + + golden = np.fromfile(golden_path, dtype=np.float32) + output = np.fromfile(output_path, dtype=np.float32) + if golden.shape != output.shape: + print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") + return False + if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): + diff = np.abs(golden.astype(np.float64) - output.astype(np.float64)) + idx = int(np.argmax(diff)) + print( + f"[ERROR] Mismatch: {golden_path} vs {output_path}, " + f"idx={idx}, golden={golden[idx]}, output={output[idx]}, max_diff={diff[idx]}" + ) + return False + return True + + +def main(): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + checks = [ + ("golden_add.bin", "out_add.bin", "vcgadd"), + ("golden_max.bin", "out_max.bin", "vcgmax"), + ("golden_min.bin", "out_min.bin", "vcgmin"), + ] + failed = [] + for golden, output, label in checks: + if not compare_bin(golden, output, 1e-4): + failed.append(label) + print(f"[ERROR] compare failed: {label}") + if failed: + if strict: + print(f"[ERROR] {len(failed)} check(s) failed: {', '.join(failed)}") + sys.exit(2) + print(f"[WARN] {len(failed)} check(s) failed (non-gating): {', '.join(failed)}") + return + print("[INFO] compare passed") + + +if __name__ == "__main__": + main() diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/golden.py b/test/vpto/cases/micro-op/reduction/vcg-group/golden.py new file mode 100644 index 000000000..bc39ef32b --- /dev/null +++ b/test/vpto/cases/micro-op/reduction/vcg-group/golden.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import argparse +from pathlib import Path + +import numpy as np + + +GROUPS = 8 +ELEMS_PER_GROUP = 8 +LANES = GROUPS * ELEMS_PER_GROUP + + +def generate(output_dir: Path) -> None: + src = np.array( + [ + -7.0, 1.0, 3.5, -2.0, 9.0, -4.5, 6.0, 0.5, + 8.0, -1.0, -3.0, 4.0, 2.0, -6.0, 5.5, 7.0, + -0.0, 0.0, -5.0, 5.0, 11.0, -12.0, 13.0, -14.0, + 1.25, 2.25, 3.25, 4.25, -8.0, -9.0, 10.0, -10.0, + 15.0, 14.0, 13.0, 12.0, -1.5, -2.5, -3.5, -4.5, + -20.0, -19.0, -18.0, -17.0, 16.0, 15.5, 14.5, 13.5, + 0.25, -0.75, 1.5, -2.25, 3.0, -3.75, 4.5, -5.25, + 31.0, -32.0, 33.0, -34.0, 35.0, -36.0, 37.0, -38.0, + ], + dtype=np.float32, + ) + groups = src.reshape(GROUPS, ELEMS_PER_GROUP) + + golden_add = np.zeros(LANES, dtype=np.float32) + golden_max = np.zeros(LANES, dtype=np.float32) + golden_min = np.zeros(LANES, dtype=np.float32) + golden_add[:GROUPS] = np.sum(groups, axis=1, dtype=np.float32) + golden_max[:GROUPS] = np.max(groups, axis=1) + golden_min[:GROUPS] = np.min(groups, axis=1) + + output_dir.mkdir(parents=True, exist_ok=True) + src.tofile(output_dir / "src.bin") + golden_add.tofile(output_dir / "golden_add.bin") + golden_max.tofile(output_dir / "golden_max.bin") + golden_min.tofile(output_dir / "golden_min.bin") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--output-dir", type=Path, default=Path(".")) + args = parser.parse_args() + generate(args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto b/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto new file mode 100644 index 000000000..94c8072ee --- /dev/null +++ b/test/vpto/cases/micro-op/reduction/vcg-group/kernel.pto @@ -0,0 +1,60 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @vcg_group_kernel(%src: !pto.ptr, + %dst_add: !pto.ptr, + %dst_max: !pto.ptr, + %dst_min: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c256_i64 = arith.constant 256 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c8192_i64 = arith.constant 8192 : i64 + %c12288_i64 = arith.constant 12288 : i64 + + %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr + %ub_add = pto.castptr %c4096_i64 : i64 -> !pto.ptr + %ub_max = pto.castptr %c8192_i64 : i64 -> !pto.ptr + %ub_min = pto.castptr %c12288_i64 : i64 -> !pto.ptr + + pto.mte_gm_ub %src, %ub_src, %c0_i64, %c256_i64 + nburst(%c1_i64, %c256_i64, %c256_i64) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask = pto.pset_b32 "PAT_ALL" : !pto.mask + %vec = pto.vlds %ub_src[%c0] : !pto.ptr -> !pto.vreg<64xf32> + %add = pto.vcgadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %max = pto.vcgmax %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %min = pto.vcgmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %add, %ub_add[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %max, %ub_max[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %min, %ub_min[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + + pto.mte_ub_gm %ub_add, %dst_add, %c256_i64 + nburst(%c1_i64, %c256_i64, %c256_i64) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_max, %dst_max, %c256_i64 + nburst(%c1_i64, %c256_i64, %c256_i64) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_min, %dst_min, %c256_i64 + nburst(%c1_i64, %c256_i64, %c256_i64) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return + } +} diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp b/test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp new file mode 100644 index 000000000..b41e821cb --- /dev/null +++ b/test/vpto/cases/micro-op/reduction/vcg-group/launch.cpp @@ -0,0 +1,47 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif + +#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) +typedef struct { unsigned char v; } hifloat8_t; +typedef struct { unsigned char v; } float8_e4m3_t; +typedef struct { unsigned char v; } float8_e5m2_t; +typedef struct { unsigned char v; } float8_e8m0_t; +typedef struct { unsigned char v; } float4_e1m2x2_t; +typedef struct { unsigned char v; } float4_e2m1x2_t; +#endif + +#include + +#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +#endif + +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif + +extern "C" __global__ [aicore] void +vcg_group_kernel(__gm__ float *src, __gm__ float *dst_add, + __gm__ float *dst_max, __gm__ float *dst_min); + +void LaunchVcgGroup(float *src, float *dst_add, float *dst_max, + float *dst_min, void *stream) { + vcg_group_kernel<<<1, nullptr, stream>>>((__gm__ float *)src, + (__gm__ float *)dst_add, + (__gm__ float *)dst_max, + (__gm__ float *)dst_min); +} diff --git a/test/vpto/cases/micro-op/reduction/vcg-group/main.cpp b/test/vpto/cases/micro-op/reduction/vcg-group/main.cpp new file mode 100644 index 000000000..818240fae --- /dev/null +++ b/test/vpto/cases/micro-op/reduction/vcg-group/main.cpp @@ -0,0 +1,117 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "acl/acl.h" +#include "test_common.h" + +#include +#include +#include +#include + +using namespace PtoTestCommon; + +#define ACL_CHECK(expr) \ + do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ + (int)_ret, __FILE__, __LINE__); \ + rc = 1; \ + goto cleanup; \ + } \ + } while (0) + +void LaunchVcgGroup(float *src, float *dst_add, float *dst_max, + float *dst_min, void *stream); + +int main() { + constexpr size_t kElems = 64; + constexpr size_t kBytes = kElems * sizeof(float); + + float *srcHost = nullptr; + float *addHost = nullptr; + float *maxHost = nullptr; + float *minHost = nullptr; + float *srcDevice = nullptr; + float *addDevice = nullptr; + float *maxDevice = nullptr; + float *minDevice = nullptr; + + int rc = 0; + bool aclInited = false; + bool deviceSet = false; + int deviceId = 0; + aclrtStream stream = nullptr; + size_t fileSize = kBytes; + + ACL_CHECK(aclInit(nullptr)); + aclInited = true; + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) + deviceId = std::atoi(envDevice); + ACL_CHECK(aclrtSetDevice(deviceId)); + deviceSet = true; + ACL_CHECK(aclrtCreateStream(&stream)); + + ACL_CHECK(aclrtMallocHost((void **)&srcHost, kBytes)); + ACL_CHECK(aclrtMallocHost((void **)&addHost, kBytes)); + ACL_CHECK(aclrtMallocHost((void **)&maxHost, kBytes)); + ACL_CHECK(aclrtMallocHost((void **)&minHost, kBytes)); + ACL_CHECK(aclrtMalloc((void **)&srcDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&addDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&maxDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&minDevice, kBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + + if (!ReadFile("./src.bin", fileSize, srcHost, kBytes) || fileSize != kBytes) { + std::fprintf(stderr, "[ERROR] failed to read src.bin\n"); + rc = 1; + goto cleanup; + } + std::fill_n(addHost, kElems, 0.0f); + std::fill_n(maxHost, kElems, 0.0f); + std::fill_n(minHost, kElems, 0.0f); + + ACL_CHECK(aclrtMemcpy(srcDevice, kBytes, srcHost, kBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(addDevice, kBytes, addHost, kBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(maxDevice, kBytes, maxHost, kBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(minDevice, kBytes, minHost, kBytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + + LaunchVcgGroup(srcDevice, addDevice, maxDevice, minDevice, stream); + + ACL_CHECK(aclrtSynchronizeStream(stream)); + ACL_CHECK(aclrtMemcpy(addHost, kBytes, addDevice, kBytes, + ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(maxHost, kBytes, maxDevice, kBytes, + ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(minHost, kBytes, minDevice, kBytes, + ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./out_add.bin", addHost, kBytes); + WriteFile("./out_max.bin", maxHost, kBytes); + WriteFile("./out_min.bin", minHost, kBytes); + +cleanup: + aclrtFree(srcDevice); + aclrtFree(addDevice); + aclrtFree(maxDevice); + aclrtFree(minDevice); + aclrtFreeHost(srcHost); + aclrtFreeHost(addHost); + aclrtFreeHost(maxHost); + aclrtFreeHost(minHost); + if (stream != nullptr) + aclrtDestroyStream(stream); + if (deviceSet) + aclrtResetDevice(deviceId); + if (aclInited) + aclFinalize(); + return rc; +}