diff --git a/lib/TileOps/trowexpanddiv_template.py b/lib/TileOps/trowexpanddiv_template.py index 888353014..5ffc87835 100644 --- a/lib/TileOps/trowexpanddiv_template.py +++ b/lib/TileOps/trowexpanddiv_template.py @@ -60,35 +60,64 @@ def _constraint_trowexpanddiv_row_major(src0: pto.Tile, src1: pto.Tile, dst: pto constraints=[_constraint_trowexpanddiv_row_major], ) def template_trowexpanddiv_f32(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile): - """Template for pto.trowexpanddiv with f32 dtype and optional high-precision mode.""" + """Template for pto.trowexpanddiv with f32 dtype and optional high-precision mode. + + When src1 is col_major with shape [M, 1] (a per-row scalar column), + vlds on the col_major tile slice would access UB at non-512B-aligned + addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline) + for src1 in that case; keep the aligned vlds path for row_major src1. + """ dtype = dst.element_type valid_rows, valid_cols = dst.valid_shape - precision_type = pto.get_op_attr("precisionType", "default") if pto.constexpr(precision_type == "high_precision"): - for row in range(0, valid_rows, 1): - remained = valid_cols - for col in range(0, valid_cols, pto.get_lanes(dtype)): - mask, remained = pto.make_mask(dtype, remained) - # Load the scalar vector from src1[row, :] - # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32) - # vdup broadcasts the first element to the full vector width - scalar_vec = pto.vlds(src1[row, :]) - broadcasted = pto.vdup(scalar_vec, mask) - lhs = pto.vlds(src0[row, col:]) - result = _div_ieee754_f32_impl(lhs, broadcasted, mask) - pto.vsts(result, dst[row, col:], mask) + if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR): + # col_major / high_precision + for row in range(0, valid_rows, 1): + align_src1 = pto.vldas(src1[row, :]) + scalar_vec, _ = pto.vldus(src1[row, :], align_src1) + broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL)) + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + lhs = pto.vlds(src0[row, col:]) + result = _div_ieee754_f32_impl(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) + else: + # row_major / high_precision + for row in range(0, valid_rows, 1): + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + scalar_vec = pto.vlds(src1[row, :]) + broadcasted = pto.vdup(scalar_vec, mask) + lhs = pto.vlds(src0[row, col:]) + result = _div_ieee754_f32_impl(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) else: - for row in range(0, valid_rows, 1): - remained = valid_cols - for col in range(0, valid_cols, pto.get_lanes(dtype)): - mask, remained = pto.make_mask(dtype, remained) - # Load the scalar vector from src1[row, :] - scalar_vec = pto.vlds(src1[row, :]) - broadcasted = pto.vdup(scalar_vec, mask) - lhs = pto.vlds(src0[row, col:]) - result = pto.vdiv(lhs, broadcasted, mask) - pto.vsts(result, dst[row, col:], mask) + if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR): + # col_major / default precision + for row in range(0, valid_rows, 1): + align_src1 = pto.vldas(src1[row, :]) + scalar_vec, _ = pto.vldus(src1[row, :], align_src1) + broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL)) + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + lhs = pto.vlds(src0[row, col:]) + result = pto.vdiv(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) + else: + # row_major / default precision (existing behaviour) + for row in range(0, valid_rows, 1): + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + scalar_vec = pto.vlds(src1[row, :]) + broadcasted = pto.vdup(scalar_vec, mask) + lhs = pto.vlds(src0[row, col:]) + result = pto.vdiv(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) return @@ -99,33 +128,62 @@ def template_trowexpanddiv_f32(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile): constraints=[_constraint_trowexpanddiv_row_major], ) def template_trowexpanddiv_f16(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile): - """Template for pto.trowexpanddiv with f16 dtype and optional high-precision mode.""" + """Template for pto.trowexpanddiv with f16 dtype and optional high-precision mode. + + When src1 is col_major with shape [M, 1] (a per-row scalar column), + vlds on the col_major tile slice would access UB at non-512B-aligned + addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline) + for src1 in that case; keep the aligned vlds path for row_major src1. + """ dtype = dst.element_type valid_rows, valid_cols = dst.valid_shape - precision_type = pto.get_op_attr("precisionType", "default") if pto.constexpr(precision_type == "high_precision"): - for row in range(0, valid_rows, 1): - remained = valid_cols - for col in range(0, valid_cols, pto.get_lanes(dtype)): - mask, remained = pto.make_mask(dtype, remained) - # Load the scalar vector from src1[row, :] - # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 16 for f16) - # vdup broadcasts the first element to the full vector width - scalar_vec = pto.vlds(src1[row, :]) - broadcasted = pto.vdup(scalar_vec, mask) - lhs = pto.vlds(src0[row, col:]) - result = _div_ieee754_f16_impl(lhs, broadcasted, mask) - pto.vsts(result, dst[row, col:], mask) + if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR): + # col_major / high_precision + for row in range(0, valid_rows, 1): + align_src1 = pto.vldas(src1[row, :]) + scalar_vec, _ = pto.vldus(src1[row, :], align_src1) + broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL)) + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + lhs = pto.vlds(src0[row, col:]) + result = _div_ieee754_f16_impl(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) + else: + # row_major / high_precision + for row in range(0, valid_rows, 1): + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + scalar_vec = pto.vlds(src1[row, :]) + broadcasted = pto.vdup(scalar_vec, mask) + lhs = pto.vlds(src0[row, col:]) + result = _div_ieee754_f16_impl(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) else: - for row in range(0, valid_rows, 1): - remained = valid_cols - for col in range(0, valid_cols, pto.get_lanes(dtype)): - mask, remained = pto.make_mask(dtype, remained) - # Load the scalar vector from src1[row, :] - scalar_vec = pto.vlds(src1[row, :]) - broadcasted = pto.vdup(scalar_vec, mask) - lhs = pto.vlds(src0[row, col:]) - result = pto.vdiv(lhs, broadcasted, mask) - pto.vsts(result, dst[row, col:], mask) + if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR): + # col_major / default precision + for row in range(0, valid_rows, 1): + align_src1 = pto.vldas(src1[row, :]) + scalar_vec, _ = pto.vldus(src1[row, :], align_src1) + broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL)) + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + lhs = pto.vlds(src0[row, col:]) + result = pto.vdiv(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) + else: + # row_major / default precision (existing behaviour) + for row in range(0, valid_rows, 1): + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + scalar_vec = pto.vlds(src1[row, :]) + broadcasted = pto.vdup(scalar_vec, mask) + lhs = pto.vlds(src0[row, col:]) + result = pto.vdiv(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) return diff --git a/lib/TileOps/trowexpandmul_template.py b/lib/TileOps/trowexpandmul_template.py index 504e22392..cc7e41945 100644 --- a/lib/TileOps/trowexpandmul_template.py +++ b/lib/TileOps/trowexpandmul_template.py @@ -57,20 +57,41 @@ def template_trowexpandmul(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile): Multiply each row of src0 by a per-row scalar from src1[row, 0]. Semantics: dst[row, col] = src0[row, col] * src1[row, 0] + + When src1 is col_major with shape [M, 1] (a per-row scalar column), + vlds on the col_major tile slice would access UB at non-512B-aligned + addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline) + for src1 in that case; keep the aligned vlds path for row_major src1. """ dtype = dst.element_type valid_rows, valid_cols = dst.valid_shape - for row in range(0, valid_rows, 1): - remained = valid_cols - for col in range(0, valid_cols, pto.get_lanes(dtype)): - mask, remained = pto.make_mask(dtype, remained) - # Load the scalar vector from src1[row, :] - # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32) - # vdup broadcasts the first element to the full vector width - scalar_vec = pto.vlds(src1[row, :]) - broadcasted = pto.vdup(scalar_vec, mask) - lhs = pto.vlds(src0[row, col:]) - result = pto.vmul(lhs, broadcasted, mask) - pto.vsts(result, dst[row, col:], mask) + if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR): + # ---- col_major [M, 1] path: unaligned load for src1 ---- + for row in range(0, valid_rows, 1): + # vldas+vldus once per row, broadcast across all col iterations + align_src1 = pto.vldas(src1[row, :]) + scalar_vec, _ = pto.vldus(src1[row, :], align_src1) + broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL)) + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + lhs = pto.vlds(src0[row, col:]) + result = pto.vmul(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) + else: + # ---- row_major path: aligned vlds (existing behaviour) ---- + for row in range(0, valid_rows, 1): + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + # Load the scalar vector from src1[row, :] + # For row-major src1, valid_shape[1] is 32/sizeof(dtype) + # (e.g., 8 for f32). vdup broadcasts the first element + # to the full vector width. + scalar_vec = pto.vlds(src1[row, :]) + broadcasted = pto.vdup(scalar_vec, mask) + lhs = pto.vlds(src0[row, col:]) + result = pto.vmul(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) return diff --git a/lib/TileOps/trowexpandsub_template.py b/lib/TileOps/trowexpandsub_template.py index 22cc94b6e..e8394d6ed 100644 --- a/lib/TileOps/trowexpandsub_template.py +++ b/lib/TileOps/trowexpandsub_template.py @@ -57,20 +57,41 @@ def template_trowexpandsub(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile): Subtract a per-row scalar from src1[row, 0] from each row of src0. Semantics: dst[row, col] = src0[row, col] - src1[row, 0] + + When src1 is col_major with shape [M, 1] (a per-row scalar column), + vlds on the col_major tile slice would access UB at non-512B-aligned + addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline) + for src1 in that case; keep the aligned vlds path for row_major src1. """ dtype = dst.element_type valid_rows, valid_cols = dst.valid_shape - for row in range(0, valid_rows, 1): - remained = valid_cols - for col in range(0, valid_cols, pto.get_lanes(dtype)): - mask, remained = pto.make_mask(dtype, remained) - # Load the scalar vector from src1[row, :] - # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32) - # vdup broadcasts the first element to the full vector width - scalar_vec = pto.vlds(src1[row, :]) - broadcasted = pto.vdup(scalar_vec, mask) - lhs = pto.vlds(src0[row, col:]) - result = pto.vsub(lhs, broadcasted, mask) - pto.vsts(result, dst[row, col:], mask) + if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR): + # ---- col_major [M, 1] path: unaligned load for src1 ---- + for row in range(0, valid_rows, 1): + # vldas+vldus once per row, broadcast across all col iterations + align_src1 = pto.vldas(src1[row, :]) + scalar_vec, _ = pto.vldus(src1[row, :], align_src1) + broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL)) + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + lhs = pto.vlds(src0[row, col:]) + result = pto.vsub(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) + else: + # ---- row_major path: aligned vlds (existing behaviour) ---- + for row in range(0, valid_rows, 1): + remained = valid_cols + for col in range(0, valid_cols, pto.get_lanes(dtype)): + mask, remained = pto.make_mask(dtype, remained) + # Load the scalar vector from src1[row, :] + # For row-major src1, valid_shape[1] is 32/sizeof(dtype) + # (e.g., 8 for f32). vdup broadcasts the first element + # to the full vector width. + scalar_vec = pto.vlds(src1[row, :]) + broadcasted = pto.vdup(scalar_vec, mask) + lhs = pto.vlds(src0[row, col:]) + result = pto.vsub(lhs, broadcasted, mask) + pto.vsts(result, dst[row, col:], mask) return diff --git a/test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto b/test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto new file mode 100644 index 000000000..96401d4f5 --- /dev/null +++ b/test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto @@ -0,0 +1,61 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline +// expands pto.trowexpanddiv when src1 is a col_major [M,1] scalar column +// with default (hardware) precision. +// +// The col_major path must use the unaligned load pipeline (vldas+vldus) +// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5. +// +// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics +// +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s + +// After expansion the original pto.trowexpanddiv must be gone. +// CHECK: func.func @TROWEXPANDDIV_COLMAJOR +// CHECK-NOT: pto.trowexpanddiv ins +// CHECK: pto.vecscope +// The col_major src1 path uses vldas+vldus (unaligned load pipeline). +// CHECK: pto.vldas +// CHECK: pto.vldus +// Broadcast is still used to replicate the scalar across the vector. +// CHECK: pto.vdup +// src0 (row_major) still uses aligned vlds. +// CHECK: pto.vlds +// Core arithmetic: default precision uses hardware vdiv. +// CHECK: pto.vdiv +// Result store: +// CHECK: pto.vsts + +module attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @TROWEXPANDDIV_COLMAJOR() { + // src0: 32x32 matrix (row-major) + %src0 = pto.alloc_tile + : !pto.tile_buf + // src1: 32x1 col_major column — one scalar per row. + // When src1 is col_major, the template uses vldas+vldus (unaligned) + // instead of vlds to handle the non-512B-aligned slice access. + %src1 = pto.alloc_tile + : !pto.tile_buf + // dst: 32x32 result (row-major) + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + return + } +} diff --git a/test/lit/vpto/trowexpanddiv_tile_op_expand_f16_col_major.pto b/test/lit/vpto/trowexpanddiv_tile_op_expand_f16_col_major.pto new file mode 100644 index 000000000..58320cdd6 --- /dev/null +++ b/test/lit/vpto/trowexpanddiv_tile_op_expand_f16_col_major.pto @@ -0,0 +1,60 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline +// expands pto.trowexpanddiv when src1 is a col_major [M,1] scalar column +// with f16 dtype and default (hardware) precision. +// +// trowexpanddiv has separate f32 and f16 template specializations +// (template_trowexpanddiv_f32 / template_trowexpanddiv_f16), so the +// f16 col_major path must be tested independently. +// +// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics +// +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s + +// After expansion the original pto.trowexpanddiv must be gone. +// CHECK: func.func @TROWEXPANDDIV_F16_COLMAJOR +// CHECK-NOT: pto.trowexpanddiv ins +// CHECK: pto.vecscope +// The col_major src1 path uses vldas+vldus (unaligned load pipeline). +// CHECK: pto.vldas +// CHECK: pto.vldus +// Broadcast is still used to replicate the scalar across the vector. +// CHECK: pto.vdup +// src0 (row_major) still uses aligned vlds. +// CHECK: pto.vlds +// Core arithmetic: default precision uses hardware vdiv. +// CHECK: pto.vdiv +// Result store: +// CHECK: pto.vsts + +module attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @TROWEXPANDDIV_F16_COLMAJOR() { + // src0: 32x32 matrix (row-major, f16) + %src0 = pto.alloc_tile + : !pto.tile_buf + // src1: 32x1 col_major column — one f16 scalar per row. + %src1 = pto.alloc_tile + : !pto.tile_buf + // dst: 32x32 result (row-major, f16) + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + return + } +} diff --git a/test/lit/vpto/trowexpandmul_tile_op_expand_col_major.pto b/test/lit/vpto/trowexpandmul_tile_op_expand_col_major.pto new file mode 100644 index 000000000..1b758a731 --- /dev/null +++ b/test/lit/vpto/trowexpandmul_tile_op_expand_col_major.pto @@ -0,0 +1,60 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline +// expands pto.trowexpandmul when src1 is a col_major [M,1] scalar column. +// +// The col_major path must use the unaligned load pipeline (vldas+vldus) +// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5. +// +// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics +// +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s + +// After expansion the original pto.trowexpandmul must be gone. +// CHECK: func.func @TROWEXPANDMUL_COLMAJOR +// CHECK-NOT: pto.trowexpandmul ins +// CHECK: pto.vecscope +// The col_major src1 path uses vldas+vldus (unaligned load pipeline). +// CHECK: pto.vldas +// CHECK: pto.vldus +// Broadcast is still used to replicate the scalar across the vector. +// CHECK: pto.vdup +// src0 (row_major) still uses aligned vlds. +// CHECK: pto.vlds +// Core arithmetic: +// CHECK: pto.vmul +// Result store: +// CHECK: pto.vsts + +module attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @TROWEXPANDMUL_COLMAJOR() { + // src0: 32x32 matrix (row-major) + %src0 = pto.alloc_tile + : !pto.tile_buf + // src1: 32x1 col_major column — one scalar per row. + // When src1 is col_major, the template uses vldas+vldus (unaligned) + // instead of vlds to handle the non-512B-aligned slice access. + %src1 = pto.alloc_tile + : !pto.tile_buf + // dst: 32x32 result (row-major) + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.trowexpandmul ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + return + } +} diff --git a/test/lit/vpto/trowexpandsub_tile_op_expand_col_major.pto b/test/lit/vpto/trowexpandsub_tile_op_expand_col_major.pto new file mode 100644 index 000000000..afe44a6f7 --- /dev/null +++ b/test/lit/vpto/trowexpandsub_tile_op_expand_col_major.pto @@ -0,0 +1,60 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline +// expands pto.trowexpandsub when src1 is a col_major [M,1] scalar column. +// +// The col_major path must use the unaligned load pipeline (vldas+vldus) +// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5. +// +// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics +// +// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s + +// After expansion the original pto.trowexpandsub must be gone. +// CHECK: func.func @TROWEXPANDSUB_COLMAJOR +// CHECK-NOT: pto.trowexpandsub ins +// CHECK: pto.vecscope +// The col_major src1 path uses vldas+vldus (unaligned load pipeline). +// CHECK: pto.vldas +// CHECK: pto.vldus +// Broadcast is still used to replicate the scalar across the vector. +// CHECK: pto.vdup +// src0 (row_major) still uses aligned vlds. +// CHECK: pto.vlds +// Core arithmetic: +// CHECK: pto.vsub +// Result store: +// CHECK: pto.vsts + +module attributes {pto.kernel_kind = #pto.kernel_kind} { + func.func @TROWEXPANDSUB_COLMAJOR() { + // src0: 32x32 matrix (row-major) + %src0 = pto.alloc_tile + : !pto.tile_buf + // src1: 32x1 col_major column — one scalar per row. + // When src1 is col_major, the template uses vldas+vldus (unaligned) + // instead of vlds to handle the non-512B-aligned slice access. + %src1 = pto.alloc_tile + : !pto.tile_buf + // dst: 32x32 result (row-major) + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.trowexpandsub ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + return + } +} diff --git a/tilelang-dsl/python/tilelang_dsl/lowering.py b/tilelang-dsl/python/tilelang_dsl/lowering.py index d57dcb60a..e18f1c1fc 100644 --- a/tilelang-dsl/python/tilelang_dsl/lowering.py +++ b/tilelang-dsl/python/tilelang_dsl/lowering.py @@ -202,6 +202,7 @@ def __init__(self, kernel: SemanticKernel): self._constant_cache: dict[tuple[str, object], str] = {} self._castptr_cache: dict[tuple[str, str], str] = {} self._tile_memref_cache: dict[str, _RenderedValue] = {} + self._tile_ptr_cache: dict[str, _RenderedValue] = {} self._tile_valid_dim_cache: dict[tuple[str, int], _RenderedValue] = {} self._used_tile_buffers = self._collect_used_tile_buffers(kernel.body) self._temp_counter = 0 @@ -871,24 +872,42 @@ def _render_multi_result_assign( lines = [] source = self._lower_expr(stmt.value.args[0], env, indent=indent, into=lines) index_args = stmt.value.args[1:-1] + + # Convert tile directly to ptr (not memref) — same as vldas. if isinstance(source.type, SemanticTileType): - source = self._materialize_tile_memref(source, indent=indent, into=lines) + source = self._materialize_tile_ptr(source, indent=indent, into=lines) + if ( isinstance(stmt.value.args[0].type, SemanticTileType) and stmt.value.args[0].type.rank == 2 and len(index_args) == 2 ): - source = self._materialize_rank2_tile_subview( - source, - stmt.value.args[0].type, - index_args, - env, - indent=indent, - into=lines, + orig_tile_type = stmt.value.args[0].type + row_index, col_index = index_args + row_value = self._lower_expr(row_index, env, indent=indent, into=lines) + col_value = self._lower_expr(col_index, env, indent=indent, into=lines) + total_cols = orig_tile_type.shape[1] + total_cols_const = self._materialize_constant( + total_cols, SemanticIndexType() + ) + # offset = row * total_cols + col + row_offset = self._new_temp() + lines.append( + self._indent(indent) + + f"{row_offset} = arith.muli {row_value.name}, {total_cols_const} : index" + ) + elem_offset = self._new_temp() + lines.append( + self._indent(indent) + + f"{elem_offset} = arith.addi {row_offset}, {col_value.name} : index" + ) + ptr_name = self._new_temp() + lines.append( + self._indent(indent) + + f"{ptr_name} = pto.addptr {source.name}, {elem_offset} : " + + f"{self._render_type(source.type)} -> {self._render_type(source.type)}" ) - if self._is_memref_like_type(source.type): - ptr_name, ptr_type = self._materialize_copy_buffer_ptr(source, indent=indent, into=lines) - source = _RenderedValue(name=ptr_name, type=_RenderedTextualType(ptr_type)) + source = _RenderedValue(name=ptr_name, type=source.type) align = self._lower_expr(stmt.value.args[-1], env, indent=indent, into=lines) result_target, align_target = stmt.targets result_type, align_type = stmt.value.type.elements @@ -1868,24 +1887,47 @@ def _lower_call_expr( if expr.name == "vldas": source = self._lower_expr(expr.args[0], env, indent=indent, into=into) index_args = expr.args[1:] + + # Convert tile directly to ptr (not memref) for vldas. + # vldas only accepts !pto.ptr, and the memref→subview→castptr + # path is broken. Instead: tile → ptr via tile_buf_addr, then + # pto.addptr with the element offset computed from the indices. if isinstance(source.type, SemanticTileType): - source = self._materialize_tile_memref(source, indent=indent, into=into) + source = self._materialize_tile_ptr(source, indent=indent, into=into) + if ( isinstance(expr.args[0].type, SemanticTileType) and expr.args[0].type.rank == 2 and len(index_args) == 2 ): - source = self._materialize_rank2_tile_subview( - source, - expr.args[0].type, - index_args, - env, - indent=indent, - into=into, + orig_tile_type = expr.args[0].type + row_index, col_index = index_args + row_value = self._lower_expr(row_index, env, indent=indent, into=into) + col_value = self._lower_expr(col_index, env, indent=indent, into=into) + # uniform offset formula: row * shape[1] + col. + # Works for row_major [M,N] and col_major [M,1] (col=0). + total_cols = orig_tile_type.shape[1] + total_cols_const = self._materialize_constant( + total_cols, SemanticIndexType() ) - if self._is_memref_like_type(source.type): - ptr_name, ptr_type = self._materialize_copy_buffer_ptr(source, indent=indent, into=into) - source = _RenderedValue(name=ptr_name, type=_RenderedTextualType(ptr_type)) + # offset = row * total_cols + col + row_offset = self._new_temp() + into.append( + self._indent(indent) + + f"{row_offset} = arith.muli {row_value.name}, {total_cols_const} : index" + ) + elem_offset = self._new_temp() + into.append( + self._indent(indent) + + f"{elem_offset} = arith.addi {row_offset}, {col_value.name} : index" + ) + ptr_name = self._new_temp() + into.append( + self._indent(indent) + + f"{ptr_name} = pto.addptr {source.name}, {elem_offset} : " + + f"{self._render_type(source.type)} -> {self._render_type(source.type)}" + ) + source = _RenderedValue(name=ptr_name, type=source.type) into.append( self._indent(indent) + f"{result_name} = pto.vldas {source.name} : " @@ -3225,6 +3267,35 @@ def _materialize_copy_buffer_ptr( return value.name, ptr_type + def _materialize_tile_ptr( + self, + value: _RenderedValue, + *, + indent: int, + into: list[str], + ) -> _RenderedValue: + """Convert a SemanticTileType value directly to a typed !pto.ptr. + + Unlike _materialize_tile_memref which produces a memref, this emits + pto.tile_buf_addr with a !pto.ptr<...> result type so the ptr can be + used directly with pto.addptr / pto.vldas / pto.vldus. + """ + existing = self._tile_ptr_cache.get(value.name) + if existing is not None: + return existing + if not isinstance(value.type, SemanticTileType): + return value + ptr_type = self._render_copy_buffer_type(value.type) + ptr_name = self._new_temp() + into.append( + self._indent(indent) + + f"{ptr_name} = pto.tile_buf_addr {value.name} : " + + f"{self._render_type(value.type)} -> {ptr_type}" + ) + rendered = _RenderedValue(name=ptr_name, type=_RenderedTextualType(ptr_type)) + self._tile_ptr_cache[value.name] = rendered + return rendered + def _coerce_rendered_value( self, value: _RenderedValue,