Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 106 additions & 48 deletions lib/TileOps/trowexpanddiv_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,35 +60,64 @@ def _constraint_trowexpanddiv_row_major(src0: pto.Tile, src1: pto.Tile, dst: pto
constraints=[_constraint_trowexpanddiv_row_major],
)
def template_trowexpanddiv_f32(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
"""Template for pto.trowexpanddiv with f32 dtype and optional high-precision mode."""

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请补一个st用例,看护这个场景 (English: Please add an ST test case to guard this scenario.)

"""Template for pto.trowexpanddiv with f32 dtype and optional high-precision mode.

When src1 is col_major with shape [M, 1] (a per-row scalar column),
vlds on the col_major tile slice would access UB at non-512B-aligned
addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline)
for src1 in that case; keep the aligned vlds path for row_major src1.
"""
dtype = dst.element_type
valid_rows, valid_cols = dst.valid_shape

precision_type = pto.get_op_attr("precisionType", "default")
if pto.constexpr(precision_type == "high_precision"):
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
# For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32)
# vdup broadcasts the first element to the full vector width
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = _div_ieee754_f32_impl(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
# col_major / high_precision
for row in range(0, valid_rows, 1):
align_src1 = pto.vldas(src1[row, :])
scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
lhs = pto.vlds(src0[row, col:])
result = _div_ieee754_f32_impl(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
# row_major / high_precision
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = _div_ieee754_f32_impl(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vdiv(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
# col_major / default precision
for row in range(0, valid_rows, 1):
align_src1 = pto.vldas(src1[row, :])
scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
lhs = pto.vlds(src0[row, col:])
result = pto.vdiv(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
# row_major / default precision (existing behaviour)
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vdiv(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
return


Expand All @@ -99,33 +128,62 @@ def template_trowexpanddiv_f32(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
constraints=[_constraint_trowexpanddiv_row_major],
)
def template_trowexpanddiv_f16(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
"""Template for pto.trowexpanddiv with f16 dtype and optional high-precision mode."""
"""Template for pto.trowexpanddiv with f16 dtype and optional high-precision mode.

When src1 is col_major with shape [M, 1] (a per-row scalar column),
vlds on the col_major tile slice would access UB at non-512B-aligned
addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline)
for src1 in that case; keep the aligned vlds path for row_major src1.
"""
dtype = dst.element_type
valid_rows, valid_cols = dst.valid_shape

precision_type = pto.get_op_attr("precisionType", "default")
if pto.constexpr(precision_type == "high_precision"):
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
# For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 16 for f16)
# vdup broadcasts the first element to the full vector width
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = _div_ieee754_f16_impl(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
# col_major / high_precision
for row in range(0, valid_rows, 1):
align_src1 = pto.vldas(src1[row, :])
scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
lhs = pto.vlds(src0[row, col:])
result = _div_ieee754_f16_impl(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
# row_major / high_precision
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = _div_ieee754_f16_impl(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vdiv(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
# col_major / default precision
for row in range(0, valid_rows, 1):
align_src1 = pto.vldas(src1[row, :])
scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
lhs = pto.vlds(src0[row, col:])
result = pto.vdiv(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
# row_major / default precision (existing behaviour)
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vdiv(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
return
45 changes: 33 additions & 12 deletions lib/TileOps/trowexpandmul_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,41 @@ def template_trowexpandmul(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):

Multiply each row of src0 by a per-row scalar from src1[row, 0].
Semantics: dst[row, col] = src0[row, col] * src1[row, 0]

When src1 is col_major with shape [M, 1] (a per-row scalar column),
vlds on the col_major tile slice would access UB at non-512B-aligned
addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline)
for src1 in that case; keep the aligned vlds path for row_major src1.
"""
dtype = dst.element_type
valid_rows, valid_cols = dst.valid_shape

for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
# For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32)
# vdup broadcasts the first element to the full vector width
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vmul(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
# ---- col_major [M, 1] path: unaligned load for src1 ----
for row in range(0, valid_rows, 1):
# vldas+vldus once per row, broadcast across all col iterations
align_src1 = pto.vldas(src1[row, :])
scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
lhs = pto.vlds(src0[row, col:])
result = pto.vmul(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
# ---- row_major path: aligned vlds (existing behaviour) ----
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
# For row-major src1, valid_shape[1] is 32/sizeof(dtype)
# (e.g., 8 for f32). vdup broadcasts the first element
# to the full vector width.
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vmul(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
return
45 changes: 33 additions & 12 deletions lib/TileOps/trowexpandsub_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,41 @@ def template_trowexpandsub(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):

Subtract a per-row scalar from src1[row, 0] from each row of src0.
Semantics: dst[row, col] = src0[row, col] - src1[row, 0]

When src1 is col_major with shape [M, 1] (a per-row scalar column),
vlds on the col_major tile slice would access UB at non-512B-aligned
addresses (error 340 on A5). Use vldas+vldus (unaligned load pipeline)
for src1 in that case; keep the aligned vlds path for row_major src1.
"""
dtype = dst.element_type
valid_rows, valid_cols = dst.valid_shape

for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
# For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32)
# vdup broadcasts the first element to the full vector width
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vsub(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
# ---- col_major [M, 1] path: unaligned load for src1 ----
for row in range(0, valid_rows, 1):
# vldas+vldus once per row, broadcast across all col iterations
align_src1 = pto.vldas(src1[row, :])
scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
lhs = pto.vlds(src0[row, col:])
result = pto.vsub(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
else:
# ---- row_major path: aligned vlds (existing behaviour) ----
for row in range(0, valid_rows, 1):
remained = valid_cols
for col in range(0, valid_cols, pto.get_lanes(dtype)):
mask, remained = pto.make_mask(dtype, remained)
# Load the scalar vector from src1[row, :]
# For row-major src1, valid_shape[1] is 32/sizeof(dtype)
# (e.g., 8 for f32). vdup broadcasts the first element
# to the full vector width.
scalar_vec = pto.vlds(src1[row, :])
broadcasted = pto.vdup(scalar_vec, mask)
lhs = pto.vlds(src0[row, col:])
result = pto.vsub(lhs, broadcasted, mask)
pto.vsts(result, dst[row, col:], mask)
return
61 changes: 61 additions & 0 deletions test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (c) 2026 Huawei Technologies Co., Ltd.
// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
// CANN Open Software License Agreement Version 2.0 (the "License").
// Please refer to the License for details. You may not use this file except in compliance with the License.
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
// See LICENSE in the root of the software repository for the full text of the License.

// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline
// expands pto.trowexpanddiv when src1 is a col_major [M,1] scalar column
// with default (hardware) precision.
//
// The col_major path must use the unaligned load pipeline (vldas+vldus)
// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5.
//
// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics
//
// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s

// After expansion the original pto.trowexpanddiv must be gone.
// CHECK: func.func @TROWEXPANDDIV_COLMAJOR
// CHECK-NOT: pto.trowexpanddiv ins
// CHECK: pto.vecscope
// The col_major src1 path uses vldas+vldus (unaligned load pipeline).
// CHECK: pto.vldas
// CHECK: pto.vldus
// Broadcast is still used to replicate the scalar across the vector.
// CHECK: pto.vdup
// src0 (row_major) still uses aligned vlds.
// CHECK: pto.vlds
// Core arithmetic: default precision uses hardware vdiv.
// CHECK: pto.vdiv
// Result store:
// CHECK: pto.vsts

// Test input module, tagged as a vector kernel via the pto.kernel_kind attribute.
module attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// Single test function: takes no arguments, allocates three f32 tiles in the
// vec location, issues one pto.trowexpanddiv on them, and returns nothing.
func.func @TROWEXPANDDIV_COLMAJOR() {
// src0: 32x32 matrix (row-major), fully valid (v_row=32, v_col=32).
%src0 = pto.alloc_tile
: !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
blayout=row_major, slayout=none_box, fractal=512, pad=0>
// src1: 32x1 col_major column, one scalar per row (the per-row divisor).
// Because blayout is col_major here, the template is expected to load
// src1 with vldas+vldus (unaligned) instead of vlds, to handle the
// non-512B-aligned slice access.
%src1 = pto.alloc_tile
: !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
blayout=col_major, slayout=none_box, fractal=512, pad=0>
// dst: 32x32 result (row-major), same tile type as src0.
%dst = pto.alloc_tile
: !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
blayout=row_major, slayout=none_box, fractal=512, pad=0>

// Op under test: src0 and src1 are the inputs, dst is the output. The type
// list repeats the tile_buf types of the three allocations above, in the
// same order (src0, src1, then dst).
pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
blayout=row_major, slayout=none_box, fractal=512, pad=0>,
!pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
blayout=col_major, slayout=none_box, fractal=512, pad=0>)
outs(%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
blayout=row_major, slayout=none_box, fractal=512, pad=0>)
return
}
}
Loading
Loading