diff --git a/lib/TileOps/trowexpanddiv_template.py b/lib/TileOps/trowexpanddiv_template.py
index 888353014..5ffc87835 100644
--- a/lib/TileOps/trowexpanddiv_template.py
+++ b/lib/TileOps/trowexpanddiv_template.py
@@ -60,35 +60,64 @@ def _constraint_trowexpanddiv_row_major(src0: pto.Tile, src1: pto.Tile, dst: pto
     constraints=[_constraint_trowexpanddiv_row_major],
 )
 def template_trowexpanddiv_f32(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
-    """Template for pto.trowexpanddiv with f32 dtype and optional high-precision mode."""
+    """Template for pto.trowexpanddiv with f32 dtype and optional high-precision mode.
+
+    When src1 is col_major with shape [M, 1] (a per-row scalar column),
+    vlds on the col_major tile slice would access UB at non-512B-aligned
+    addresses (error 340 on A5).  Use vldas+vldus (unaligned load pipeline)
+    for src1 in that case; keep the aligned vlds path for row_major src1.
+    """
     dtype = dst.element_type
     valid_rows, valid_cols = dst.valid_shape
-
     precision_type = pto.get_op_attr("precisionType", "default")
     if pto.constexpr(precision_type == "high_precision"):
-        for row in range(0, valid_rows, 1):
-            remained = valid_cols
-            for col in range(0, valid_cols, pto.get_lanes(dtype)):
-                mask, remained = pto.make_mask(dtype, remained)
-                # Load the scalar vector from src1[row, :]
-                # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32)
-                # vdup broadcasts the first element to the full vector width
-                scalar_vec = pto.vlds(src1[row, :])
-                broadcasted = pto.vdup(scalar_vec, mask)
-                lhs = pto.vlds(src0[row, col:])
-                result = _div_ieee754_f32_impl(lhs, broadcasted, mask)
-                pto.vsts(result, dst[row, col:], mask)
+        if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
+            # col_major / high_precision
+            for row in range(0, valid_rows, 1):
+                align_src1 = pto.vldas(src1[row, :])
+                scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
+                broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = _div_ieee754_f32_impl(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
+        else:
+            # row_major / high_precision
+            for row in range(0, valid_rows, 1):
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    scalar_vec = pto.vlds(src1[row, :])
+                    broadcasted = pto.vdup(scalar_vec, mask)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = _div_ieee754_f32_impl(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
     else:
-        for row in range(0, valid_rows, 1):
-            remained = valid_cols
-            for col in range(0, valid_cols, pto.get_lanes(dtype)):
-                mask, remained = pto.make_mask(dtype, remained)
-                # Load the scalar vector from src1[row, :]
-                scalar_vec = pto.vlds(src1[row, :])
-                broadcasted = pto.vdup(scalar_vec, mask)
-                lhs = pto.vlds(src0[row, col:])
-                result = pto.vdiv(lhs, broadcasted, mask)
-                pto.vsts(result, dst[row, col:], mask)
+        if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
+            # col_major / default precision
+            for row in range(0, valid_rows, 1):
+                align_src1 = pto.vldas(src1[row, :])
+                scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
+                broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = pto.vdiv(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
+        else:
+            # row_major / default precision (existing behaviour)
+            for row in range(0, valid_rows, 1):
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    scalar_vec = pto.vlds(src1[row, :])
+                    broadcasted = pto.vdup(scalar_vec, mask)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = pto.vdiv(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
     return
 
 
@@ -99,33 +128,62 @@ def template_trowexpanddiv_f32(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
     constraints=[_constraint_trowexpanddiv_row_major],
 )
 def template_trowexpanddiv_f16(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
-    """Template for pto.trowexpanddiv with f16 dtype and optional high-precision mode."""
+    """Template for pto.trowexpanddiv with f16 dtype and optional high-precision mode.
+
+    When src1 is col_major with shape [M, 1] (a per-row scalar column),
+    vlds on the col_major tile slice would access UB at non-512B-aligned
+    addresses (error 340 on A5).  Use vldas+vldus (unaligned load pipeline)
+    for src1 in that case; keep the aligned vlds path for row_major src1.
+    """
     dtype = dst.element_type
     valid_rows, valid_cols = dst.valid_shape
-
     precision_type = pto.get_op_attr("precisionType", "default")
     if pto.constexpr(precision_type == "high_precision"):
-        for row in range(0, valid_rows, 1):
-            remained = valid_cols
-            for col in range(0, valid_cols, pto.get_lanes(dtype)):
-                mask, remained = pto.make_mask(dtype, remained)
-                # Load the scalar vector from src1[row, :]
-                # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 16 for f16)
-                # vdup broadcasts the first element to the full vector width
-                scalar_vec = pto.vlds(src1[row, :])
-                broadcasted = pto.vdup(scalar_vec, mask)
-                lhs = pto.vlds(src0[row, col:])
-                result = _div_ieee754_f16_impl(lhs, broadcasted, mask)
-                pto.vsts(result, dst[row, col:], mask)
+        if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
+            # col_major / high_precision
+            for row in range(0, valid_rows, 1):
+                align_src1 = pto.vldas(src1[row, :])
+                scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
+                broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = _div_ieee754_f16_impl(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
+        else:
+            # row_major / high_precision
+            for row in range(0, valid_rows, 1):
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    scalar_vec = pto.vlds(src1[row, :])
+                    broadcasted = pto.vdup(scalar_vec, mask)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = _div_ieee754_f16_impl(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
     else:
-        for row in range(0, valid_rows, 1):
-            remained = valid_cols
-            for col in range(0, valid_cols, pto.get_lanes(dtype)):
-                mask, remained = pto.make_mask(dtype, remained)
-                # Load the scalar vector from src1[row, :]
-                scalar_vec = pto.vlds(src1[row, :])
-                broadcasted = pto.vdup(scalar_vec, mask)
-                lhs = pto.vlds(src0[row, col:])
-                result = pto.vdiv(lhs, broadcasted, mask)
-                pto.vsts(result, dst[row, col:], mask)
+        if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
+            # col_major / default precision
+            for row in range(0, valid_rows, 1):
+                align_src1 = pto.vldas(src1[row, :])
+                scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
+                broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = pto.vdiv(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
+        else:
+            # row_major / default precision (existing behaviour)
+            for row in range(0, valid_rows, 1):
+                remained = valid_cols
+                for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                    mask, remained = pto.make_mask(dtype, remained)
+                    scalar_vec = pto.vlds(src1[row, :])
+                    broadcasted = pto.vdup(scalar_vec, mask)
+                    lhs = pto.vlds(src0[row, col:])
+                    result = pto.vdiv(lhs, broadcasted, mask)
+                    pto.vsts(result, dst[row, col:], mask)
     return
diff --git a/lib/TileOps/trowexpandmul_template.py b/lib/TileOps/trowexpandmul_template.py
index 504e22392..cc7e41945 100644
--- a/lib/TileOps/trowexpandmul_template.py
+++ b/lib/TileOps/trowexpandmul_template.py
@@ -57,20 +57,41 @@ def template_trowexpandmul(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
 
     Multiply each row of src0 by a per-row scalar from src1[row, 0].
     Semantics: dst[row, col] = src0[row, col] * src1[row, 0]
+
+    When src1 is col_major with shape [M, 1] (a per-row scalar column),
+    vlds on the col_major tile slice would access UB at non-512B-aligned
+    addresses (error 340 on A5).  Use vldas+vldus (unaligned load pipeline)
+    for src1 in that case; keep the aligned vlds path for row_major src1.
     """
     dtype = dst.element_type
     valid_rows, valid_cols = dst.valid_shape
 
-    for row in range(0, valid_rows, 1):
-        remained = valid_cols
-        for col in range(0, valid_cols, pto.get_lanes(dtype)):
-            mask, remained = pto.make_mask(dtype, remained)
-            # Load the scalar vector from src1[row, :]
-            # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32)
-            # vdup broadcasts the first element to the full vector width
-            scalar_vec = pto.vlds(src1[row, :])
-            broadcasted = pto.vdup(scalar_vec, mask)
-            lhs = pto.vlds(src0[row, col:])
-            result = pto.vmul(lhs, broadcasted, mask)
-            pto.vsts(result, dst[row, col:], mask)
+    if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
+        # ---- col_major [M, 1] path: unaligned load for src1 ----
+        for row in range(0, valid_rows, 1):
+            # vldas+vldus once per row, broadcast across all col iterations
+            align_src1 = pto.vldas(src1[row, :])
+            scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
+            broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
+            remained = valid_cols
+            for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                mask, remained = pto.make_mask(dtype, remained)
+                lhs = pto.vlds(src0[row, col:])
+                result = pto.vmul(lhs, broadcasted, mask)
+                pto.vsts(result, dst[row, col:], mask)
+    else:
+        # ---- row_major path: aligned vlds (existing behaviour) ----
+        for row in range(0, valid_rows, 1):
+            remained = valid_cols
+            for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                mask, remained = pto.make_mask(dtype, remained)
+                # Load the scalar vector from src1[row, :]
+                # For row-major src1, valid_shape[1] is 32/sizeof(dtype)
+                # (e.g., 8 for f32).  vdup broadcasts the first element
+                # to the full vector width.
+                scalar_vec = pto.vlds(src1[row, :])
+                broadcasted = pto.vdup(scalar_vec, mask)
+                lhs = pto.vlds(src0[row, col:])
+                result = pto.vmul(lhs, broadcasted, mask)
+                pto.vsts(result, dst[row, col:], mask)
     return
diff --git a/lib/TileOps/trowexpandsub_template.py b/lib/TileOps/trowexpandsub_template.py
index 22cc94b6e..e8394d6ed 100644
--- a/lib/TileOps/trowexpandsub_template.py
+++ b/lib/TileOps/trowexpandsub_template.py
@@ -57,20 +57,41 @@ def template_trowexpandsub(src0: pto.Tile, src1: pto.Tile, dst: pto.Tile):
 
     Subtract a per-row scalar from src1[row, 0] from each row of src0.
     Semantics: dst[row, col] = src0[row, col] - src1[row, 0]
+
+    When src1 is col_major with shape [M, 1] (a per-row scalar column),
+    vlds on the col_major tile slice would access UB at non-512B-aligned
+    addresses (error 340 on A5).  Use vldas+vldus (unaligned load pipeline)
+    for src1 in that case; keep the aligned vlds path for row_major src1.
     """
     dtype = dst.element_type
     valid_rows, valid_cols = dst.valid_shape
 
-    for row in range(0, valid_rows, 1):
-        remained = valid_cols
-        for col in range(0, valid_cols, pto.get_lanes(dtype)):
-            mask, remained = pto.make_mask(dtype, remained)
-            # Load the scalar vector from src1[row, :]
-            # For row-major src1, valid_shape[1] is 32/sizeof(dtype) (e.g., 8 for f32)
-            # vdup broadcasts the first element to the full vector width
-            scalar_vec = pto.vlds(src1[row, :])
-            broadcasted = pto.vdup(scalar_vec, mask)
-            lhs = pto.vlds(src0[row, col:])
-            result = pto.vsub(lhs, broadcasted, mask)
-            pto.vsts(result, dst[row, col:], mask)
+    if pto.constexpr(src1.config.b_layout == pto.BLayout.COL_MAJOR):
+        # ---- col_major [M, 1] path: unaligned load for src1 ----
+        for row in range(0, valid_rows, 1):
+            # vldas+vldus once per row, broadcast across all col iterations
+            align_src1 = pto.vldas(src1[row, :])
+            scalar_vec, _ = pto.vldus(src1[row, :], align_src1)
+            broadcasted = pto.vdup(scalar_vec, pto.make_mask(dtype, pto.PAT.ALL))
+            remained = valid_cols
+            for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                mask, remained = pto.make_mask(dtype, remained)
+                lhs = pto.vlds(src0[row, col:])
+                result = pto.vsub(lhs, broadcasted, mask)
+                pto.vsts(result, dst[row, col:], mask)
+    else:
+        # ---- row_major path: aligned vlds (existing behaviour) ----
+        for row in range(0, valid_rows, 1):
+            remained = valid_cols
+            for col in range(0, valid_cols, pto.get_lanes(dtype)):
+                mask, remained = pto.make_mask(dtype, remained)
+                # Load the scalar vector from src1[row, :]
+                # For row-major src1, valid_shape[1] is 32/sizeof(dtype)
+                # (e.g., 8 for f32).  vdup broadcasts the first element
+                # to the full vector width.
+                scalar_vec = pto.vlds(src1[row, :])
+                broadcasted = pto.vdup(scalar_vec, mask)
+                lhs = pto.vlds(src0[row, col:])
+                result = pto.vsub(lhs, broadcasted, mask)
+                pto.vsts(result, dst[row, col:], mask)
     return
diff --git a/test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto b/test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto
new file mode 100644
index 000000000..96401d4f5
--- /dev/null
+++ b/test/lit/vpto/trowexpanddiv_tile_op_expand_col_major.pto
@@ -0,0 +1,61 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline
+// expands pto.trowexpanddiv when src1 is a col_major [M,1] scalar column
+// with default (hardware) precision.
+//
+// The col_major path must use the unaligned load pipeline (vldas+vldus)
+// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5.
+//
+// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics
+//
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s
+
+// After expansion the original pto.trowexpanddiv must be gone.
+// CHECK: func.func @TROWEXPANDDIV_COLMAJOR
+// CHECK-NOT: pto.trowexpanddiv ins
+// CHECK: pto.vecscope
+// The col_major src1 path uses vldas+vldus (unaligned load pipeline).
+// CHECK: pto.vldas
+// CHECK: pto.vldus
+// Broadcast is still used to replicate the scalar across the vector.
+// CHECK: pto.vdup
+// src0 (row_major) still uses aligned vlds.
+// CHECK: pto.vlds
+// Core arithmetic: default precision uses hardware vdiv.
+// CHECK: pto.vdiv
+// Result store:
+// CHECK: pto.vsts
+
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @TROWEXPANDDIV_COLMAJOR() {
+    // src0: 32x32 matrix (row-major)
+    %src0 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    // src1: 32x1 col_major column — one scalar per row.
+    // When src1 is col_major, the template uses vldas+vldus (unaligned)
+    // instead of vlds to handle the non-512B-aligned slice access.
+    %src1 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
+                      blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    // dst: 32x32 result (row-major)
+    %dst = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+                               !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
+                                   blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+                       outs(%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    return
+  }
+}
diff --git a/test/lit/vpto/trowexpanddiv_tile_op_expand_f16_col_major.pto b/test/lit/vpto/trowexpanddiv_tile_op_expand_f16_col_major.pto
new file mode 100644
index 000000000..58320cdd6
--- /dev/null
+++ b/test/lit/vpto/trowexpanddiv_tile_op_expand_f16_col_major.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline
+// expands pto.trowexpanddiv when src1 is a col_major [M,1] scalar column
+// with f16 dtype and default (hardware) precision.
+//
+// trowexpanddiv has separate f32 and f16 template specializations
+// (template_trowexpanddiv_f32 / template_trowexpanddiv_f16), so the
+// f16 col_major path must be tested independently.
+//
+// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics
+//
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s
+
+// After expansion the original pto.trowexpanddiv must be gone.
+// CHECK: func.func @TROWEXPANDDIV_F16_COLMAJOR
+// CHECK-NOT: pto.trowexpanddiv ins
+// CHECK: pto.vecscope
+// The col_major src1 path uses vldas+vldus (unaligned load pipeline).
+// CHECK: pto.vldas
+// CHECK: pto.vldus
+// Broadcast is still used to replicate the scalar across the vector.
+// CHECK: pto.vdup
+// src0 (row_major) still uses aligned vlds.
+// CHECK: pto.vlds
+// Core arithmetic: default precision uses hardware vdiv.
+// CHECK: pto.vdiv
+// Result store:
+// CHECK: pto.vsts
+
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @TROWEXPANDDIV_F16_COLMAJOR() {
+    // src0: 32x32 matrix (row-major, f16)
+    %src0 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f16, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    // src1: 32x1 col_major column — one f16 scalar per row.
+    %src1 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f16, rows=32, cols=1, v_row=32, v_col=1,
+                      blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    // dst: 32x32 result (row-major, f16)
+    %dst = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f16, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf<loc=vec, dtype=f16, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+                               !pto.tile_buf<loc=vec, dtype=f16, rows=32, cols=1, v_row=32, v_col=1,
+                                   blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+                       outs(%dst : !pto.tile_buf<loc=vec, dtype=f16, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    return
+  }
+}
diff --git a/test/lit/vpto/trowexpandmul_tile_op_expand_col_major.pto b/test/lit/vpto/trowexpandmul_tile_op_expand_col_major.pto
new file mode 100644
index 000000000..1b758a731
--- /dev/null
+++ b/test/lit/vpto/trowexpandmul_tile_op_expand_col_major.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline
+// expands pto.trowexpandmul when src1 is a col_major [M,1] scalar column.
+//
+// The col_major path must use the unaligned load pipeline (vldas+vldus)
+// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5.
+//
+// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics
+//
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s
+
+// After expansion the original pto.trowexpandmul must be gone.
+// CHECK: func.func @TROWEXPANDMUL_COLMAJOR
+// CHECK-NOT: pto.trowexpandmul ins
+// CHECK: pto.vecscope
+// The col_major src1 path uses vldas+vldus (unaligned load pipeline).
+// CHECK: pto.vldas
+// CHECK: pto.vldus
+// Broadcast is still used to replicate the scalar across the vector.
+// CHECK: pto.vdup
+// src0 (row_major) still uses aligned vlds.
+// CHECK: pto.vlds
+// Core arithmetic:
+// CHECK: pto.vmul
+// Result store:
+// CHECK: pto.vsts
+
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @TROWEXPANDMUL_COLMAJOR() {
+    // src0: 32x32 matrix (row-major)
+    %src0 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    // src1: 32x1 col_major column — one scalar per row.
+    // When src1 is col_major, the template uses vldas+vldus (unaligned)
+    // instead of vlds to handle the non-512B-aligned slice access.
+    %src1 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
+                      blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    // dst: 32x32 result (row-major)
+    %dst = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    pto.trowexpandmul ins(%src0, %src1 : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+                               !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
+                                   blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+                       outs(%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    return
+  }
+}
diff --git a/test/lit/vpto/trowexpandsub_tile_op_expand_col_major.pto b/test/lit/vpto/trowexpandsub_tile_op_expand_col_major.pto
new file mode 100644
index 000000000..afe44a6f7
--- /dev/null
+++ b/test/lit/vpto/trowexpandsub_tile_op_expand_col_major.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Test that ExpandTileOp + InlineLibCall + FoldTileBufIntrinsics pipeline
+// expands pto.trowexpandsub when src1 is a col_major [M,1] scalar column.
+//
+// The col_major path must use the unaligned load pipeline (vldas+vldus)
+// for src1 instead of vlds, to avoid non-512B-aligned UB access on A5.
+//
+// Pipeline: PTOMaterializeTileHandles -> ExpandTileOp -> InlineLibCall -> FoldTileBufIntrinsics
+//
+// RUN: ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto --enable-tile-op-expand %s -o - 2>/dev/null | FileCheck %s
+
+// After expansion the original pto.trowexpandsub must be gone.
+// CHECK: func.func @TROWEXPANDSUB_COLMAJOR
+// CHECK-NOT: pto.trowexpandsub ins
+// CHECK: pto.vecscope
+// The col_major src1 path uses vldas+vldus (unaligned load pipeline).
+// CHECK: pto.vldas
+// CHECK: pto.vldus
+// Broadcast is still used to replicate the scalar across the vector.
+// CHECK: pto.vdup
+// src0 (row_major) still uses aligned vlds.
+// CHECK: pto.vlds
+// Core arithmetic:
+// CHECK: pto.vsub
+// Result store:
+// CHECK: pto.vsts
+
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @TROWEXPANDSUB_COLMAJOR() {
+    // src0: 32x32 matrix (row-major)
+    %src0 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    // src1: 32x1 col_major column — one scalar per row.
+    // When src1 is col_major, the template uses vldas+vldus (unaligned)
+    // instead of vlds to handle the non-512B-aligned slice access.
+    %src1 = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
+                      blayout=col_major, slayout=none_box, fractal=512, pad=0>
+    // dst: 32x32 result (row-major)
+    %dst = pto.alloc_tile
+      : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                      blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    pto.trowexpandsub ins(%src0, %src1 : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+                               !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=1, v_row=32, v_col=1,
+                                   blayout=col_major, slayout=none_box, fractal=512, pad=0>)
+                       outs(%dst : !pto.tile_buf<loc=vec, dtype=f32, rows=32, cols=32, v_row=32, v_col=32,
+                                   blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    return
+  }
+}
diff --git a/tilelang-dsl/python/tilelang_dsl/lowering.py b/tilelang-dsl/python/tilelang_dsl/lowering.py
index d57dcb60a..e18f1c1fc 100644
--- a/tilelang-dsl/python/tilelang_dsl/lowering.py
+++ b/tilelang-dsl/python/tilelang_dsl/lowering.py
@@ -202,6 +202,7 @@ def __init__(self, kernel: SemanticKernel):
         self._constant_cache: dict[tuple[str, object], str] = {}
         self._castptr_cache: dict[tuple[str, str], str] = {}
         self._tile_memref_cache: dict[str, _RenderedValue] = {}
+        self._tile_ptr_cache: dict[str, _RenderedValue] = {}
         self._tile_valid_dim_cache: dict[tuple[str, int], _RenderedValue] = {}
         self._used_tile_buffers = self._collect_used_tile_buffers(kernel.body)
         self._temp_counter = 0
@@ -871,24 +872,42 @@ def _render_multi_result_assign(
             lines = []
             source = self._lower_expr(stmt.value.args[0], env, indent=indent, into=lines)
             index_args = stmt.value.args[1:-1]
+
+            # Convert tile directly to ptr (not memref) — same as vldas.
             if isinstance(source.type, SemanticTileType):
-                source = self._materialize_tile_memref(source, indent=indent, into=lines)
+                source = self._materialize_tile_ptr(source, indent=indent, into=lines)
+
             if (
                 isinstance(stmt.value.args[0].type, SemanticTileType)
                 and stmt.value.args[0].type.rank == 2
                 and len(index_args) == 2
             ):
-                source = self._materialize_rank2_tile_subview(
-                    source,
-                    stmt.value.args[0].type,
-                    index_args,
-                    env,
-                    indent=indent,
-                    into=lines,
+                orig_tile_type = stmt.value.args[0].type
+                row_index, col_index = index_args
+                row_value = self._lower_expr(row_index, env, indent=indent, into=lines)
+                col_value = self._lower_expr(col_index, env, indent=indent, into=lines)
+                total_cols = orig_tile_type.shape[1]
+                total_cols_const = self._materialize_constant(
+                    total_cols, SemanticIndexType()
+                )
+                # offset = row * total_cols + col
+                row_offset = self._new_temp()
+                lines.append(
+                    self._indent(indent)
+                    + f"{row_offset} = arith.muli {row_value.name}, {total_cols_const} : index"
+                )
+                elem_offset = self._new_temp()
+                lines.append(
+                    self._indent(indent)
+                    + f"{elem_offset} = arith.addi {row_offset}, {col_value.name} : index"
+                )
+                ptr_name = self._new_temp()
+                lines.append(
+                    self._indent(indent)
+                    + f"{ptr_name} = pto.addptr {source.name}, {elem_offset} : "
+                    + f"{self._render_type(source.type)} -> {self._render_type(source.type)}"
                 )
-            if self._is_memref_like_type(source.type):
-                ptr_name, ptr_type = self._materialize_copy_buffer_ptr(source, indent=indent, into=lines)
-                source = _RenderedValue(name=ptr_name, type=_RenderedTextualType(ptr_type))
+                source = _RenderedValue(name=ptr_name, type=source.type)
             align = self._lower_expr(stmt.value.args[-1], env, indent=indent, into=lines)
             result_target, align_target = stmt.targets
             result_type, align_type = stmt.value.type.elements
@@ -1868,24 +1887,47 @@ def _lower_call_expr(
         if expr.name == "vldas":
             source = self._lower_expr(expr.args[0], env, indent=indent, into=into)
             index_args = expr.args[1:]
+
+            # Convert tile directly to ptr (not memref) for vldas.
+            # vldas only accepts !pto.ptr, and the memref→subview→castptr
+            # path is broken.  Instead: tile → ptr via tile_buf_addr, then
+            # pto.addptr with the element offset computed from the indices.
             if isinstance(source.type, SemanticTileType):
-                source = self._materialize_tile_memref(source, indent=indent, into=into)
+                source = self._materialize_tile_ptr(source, indent=indent, into=into)
+
             if (
                 isinstance(expr.args[0].type, SemanticTileType)
                 and expr.args[0].type.rank == 2
                 and len(index_args) == 2
             ):
-                source = self._materialize_rank2_tile_subview(
-                    source,
-                    expr.args[0].type,
-                    index_args,
-                    env,
-                    indent=indent,
-                    into=into,
+                orig_tile_type = expr.args[0].type
+                row_index, col_index = index_args
+                row_value = self._lower_expr(row_index, env, indent=indent, into=into)
+                col_value = self._lower_expr(col_index, env, indent=indent, into=into)
+                # uniform offset formula: row * shape[1] + col.
+                # Works for row_major [M,N] and col_major [M,1] (col=0).
+                total_cols = orig_tile_type.shape[1]
+                total_cols_const = self._materialize_constant(
+                    total_cols, SemanticIndexType()
                 )
-            if self._is_memref_like_type(source.type):
-                ptr_name, ptr_type = self._materialize_copy_buffer_ptr(source, indent=indent, into=into)
-                source = _RenderedValue(name=ptr_name, type=_RenderedTextualType(ptr_type))
+                # offset = row * total_cols + col
+                row_offset = self._new_temp()
+                into.append(
+                    self._indent(indent)
+                    + f"{row_offset} = arith.muli {row_value.name}, {total_cols_const} : index"
+                )
+                elem_offset = self._new_temp()
+                into.append(
+                    self._indent(indent)
+                    + f"{elem_offset} = arith.addi {row_offset}, {col_value.name} : index"
+                )
+                ptr_name = self._new_temp()
+                into.append(
+                    self._indent(indent)
+                    + f"{ptr_name} = pto.addptr {source.name}, {elem_offset} : "
+                    + f"{self._render_type(source.type)} -> {self._render_type(source.type)}"
+                )
+                source = _RenderedValue(name=ptr_name, type=source.type)
             into.append(
                 self._indent(indent)
                 + f"{result_name} = pto.vldas {source.name} : "
@@ -3225,6 +3267,35 @@ def _materialize_copy_buffer_ptr(
 
         return value.name, ptr_type
 
+    def _materialize_tile_ptr(
+        self,
+        value: _RenderedValue,
+        *,
+        indent: int,
+        into: list[str],
+    ) -> _RenderedValue:
+        """Convert a SemanticTileType value directly to a typed !pto.ptr.
+
+        Unlike _materialize_tile_memref which produces a memref, this emits
+        pto.tile_buf_addr with a !pto.ptr<...> result type so the ptr can be
+        used directly with pto.addptr / pto.vldas / pto.vldus.
+        """
+        existing = self._tile_ptr_cache.get(value.name)
+        if existing is not None:
+            return existing
+        if not isinstance(value.type, SemanticTileType):
+            return value
+        ptr_type = self._render_copy_buffer_type(value.type)
+        ptr_name = self._new_temp()
+        into.append(
+            self._indent(indent)
+            + f"{ptr_name} = pto.tile_buf_addr {value.name} : "
+            + f"{self._render_type(value.type)} -> {ptr_type}"
+        )
+        rendered = _RenderedValue(name=ptr_name, type=_RenderedTextualType(ptr_type))
+        self._tile_ptr_cache[value.name] = rendered
+        return rendered
+
     def _coerce_rendered_value(
         self,
         value: _RenderedValue,