From 80000fea643d696004aaf5631a1c808a1bfa325e Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Fri, 23 Jan 2026 16:57:28 -0800
Subject: [PATCH 1/2] Add lit tests; shows initial `bfloat16` support

The "expand to target type" logic added in #231 works for `bfloat16`
types, not just `float16`. This change adds lit tests to show the
general form of the lowering our `convert-triton-cpu-to-llvm` does when
run.
---
 test/Conversion/dot.mlir | 41 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 test/Conversion/dot.mlir

diff --git a/test/Conversion/dot.mlir b/test/Conversion/dot.mlir
new file mode 100644
index 00000000..90431b0f
--- /dev/null
+++ b/test/Conversion/dot.mlir
@@ -0,0 +1,41 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-cpu-to-llvm | FileCheck %s
+
+#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "cpu", "ttg.threads-per-warp" = 1 : i32} {
+  tt.func public @kernel(
+    %a: tensor<2x4xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+    %b: tensor<4x2xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+    %c: tensor<2x2xf32, #blocked>) -> tensor<2x2xf32, #blocked> attributes {noinline = false} {
+    %d = tt.dot %a, %b, %c :
+      tensor<2x4xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> *
+      tensor<4x2xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> ->
+      tensor<2x2xf32, #blocked>
+    // COM: The lowering should repeat the pattern below several times; the checks that follow only need to match one occurrence of it.
+    // CHECK: [[A:%.*]] = llvm.fpext {{%.*}} : f16 to f32
+    // CHECK: [[B:%.*]] = llvm.fpext {{%.*}} : f16 to f32
+    // CHECK: [[MUL:%.*]] = llvm.fmul [[A]], [[B]] : f32
+    // CHECK: {{%.*}} = llvm.fadd {{%.*}}, [[MUL]] : f32
+    tt.return %d : tensor<2x2xf32, #blocked>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "cpu", "ttg.threads-per-warp" = 1 : i32} {
+  tt.func public @kernel(
+    %a: tensor<2x4xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+    %b: tensor<4x2xbf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+    %c: tensor<2x2xf32, #blocked>) -> tensor<2x2xf32, #blocked> attributes {noinline = false} {
+    %d = tt.dot %a, %b, %c :
+      tensor<2x4xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> *
+      tensor<4x2xbf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> ->
+      tensor<2x2xf32, #blocked>
+    // COM: The lowering should repeat the pattern below several times; the checks that follow only need to match one occurrence of it.
+    // CHECK: [[A:%.*]] = llvm.fpext {{%.*}} : bf16 to f32
+    // CHECK: [[B:%.*]] = llvm.fpext {{%.*}} : bf16 to f32
+    // CHECK: [[MUL:%.*]] = llvm.fmul [[A]], [[B]] : f32
+    // CHECK: {{%.*}} = llvm.fadd {{%.*}}, [[MUL]] : f32
+    tt.return %d : tensor<2x2xf32, #blocked>
+  }
+}
From d250c84238d957c17575d30a2249d6a03e4a33bd Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 28 Jan 2026 10:49:36 -0800
Subject: [PATCH 2/2] review: add --canonicalize pass

---
 test/Conversion/dot.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Conversion/dot.mlir b/test/Conversion/dot.mlir
index 90431b0f..a0b59ffd 100644
--- a/test/Conversion/dot.mlir
+++ b/test/Conversion/dot.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --convert-triton-cpu-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file --convert-triton-cpu-to-llvm --canonicalize | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "cpu", "ttg.threads-per-warp" = 1 : i32} {