From 80000fea643d696004aaf5631a1c808a1bfa325e Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Fri, 23 Jan 2026 16:57:28 -0800
Subject: [PATCH 1/2] Add lit tests showing initial `bfloat16` support

The "expand to target type" logic added in #231 works for `bfloat16`
types, not just `float16`. This change adds lit tests that show the
general form of the lowering performed by our
`convert-triton-cpu-to-llvm` pass.
---
 test/Conversion/dot.mlir | 41 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 test/Conversion/dot.mlir

diff --git a/test/Conversion/dot.mlir b/test/Conversion/dot.mlir
new file mode 100644
index 00000000..90431b0f
--- /dev/null
+++ b/test/Conversion/dot.mlir
@@ -0,0 +1,41 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-cpu-to-llvm | FileCheck %s
+
+#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "cpu", "ttg.threads-per-warp" = 1 : i32} {
+  tt.func public @kernel(
+    %a: tensor<2x4xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+    %b: tensor<4x2xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+    %c: tensor<2x2xf32, #blocked>) -> tensor<2x2xf32, #blocked> attributes {noinline = false} {
+    %d = tt.dot %a, %b, %c :
+        tensor<2x4xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> *
+        tensor<4x2xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> ->
+        tensor<2x2xf32, #blocked>
+    // COM: f16 `tt.dot` case: expect many repetitions of this pattern (extend two f16 values to f32, multiply them, add the product):
+    // CHECK: [[A:%.*]] = llvm.fpext {{%.*}} : f16 to f32
+    // CHECK: [[B:%.*]] = llvm.fpext {{%.*}} : f16 to f32
+    // CHECK: [[MUL:%.*]] = llvm.fmul [[A]], [[B]] : f32
+    // CHECK: {{%.*}} = llvm.fadd {{%.*}}, [[MUL]] : f32
+    tt.return %d : tensor<2x2xf32, #blocked>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "cpu", "ttg.threads-per-warp" = 1 : i32} {
+  tt.func public @kernel(
+    %a: tensor<2x4xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>,
+    %b: tensor<4x2xbf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>,
+    %c: tensor<2x2xf32, #blocked>) -> tensor<2x2xf32, #blocked> attributes {noinline = false} {
+    %d = tt.dot %a, %b, %c :
+        tensor<2x4xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> *
+        tensor<4x2xbf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> ->
+        tensor<2x2xf32, #blocked>
+    // COM: bf16 `tt.dot` case: expect many repetitions of this pattern (extend two bf16 values to f32, multiply them, add the product):
+    // CHECK: [[A:%.*]] = llvm.fpext {{%.*}} : bf16 to f32
+    // CHECK: [[B:%.*]] = llvm.fpext {{%.*}} : bf16 to f32
+    // CHECK: [[MUL:%.*]] = llvm.fmul [[A]], [[B]] : f32
+    // CHECK: {{%.*}} = llvm.fadd {{%.*}}, [[MUL]] : f32
+    tt.return %d : tensor<2x2xf32, #blocked>
+  }
+}

From d250c84238d957c17575d30a2249d6a03e4a33bd Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 28 Jan 2026 10:49:36 -0800
Subject: [PATCH 2/2] review: add --canonicalize pass

---
 test/Conversion/dot.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Conversion/dot.mlir b/test/Conversion/dot.mlir
index 90431b0f..a0b59ffd 100644
--- a/test/Conversion/dot.mlir
+++ b/test/Conversion/dot.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --convert-triton-cpu-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file --convert-triton-cpu-to-llvm --canonicalize | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [1, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "cpu", "ttg.threads-per-warp" = 1 : i32} {