diff --git a/compiler/plugins/target/LLVMCPU/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/LLVMCPU/builtins/ukernel/BUILD.bazel
index d99ef1863520..7b9a8e1b88bd 100644
--- a/compiler/plugins/target/LLVMCPU/builtins/ukernel/BUILD.bazel
+++ b/compiler/plugins/target/LLVMCPU/builtins/ukernel/BUILD.bazel
@@ -42,6 +42,7 @@ X86_64_AVX512_BF16_COPTS = [
 
 X86_64_AVX512_VNNI_COPTS = [
     "-mavx512f",
+    "-mavx512bw",
     "-mavx512vnni",
 ]
 
diff --git a/compiler/plugins/target/LLVMCPU/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/LLVMCPU/builtins/ukernel/CMakeLists.txt
index c7b3069c4f24..3e33a4794c82 100644
--- a/compiler/plugins/target/LLVMCPU/builtins/ukernel/CMakeLists.txt
+++ b/compiler/plugins/target/LLVMCPU/builtins/ukernel/CMakeLists.txt
@@ -43,6 +43,7 @@ iree_bitcode_library(
     "iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16.c"
   COPTS
     "-mavx512f"
+    "-mavx512bw"
     "-mavx512vnni"
 )
 
diff --git a/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16.c b/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16.c
index 978c47827e58..4a10aa42a282 100644
--- a/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16.c
+++ b/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16.c
@@ -13,40 +13,94 @@
 // intrinsic name verbatim (lowercased, with the `iree_uk_` prefix), in line
 // with the AMDGPU C ukernel convention.
 //
-// The "inner K loop" the ukernel owns is the loop over the K *tiles* that
-// sits *inside* the outer M/N loops; those outer M/N loops are tiled away by
-// ordinary IREE tiling before this ukernel runs. The ukernel handles
-// arbitrary positive `intrinsics_{m,n,k}` (passed as arguments and looped
-// over); the loops fully unroll after the ukernel is inlined into its
-// constant-`intrinsics_*` caller -- the bitcode-LTO equivalent of a C++
-// template.
+// Adapted from the AMDGPU C ukernel
+// `iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8`: same shape — accumulators
+// held in registers, an outer loop over the K dimension, and inside it the
+// `(intrinsics_m, intrinsics_n, intrinsics_k)` unrolling — minus the
+// GPU-specific shared-memory / subgroup machinery.
+//
+// The "inner K loop" the ukernel owns is the loop over the K *tiles*
+// (`k_outer` below) that sits *inside* the outer M/N loops; those outer M/N
+// loops are tiled away by ordinary IREE tiling before this ukernel runs.
+// This is NOT a restriction to `intrinsics_{m,n,k} = 1`: the ukernel handles
+// arbitrary positive `intrinsics_{m,n,k}` via the `for` loops below.
+//
+// `intrinsics_{m,n,k}` are passed as function arguments and so look like
+// runtime values inside this translation unit, but the ukernel is always
+// inlined into its caller (a bug otherwise) and the caller always passes the
+// matching `DataTiledMMAAttr` constants. Together with post-inline IR
+// optimization on the linked bitcode, the `for` loops fully unroll and the
+// `acc_regs` VLA becomes a fixed register array specialized to each call
+// site's `intrinsics_{m,n,k}` — the bitcode-LTO equivalent of C++ templates.
 //
 // ABI: each shaped operand is passed as (base pointer, element offset) so the
-// caller doesn't need a GEP before the call; the accumulator additionally
-// gets the element stride of its innermost cross-intrinsic (N) dimension.
+// caller doesn't need a GEP before the call (the offset is added here); the
+// accumulator additionally gets the element stride of its innermost
+// cross-intrinsic (N) dimension. Offsets/strides are in units of the operand
+// element type (bf16 for LHS/RHS, f32 for ACC).
 //
-// NOTE (seed scaffolding): this initial seed has a stub body. It exists so
-// that the surrounding *framework* -- bitcode build, embedding,
-// `hal.executable_object` injection, IR rewrite to `ukernel.generic` -- can
-// be landed and lit-tested. A follow-up commit replaces the body with the
-// `_mm512_dpbf16_ps`-based inner loop and adds an e2e matmul test for it.
+// Data-tiled operand layout (matching the `DataTiledMMAAttr` swizzle, same as
+// the AMDGPU ukernel):
+//   - ACC: one `__m512` (= M0=1 x N0=16 f32) per (m, n) intrinsic. The (m, n)
+//     grid is row-major with `acc_stride` the element stride of the innermost
+//     cross-intrinsic dim (N), so fragment (m, n) is at
+//     `acc + (m * intrinsics_n + n) * acc_stride`.
+//   - LHS: per outer-K step, `intrinsics_m * intrinsics_k` units of 2 bf16
+//     (= one M0=1 x K0=2 fragment = a 4-byte `vdpbf16ps` m_bcst unit),
+//     ordered [m][k]; consecutive outer-K steps are contiguous.
+//   - RHS: per outer-K step, `intrinsics_n * intrinsics_k` panels of 32 bf16
+//     (= one N0=16 x K0=2 fragment = one `__m512`), ordered [n][k];
+//     consecutive outer-K steps are contiguous.
 IREE_UK_ALWAYS_INLINE
 void iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16(
     const uint16_t *lhs_base, int64_t lhs_offset, const uint16_t *rhs_base,
     int64_t rhs_offset, float *acc_base, int64_t acc_offset, int64_t acc_stride,
     int32_t k_outer, int32_t intrinsics_m, int32_t intrinsics_n,
     int32_t intrinsics_k) {
-  (void)lhs_base;
-  (void)lhs_offset;
-  (void)rhs_base;
-  (void)rhs_offset;
-  (void)acc_base;
-  (void)acc_offset;
-  (void)acc_stride;
-  (void)k_outer;
-  (void)intrinsics_m;
-  (void)intrinsics_n;
-  (void)intrinsics_k;
-  // TODO(ukernels): real inner K loop using `_mm512_dpbf16_ps`, looping over
-  // intrinsics_{m,n,k}.
+  const uint16_t *lhs = lhs_base + lhs_offset;
+  const float *rhs = (const float *)(rhs_base + rhs_offset);
+  float *acc = acc_base + acc_offset;
+
+  // One accumulator register (1x16 f32) per (m, n) intrinsic. The VLA
+  // dimensions are compile-time constants at the inlined call site, so this
+  // lowers to a fixed register array.
+  __m512 acc_regs[intrinsics_m][intrinsics_n];
+  for (int32_t m = 0; m < intrinsics_m; ++m) {
+    for (int32_t n = 0; n < intrinsics_n; ++n) {
+      acc_regs[m][n] =
+          _mm512_loadu_ps(acc + (m * intrinsics_n + n) * acc_stride);
+    }
+  }
+
+  for (int32_t ko = 0; ko < k_outer; ++ko) {
+    const uint16_t *lhs_block =
+        lhs + (int64_t)ko * intrinsics_m * intrinsics_k * 2;
+    const float *rhs_block =
+        rhs + (int64_t)ko * intrinsics_n * intrinsics_k * 16;
+    for (int32_t m = 0; m < intrinsics_m; ++m) {
+      for (int32_t n = 0; n < intrinsics_n; ++n) {
+        for (int32_t k = 0; k < intrinsics_k; ++k) {
+          // LHS fragment: 2 bf16 (one M-row's K-pair) broadcast across the
+          // 16 SIMD lanes via `set1_ps` (the splat shape `vdpbf16ps`'s
+          // m_bcst variant pattern-matches). The bitcast to `__m512bh` is a
+          // width-preserving no-op LLVM elides.
+          __m512 lhs_bcast = _mm512_set1_ps(
+              *(const float *)(lhs_block + (m * intrinsics_k + k) * 2));
+          // RHS fragment: one (N=16 x K=2) bf16 panel = 16 f32.
+          __m512 rhs_panel =
+              _mm512_loadu_ps(rhs_block + (n * intrinsics_k + k) * 16);
+          acc_regs[m][n] =
+              _mm512_dpbf16_ps(acc_regs[m][n], *(const __m512bh *)&lhs_bcast,
+                               *(const __m512bh *)&rhs_panel);
+        }
+      }
+    }
+  }
+
+  for (int32_t m = 0; m < intrinsics_m; ++m) {
+    for (int32_t n = 0; n < intrinsics_n; ++n) {
+      _mm512_storeu_ps(acc + (m * intrinsics_n + n) * acc_stride,
+                       acc_regs[m][n]);
+    }
+  }
 }
diff --git a/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16.c b/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16.c
index 478ea1a9a8d0..d587837a8eaf 100644
--- a/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16.c
+++ b/compiler/plugins/target/LLVMCPU/builtins/ukernel/iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16.c
@@ -13,49 +13,113 @@
 // intrinsic name verbatim (lowercased, with the `iree_uk_` prefix), in line
 // with the AMDGPU C ukernel convention.
 //
-// Implements the inner K-loop for the unrolled (intrinsics_m, intrinsics_n,
-// intrinsics_k) tile built from the 16x16x2 i8 VNNI intrinsic via AVX-512
-// VNNI `vpdpwssd`. The "CASTI16" in the MMA intrinsic name reflects that
-// the s8 inputs are zero/sign-extended into i16 lanes before being fed to
-// the 16-bit VNNI instruction; that cast is handled in the inner loop.
+// Structurally identical to `iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16`:
+// accumulators held in registers, an outer loop over the K *tiles*
+// (`k_outer`), and inside it the `(intrinsics_m, intrinsics_n, intrinsics_k)`
+// unroll. This is NOT a restriction to `intrinsics_{m,n,k} = 1`: the ukernel
+// handles arbitrary positive `intrinsics_{m,n,k}` via the `for` loops below,
+// which fully unroll once inlined into the constant-`intrinsics_*` caller.
 //
-// `intrinsics_{m,n,k}` are passed as function arguments and so look like
-// runtime values inside this translation unit, but the ukernel is always
-// inlined into its caller (a bug otherwise) and the caller always passes
-// the matching `DataTiledMMAAttr` constants. Together with post-inline IR
-// optimization on the linked bitcode, the body specializes to specific
-// compile-time `intrinsics_{m,n,k}` values at each call site.
+// The "CASTI16" in the intrinsic name reflects that the s8 inputs are
+// sign-extended to i16 lanes before being fed to the 16-bit VNNI instruction
+// `vpdpwssd` (the 8-bit `vpdpbusd` would mishandle the s8 x s8 signedness);
+// that widen happens once per panel in the inner loop.
 //
-// NOTE (seed scaffolding): this initial seed has a stub body. It exists so
-// that the surrounding *framework* -- bitcode build, embedding,
-// `hal.executable_object` injection, IR rewrite to `ukernel.generic` -- can
-// be exercised end-to-end. A follow-up commit replaces the body with the
-// `_mm512_dpwssd_epi32`-based inner loop and adds an e2e matmul test for
-// it. This is the "practically useful" seed: i8x i8->i32 via VNNI is a
-// workhorse for quantized inference, and codegen has a residual perf gap
-// on this case.
-// ABI matches the inner_tiled -> ukernel.generic lowering (see
-// `iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16`): each shaped operand passed
-// as (base, element offset), ACC additionally as its innermost
-// cross-intrinsic stride, then the scalar `k_outer` / `intrinsics_{m,n,k}`.
+// ABI: each shaped operand is passed as (base pointer, element offset) so the
+// caller doesn't need a GEP before the call (the offset is added here); the
+// accumulator additionally gets the element stride of its innermost
+// cross-intrinsic (N) dimension. Offsets/strides are in units of the operand
+// element type (i8 for LHS/RHS, i32 for ACC).
+//
+// Per-intrinsic (16x16x2) tile, matching `lowerX86Avx512Vnni16x16x2I8` in
+// IREECPUAttrs.cpp (the codegen path this ukernel must be bit-compatible
+// with) and the `getIntrinsicSwizzle` data layout:
+//   - LHS: one row-major 16x2 i8 panel (= 32 i8 = <32xi8>). Dword `x` of its
+//     i16-widened form holds the (k0, k1) pair of LHS row `x`.
+//   - RHS: one row-major 16x2 i8 panel (= 32 i8), same shape; dword `x` holds
+//     column `x`'s (k0, k1) pair.
+//   - ACC: one 16x16 i32 tile (= 256 i32) in the block-interleaved
+//     (rlo, chi, rhi, clo) order, with row r = 4*rhi + rlo and column
+//     c = 4*chi + clo. The 16 i32 at flat offset (4*rlo + chi)*16 are one
+//     `vpdpwssd` accumulator, whose dword `4*rhi + clo` holds ACC element
+//     (r, c). Fragments (m, n) are `acc_stride` apart along N.
 IREE_UK_ALWAYS_INLINE
 void iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16(
     const void *lhs_base, int64_t lhs_offset, const void *rhs_base,
     int64_t rhs_offset, void *acc_base, int64_t acc_offset, int64_t acc_stride,
     int32_t k_outer, int32_t intrinsics_m, int32_t intrinsics_n,
     int32_t intrinsics_k) {
-  (void)lhs_base;
-  (void)lhs_offset;
-  (void)rhs_base;
-  (void)rhs_offset;
-  (void)acc_base;
-  (void)acc_offset;
-  (void)acc_stride;
-  (void)k_outer;
-  (void)intrinsics_m;
-  (void)intrinsics_n;
-  (void)intrinsics_k;
-  // TODO(ukernels): real inner K loop using `_mm512_dpwssd_epi32` after
-  // widening the s8 LHS/RHS halves to i16 lanes (loop over
-  // intrinsics_{m,n,k} like the bf16 ukernel).
+  const int8_t *lhs = (const int8_t *)lhs_base + lhs_offset;
+  const int8_t *rhs = (const int8_t *)rhs_base + rhs_offset;
+  int32_t *acc = (int32_t *)acc_base + acc_offset;
+
+  // 256 i32 (= 16 __m512i `vpdpwssd` accumulators) per (m, n) intrinsic. The
+  // VLA dimensions are compile-time constants at the inlined call site, so
+  // this lowers to a fixed register array.
+  __m512i acc_regs[intrinsics_m][intrinsics_n][16];
+  for (int32_t m = 0; m < intrinsics_m; ++m) {
+    for (int32_t n = 0; n < intrinsics_n; ++n) {
+      const int32_t *frag = acc + (m * intrinsics_n + n) * acc_stride;
+      for (int c = 0; c < 16; ++c) {
+        acc_regs[m][n][c] = _mm512_loadu_si512(frag + c * 16);
+      }
+    }
+  }
+
+  for (int32_t ko = 0; ko < k_outer; ++ko) {
+    // Each (m, k) / (n, k) fragment is one 16x2 i8 panel = 32 i8.
+    const int8_t *lhs_block =
+        lhs + (int64_t)ko * intrinsics_m * intrinsics_k * 32;
+    const int8_t *rhs_block =
+        rhs + (int64_t)ko * intrinsics_n * intrinsics_k * 32;
+    for (int32_t m = 0; m < intrinsics_m; ++m) {
+      for (int32_t n = 0; n < intrinsics_n; ++n) {
+        __m512i(*regs)[16] = &acc_regs[m][n];
+        for (int32_t k = 0; k < intrinsics_k; ++k) {
+          // Widen each i8 panel to i16 once (one `vpmovsxbw`); dword `x` then
+          // holds the (k0, k1) pair of LHS row / RHS column `x`.
+          __m512i lhs_i16 = _mm512_cvtepi8_epi16(_mm256_loadu_si256(
+              (const __m256i *)(lhs_block + (m * intrinsics_k + k) * 32)));
+          __m512i rhs_i16 = _mm512_cvtepi8_epi16(_mm256_loadu_si256(
+              (const __m256i *)(rhs_block + (n * intrinsics_k + k) * 32)));
+          // lhs_dup[rlo]: `vpshufd` broadcasting dword `4*lane + rlo` across
+          // each 128-bit lane (lane L then holds LHS row 4*L + rlo).
+          // rhs_bcast[chi]: `vbroadcasti32x4` of the 128-bit block of columns
+          // [4*chi, 4*chi+4) to all 4 lanes. The shuffle immediates must be
+          // compile-time constants (`s * 0x55` for s = 0..3), so the 4 cases
+          // are spelled out rather than looped.
+          __m512i lhs_dup[4] = {
+              _mm512_shuffle_epi32(lhs_i16, (_MM_PERM_ENUM)0x00),
+              _mm512_shuffle_epi32(lhs_i16, (_MM_PERM_ENUM)0x55),
+              _mm512_shuffle_epi32(lhs_i16, (_MM_PERM_ENUM)0xAA),
+              _mm512_shuffle_epi32(lhs_i16, (_MM_PERM_ENUM)0xFF),
+          };
+          __m512i rhs_bcast[4] = {
+              _mm512_shuffle_i32x4(rhs_i16, rhs_i16, 0x00),
+              _mm512_shuffle_i32x4(rhs_i16, rhs_i16, 0x55),
+              _mm512_shuffle_i32x4(rhs_i16, rhs_i16, 0xAA),
+              _mm512_shuffle_i32x4(rhs_i16, rhs_i16, 0xFF),
+          };
+          // 16 `vpdpwssd` over the 4x4 (rlo, chi) grid; accumulator (rlo, chi)
+          // lives at flat offset (4*rlo + chi)*16.
+          for (int rlo = 0; rlo < 4; ++rlo) {
+            for (int chi = 0; chi < 4; ++chi) {
+              int idx = 4 * rlo + chi;
+              (*regs)[idx] = _mm512_dpwssd_epi32((*regs)[idx], lhs_dup[rlo],
+                                                 rhs_bcast[chi]);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (int32_t m = 0; m < intrinsics_m; ++m) {
+    for (int32_t n = 0; n < intrinsics_n; ++n) {
+      int32_t *frag = acc + (m * intrinsics_n + n) * acc_stride;
+      for (int c = 0; c < 16; ++c) {
+        _mm512_storeu_si512(frag + c * 16, acc_regs[m][n][c]);
+      }
+    }
+  }
 }
diff --git a/compiler/plugins/target/LLVMCPU/builtins/ukernel/test/select_ukernel.mlir b/compiler/plugins/target/LLVMCPU/builtins/ukernel/test/select_ukernel.mlir
index 2952f41f1868..4699ce342ea6 100644
--- a/compiler/plugins/target/LLVMCPU/builtins/ukernel/test/select_ukernel.mlir
+++ b/compiler/plugins/target/LLVMCPU/builtins/ukernel/test/select_ukernel.mlir
@@ -98,3 +98,46 @@ func.func @bf16_inner_tiled_ukernel_disabled(
 // CHAIN:         iree_codegen.inner_tiled
 // CHAIN-NOT:     iree_codegen.ukernel.generic
 // CHAIN-NOT:     hal.executable.objects
+
+// -----
+
+// Same configuration as `bf16_inner_tiled_ukernel_enabled` but with
+// `intrinsics_m = 2`. Arbitrary positive `intrinsics_{m,n,k}` are
+// supported — the ukernel loops over them — so `selectCPUUKernel` matches
+// this case just like the unit one, and the chained pipeline rewrites it
+// to a `ukernel.generic`.
+#executable_target_enabled_unrolled = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+  cpu_features = "+avx512f,+avx512bf16",
+  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+  iree_codegen.ukernel_provider = #iree_cpu.ukernel_provider,
+  llvm_ukernels = "inner_tiled",
+  native_vector_size = 64 : index,
+  target_triple = "x86_64-unknown-unknown-eabi-elf"
+}>
+
+func.func @bf16_inner_tiled_ukernel_unrolled_accepted(
+    %lhs: tensor<2x4x2x1x2xbf16>, %rhs: tensor<2x4x1x16x2xbf16>, %acc: tensor<2x2x2x1x1x16xf32>
+  ) -> tensor<2x2x2x1x1x16xf32>
+    attributes {hal.executable.target = #executable_target_enabled_unrolled} {
+  %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
+    indexing_maps = [
+      affine_map<(d0, d1, d2) -> (d0, d2)>,
+      affine_map<(d0, d1, d2) -> (d1, d2)>,
+      affine_map<(d0, d1, d2) -> (d0, d1)>
+    ],
+    iterator_types = [#linalg.iterator_type<parallel>,
+                      #linalg.iterator_type<parallel>,
+                      #linalg.iterator_type<reduction>],
+    kind = #iree_cpu.data_tiled_mma_layout<intrinsic = MMA_X86_AVX512BF16_1x16x2_F32_BF16, intrinsics_m = 2>,
+    semantics = #iree_cpu.mma_semantics<>
+  } : tensor<2x4x2x1x2xbf16>, tensor<2x4x1x16x2xbf16> into tensor<2x2x2x1x1x16xf32>
+  return %0 : tensor<2x2x2x1x1x16xf32>
+}
+// CHECK-LABEL: func.func @bf16_inner_tiled_ukernel_unrolled_accepted
+// CHECK:         iree_codegen.inner_tiled
+// CHECK-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16", bitcode>
+
+// CHAIN-LABEL: func.func @bf16_inner_tiled_ukernel_unrolled_accepted
+// CHAIN:         iree_codegen.ukernel.generic
+// CHAIN-SAME:      "iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16"
+// CHAIN-NOT:     iree_codegen.inner_tiled
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectUKernels.cpp
index bb977866e7b5..c52b714e3977 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectUKernels.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectUKernels.cpp
@@ -54,6 +54,10 @@ IREE::Codegen::UKernelDescriptorAttr selectCPUUKernel(Operation *op) {
     return {};
   }
 
+  // Any positive `intrinsics_{m,n,k}` is supported: the ukernel takes them as
+  // arguments and loops over them, and those loops fully unroll after the
+  // ukernel is inlined into its (constant-`intrinsics_*`) caller.
+
   auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(op);
   if (!execTarget) {
     return {};
@@ -74,7 +78,16 @@ IREE::Codegen::UKernelDescriptorAttr selectCPUUKernel(Operation *op) {
   // (`ensureUKernelBitcodeAndFinalizeConfig` in
   // `compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp`)
   // and makes the configuration-pass output self-contained for lit tests.
-  IREE::CPU::attachUKernelBitcodeOnOp(op, name);
+  //
+  // Only select the ukernel if its bitcode actually exists. Not every
+  // `MMAIntrinsic` the cost model picks has a built-in ukernel — e.g. the
+  // M<->N-swapped `MMA_X86_AVX512BF16_16x1x2_F32_BF16` orientation has no
+  // seed even though its natural sibling does — and matching one without
+  // bitcode would dangle an undefined symbol at link time. When absent, fall
+  // back to codegen by returning {}.
+  if (!IREE::CPU::attachUKernelBitcodeOnOp(op, name)) {
+    return {};
+  }
 
   MLIRContext *context = op->getContext();
   return IREE::Codegen::UKernelDescriptorAttr::get(
diff --git a/tests/e2e/matmul/BUILD.bazel b/tests/e2e/matmul/BUILD.bazel
index 0ae01d9384f5..872298a8d993 100644
--- a/tests/e2e/matmul/BUILD.bazel
+++ b/tests/e2e/matmul/BUILD.bazel
@@ -270,6 +270,72 @@ X86_64_AVX512_BF16 = X86_64_AVX512 + [
     ("bf16", "f32"),
 ]]
 
+# LLVMCPU, data-tiling lowering to iree_codegen.inner_tiled AND routing the
+# inner_tiled op through a built-in C-bitcode microkernel via
+# `--iree-llvmcpu-enable-llvm-ukernels=inner_tiled`. Restricted to the
+# avx512bf16 variant, which is the one for which a bf16 ukernel
+# (`iree_uk_mma_x86_avx512bf16_1x16x2_f32_bf16`) exists; this is the
+# numerical end-to-end check that the ukernel (and the generic
+# `intrinsics_{m,n,k}` unrolling + operand threading) computes correct
+# results, complementing the IR-level lit tests under
+# compiler/plugins/target/LLVMCPU/builtins/ukernel/test/.
+iree_generated_e2e_runner_test(
+    name = "e2e_matmul_cpu_dt_inner_tiled_llvm_ukernel_bf16_f32",
+    compiler_flags = [
+        "--iree-opt-data-tiling",
+        "--iree-llvmcpu-enable-inner-tiled",
+        "--iree-llvmcpu-enable-llvm-ukernels=inner_tiled",
+    ],
+    generator = ":generate_e2e_matmul_tests",
+    generator_args = [
+        "--lhs_rhs_type=bf16",
+        "--acc_type=f32",
+    ],
+    tags = [
+        # bf16 triggers internal LLVM assertion errors on riscv and wasm.
+        "noriscv",
+        "nowasm",
+    ],
+    target_backends_and_drivers = [
+        ("llvm-cpu", "local-task"),
+    ],
+    target_cpu_features_variants = [
+        "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
+    ],
+    test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
+    test_type = "matmul",
+)
+
+# Companion of the bf16 test above for the symmetric signed-i8 -> i32 path:
+# routes the data-tiled inner_tiled op through the C-bitcode microkernel
+# `iree_uk_mma_x86_avx512vnni_16x16x2_i32_i8_casti16` (x86 AVX-512 VNNI
+# `vpdpwssd`, with the s8 inputs widened to i16). Restricted to the
+# avx512vnni variant, which is the one for which that ukernel exists; this
+# is the numerical end-to-end check that the ukernel computes correct
+# results, complementing the IR-level lit tests under
+# compiler/plugins/target/LLVMCPU/builtins/ukernel/test/.
+iree_generated_e2e_runner_test(
+    name = "e2e_matmul_cpu_dt_inner_tiled_llvm_ukernel_i8_i32",
+    compiler_flags = [
+        "--iree-opt-data-tiling",
+        "--iree-llvmcpu-enable-inner-tiled",
+        "--iree-llvmcpu-enable-llvm-ukernels=inner_tiled",
+    ],
+    generator = ":generate_e2e_matmul_tests",
+    generator_args = [
+        "--lhs_rhs_type=i8",
+        "--acc_type=i32",
+    ],
+    target_backends_and_drivers = [
+        ("llvm-cpu", "local-task"),
+    ],
+    target_cpu_features_variants = [
+        "x86_64:avx512vnni:" + ",".join(X86_64_AVX512_VNNI),
+    ],
+    test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
+    test_type = "matmul",
+)
+
 # Mixed unsigned-LHS / signed-RHS i8 matmul: exercises x86 AVX-512 VNNI
 # `vpdpbusd`, our first asymmetric MMA. The `generic` variant exercises
 # the standard codegen fallback (no matching MMA intrinsic).
diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt
index 58b4c0921130..e31ef1195635 100644
--- a/tests/e2e/matmul/CMakeLists.txt
+++ b/tests/e2e/matmul/CMakeLists.txt
@@ -714,6 +714,57 @@ iree_generated_e2e_runner_test(
     "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16"
 )
 
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_cpu_dt_inner_tiled_llvm_ukernel_bf16_f32
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=bf16"
+    "--acc_type=f32"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-inner-tiled"
+    "--iree-llvmcpu-enable-llvm-ukernels=inner_tiled"
+  LABELS
+    "noriscv"
+    "nowasm"
+  TARGET_CPU_FEATURES_VARIANTS
+    "x86_64:avx512bf16:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512bf16"
+)
+
+iree_generated_e2e_runner_test(
+  NAME
+    e2e_matmul_cpu_dt_inner_tiled_llvm_ukernel_i8_i32
+  TEST_TYPE
+    matmul
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=i8"
+    "--acc_type=i32"
+  TEST_RUNNER
+    iree_tools_testing_e2e_iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-inner-tiled"
+    "--iree-llvmcpu-enable-llvm-ukernels=inner_tiled"
+  TARGET_CPU_FEATURES_VARIANTS
+    "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni"
+)
+
 iree_generated_e2e_runner_test(
   NAME
     e2e_matmul_cpu_dt_inner_tiled_ui8_i8_i32