joeyye-work · stevenvar · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
@@ -101,6 +101,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_cpu_use_acl(true);
 #endif
   opts.set_xla_cpu_use_fusion_emitters(true);
+  opts.set_xla_cpu_disable_instruction_fusion(false);
   opts.set_xla_cpu_use_thunk_runtime(true);
   opts.set_xla_cpu_use_xnnpack(false);
   opts.set_xla_compile_batch_sizes("");
@@ -985,6 +986,11 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
                 debug_options->xla_cpu_use_fusion_emitters(),
                 "Use fusion emitters for code generation in the CPU backend. "
                 "Note: only works with --xla_cpu_use_thunk_runtime=true."));
+  flag_list->push_back(tsl::Flag(
+      "xla_cpu_disable_instruction_fusion",
+      bool_setter_for(&DebugOptions::set_xla_cpu_disable_instruction_fusion),
+      debug_options->xla_cpu_disable_instruction_fusion(),
+      "Skip the CpuInstructionFusion HLO pass in the CPU backend."));
   flag_list->push_back(
       tsl::Flag("xla_cpu_use_thunk_runtime",
                 bool_setter_for(&DebugOptions::set_xla_cpu_use_thunk_runtime),

diff --git a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.cc b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.cc
@@ -143,6 +143,13 @@ FusionDecision CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
         "Don't fuse instructions from custom fusions/calls");
   }
 
+  if (consumer->GetModule()
+          ->config()
+          .debug_options()
+          .xla_cpu_disable_instruction_fusion()) {
+    return FusionDecision::Forbid("CPU instruction fusion disabled by flag.");
+  }
+
   HloInstruction* producer = consumer->mutable_operand(operand_index);
   VLOG(2) << "Considering for fusion: operand " << operand_index << " of "
           << consumer->ToString();

diff --git a/third_party/xla/xla/service/cpu/dot_op_emitter.cc b/third_party/xla/xla/service/cpu/dot_op_emitter.cc
@@ -124,22 +124,19 @@ bool CanEmitTiledLlvmIrGemm(
     const TargetMachineFeatures& target_machine_features) {
   CHECK(IsAlignedGemm(dot_info, target_machine_features));
 
-  if (ShouldUseMultiThreadedEigen(config)) {
-    return false;
-  }
-
   int m = dot_info.result_shape.dimensions(0);
   int k = dot_info.lhs_shape.dimensions(
       dot_info.dim_nums.lhs_contracting_dimensions(0));
   int n = dot_info.result_shape.dimensions(1);
 
-  if (!options::ForceEnableExperimentalLlvmIrGemm(config)) {
-    // TODO(sanjoy):  We should make these numbers micro-arch specific.
-    bool small_gemm =
-        k <= 128 && ((m <= 32 && n <= 128) || (m <= 128 && n <= 32));
-    if (!small_gemm) {
-      return false;
-    }
+  bool force_tiled_llvm = options::ForceEnableExperimentalLlvmIrGemm(config);
+  // Keep the existing heuristic for which GEMMs are worth lowering to the
+  // tiled LLVM path, but allow those small GEMMs even when multi-threaded
+  // Eigen is enabled since the Eigen runtime overhead can dominate here.
+  bool small_gemm =
+      k <= 128 && ((m <= 32 && n <= 128) || (m <= 128 && n <= 32));
+  if (!force_tiled_llvm && !small_gemm) {
+    return false;
   }
 
   bool lhs_canonical = dot_info.dim_nums.lhs_contracting_dimensions(0) == 1;

diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
@@ -216,6 +216,9 @@ message DebugOptions {
   // If set, XLA:CPU uses "fusion emitters" for codegen.
   bool xla_cpu_use_fusion_emitters = 376;
 
+  // If set, XLA:CPU's CpuInstructionFusion::ShouldFuse forbids fusion.
+  bool xla_cpu_disable_instruction_fusion = 400;
+
   // When true, XLA:CPU uses the thunk runtime to execute compiled program.
   bool xla_cpu_use_thunk_runtime = 298;