openvinotoolkit · peterchen-intel · Apr 12, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
@@ -564,12 +564,13 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
             return false;
         }();
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
         const bool disable_moe_opt = GPU_DEBUG_VALUE_OR(config.get_disable_moe_opt(), false);
 
         // MOE: TiledMoeBlock -> GatherMatmuls(compressed) -> MoeOp(compressed) -> MoeOpWithRouting(compressed).
-        // Gated on supports_immad (systolic-only) and oneDNN (required for expert GEMM dispatch).
-        // Note: even though we are already inside `if (supports_immad)`, oneDNN can still be explicitly disabled by the user.
-        if (device_info.supports_immad && config.get_use_onednn()) {
+        // Gated on platforms that oneDNN supports (xe_lp and newer architectures).
+        // Note: even on a platform that oneDNN supports, oneDNN can still be explicitly disabled by the user.
+        if (device_info.arch >= cldnn::gpu_arch::xe_lp && config.get_use_onednn()) {
             manager.register_pass<ov::pass::ConvertTiledMoeBlockToGatherMatmuls>();
 
             // f32 listed because this pass runs before ConvertPrecision (line ~588);
@@ -591,6 +592,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                 manager.register_pass<ov::intel_gpu::FuseMOE3GemmCompressed>();
             }
         }
+#endif // ENABLE_ONEDNN_FOR_GPU
         manager.register_pass<ov::pass::GatedDeltaNetFusion>();
         manager.register_pass<ov::pass::InitNodeInfo>();
         manager.register_pass<EinsumDecomposition>();

@@ -7,6 +7,9 @@
 
 #include "intel_gpu/op/indirect_sdpa.hpp"
 #include "intel_gpu/op/kv_cache.hpp"
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
+#endif // ENABLE_ONEDNN_FOR_GPU
 #include "intel_gpu/op/sdpa.hpp"
 #include "intel_gpu/plugin/remote_context.hpp"
 #include "intel_gpu/primitives/paged_attention.hpp"
@@ -217,13 +220,27 @@ void ExecutionConfig::apply_model_specific_options(const IRemoteContext* context
             m_max_kernels_per_batch = 4;
         }
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
         // Allow using oneDNN for models with LSTMSequence/GRUSequence ops, as it is much more performant than the existing OCL impl.
         // oneDNN is only supported on Gen12 (XeLP) and later architectures.
         if ((ov::is_type<ov::op::v5::LSTMSequence>(op) || ov::is_type<ov::op::v5::GRUSequence>(op)) &&
             info.arch >= cldnn::gpu_arch::xe_lp) {
             m_use_onednn = true;
         }
 
+        // moe_3gemm_fused_compressed uses oneDNN internally for matrix multiplications
+        // (onednn_linear wrappers in moe_3gemm_swiglu_opt.cpp), which requires:
+        //   1. use_onednn=true so create_onednn_engine() is called during program build
+        //      (see program.cpp: lo.enable_onednn_for<lstm_seq/gru_seq> path which makes
+        //       onednn_impls_optimization_attribute non-empty, triggering engine init).
+        //   2. in-order OCL command queue (finalize_impl sets this when use_onednn=true).
+        // Auto-enable this only on architectures with oneDNN support, consistent with
+        // the LSTM/GRU path above, to avoid initializing oneDNN on unsupported devices.
+        if (ov::is_type<ov::intel_gpu::op::MOE3GemmFusedCompressed>(op) &&
+            info.arch >= cldnn::gpu_arch::xe_lp) {
+            m_use_onednn = true;
+        }
+#endif // ENABLE_ONEDNN_FOR_GPU
         if (auto multi_subgraph_op = ov::as_type_ptr<ov::op::util::MultiSubGraphOp>(op)) {
             for (const auto& sub_graph : multi_subgraph_op->get_functions()) {
                 for (auto& sub_op : sub_graph->get_ops()) {