pytorch
diff --git a/Makefile b/Makefile
Lines changed: 21 additions & 1 deletion
diff --git a/examples/models/eagle3/CMakeLists.txt b/examples/models/eagle3/CMakeLists.txt
Lines changed: 11 additions & 2 deletions
diff --git a/examples/models/eagle3/CMakePresets.json b/examples/models/eagle3/CMakePresets.json
Lines changed: 39 additions & 0 deletions
diff --git a/examples/models/eagle3/export.py b/examples/models/eagle3/export.py
Lines changed: 6 additions & 1 deletion
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx eagle3-cuda eagle3-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -129,6 +129,8 @@ help:
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
 	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
+	@echo "  eagle3-cuda         - Build EAGLE-3 speculator runner with CUDA backend"
+	@echo "  eagle3-mlx          - Build EAGLE-3 speculator runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  clean               - Clean build artifacts"
@@ -457,6 +459,24 @@ gemma4_31b-mlx:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
 
+eagle3-cuda:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building EAGLE-3 speculator runner with CUDA..."
+	cd examples/models/eagle3 && cmake --workflow --preset eagle3-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/eagle3/eagle3_speculator_runner"
+
+eagle3-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building EAGLE-3 speculator runner with MLX..."
+	cd examples/models/eagle3 && cmake --workflow --preset eagle3-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/eagle3/eagle3_speculator_runner"
+
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
 	cmake --workflow --preset llm-release-metal
 
@@ -42,14 +42,19 @@ list(
   extension_flat_tensor
 )
 
-# Backend: CUDA (AOTI). The EAGLE-3 speculator export is CUDA-only.
+# Backend: CUDA (AOTI) or MLX (at least one required; CUDA wins if both are
+# available). CUDA returns greedy ids; MLX returns logits and the runner
+# argmaxes + maps draft ids via d2t on the host.
 if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
   list(APPEND link_libraries aoti_cuda_backend)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
+elseif(TARGET mlxdelegate)
+  list(APPEND link_libraries mlxdelegate mlx)
+  executorch_target_link_options_shared_lib(mlxdelegate)
 else()
-  message(FATAL_ERROR "EAGLE-3 speculator runner requires EXECUTORCH_BUILD_CUDA=ON")
+  message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON")
 endif()
 
 # Tokenizer (HuggingFace tokenizer.json)
@@ -67,3 +72,7 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
     target_link_options(eagle3_speculator_runner PRIVATE "LINKER:-s")
   endif()
 endif()
+
+# Run the MLX metallib step only when MLX is the backend actually linked into
+# the runner. Backend selection is exclusive and prefers CUDA, so skip this
+# when EXECUTORCH_BUILD_CUDA is set even if the mlxdelegate target exists.
+if(TARGET mlxdelegate AND NOT EXECUTORCH_BUILD_CUDA)
+  executorch_target_copy_mlx_metallib(eagle3_speculator_runner)
+endif()
@@ -16,6 +16,21 @@
                 "string": "${hostSystemName}",
                 "list": ["Linux", "Windows"]
             }
+        },
+        {
+            "name": "eagle3-mlx",
+            "displayName": "EAGLE-3 speculator runner (MLX)",
+            "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/eagle3",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Release",
+                "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+                "CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
+            },
+            "condition": {
+                "type": "equals",
+                "lhs": "${hostSystemName}",
+                "rhs": "Darwin"
+            }
         }
     ],
     "buildPresets": [
@@ -24,6 +39,30 @@
             "displayName": "Build EAGLE-3 speculator runner (CUDA)",
             "configurePreset": "eagle3-cuda",
             "targets": ["eagle3_speculator_runner"]
+        },
+        {
+            "name": "eagle3-mlx",
+            "displayName": "Build EAGLE-3 speculator runner (MLX)",
+            "configurePreset": "eagle3-mlx",
+            "targets": ["eagle3_speculator_runner"]
+        }
+    ],
+    "workflowPresets": [
+        {
+            "name": "eagle3-cuda",
+            "displayName": "Configure and build EAGLE-3 speculator runner (CUDA)",
+            "steps": [
+                {"type": "configure", "name": "eagle3-cuda"},
+                {"type": "build", "name": "eagle3-cuda"}
+            ]
+        },
+        {
+            "name": "eagle3-mlx",
+            "displayName": "Configure and build EAGLE-3 speculator runner (MLX)",
+            "steps": [
+                {"type": "configure", "name": "eagle3-mlx"},
+                {"type": "build", "name": "eagle3-mlx"}
+            ]
         }
     ]
 }
@@ -364,9 +364,10 @@ def _export_mlx(
             strict=True,
         )
 
+    # Capture d2t before freeing the speculator; baked in as get_d2t below.
+    d2t_const = spec.draft.d2t.to(torch.long).cpu().contiguous()
     del spec
     gc.collect()
-
     print("Lowering to ExecuTorch with MLX backend...")
     et_prog = to_edge_transform_and_lower(
         {"target_forward": target_ep, "draft_decode": draft_ep},
@@ -387,6 +388,10 @@ def _export_mlx(
             "get_min_prefill_chunk": 1,
             "get_chain_len": chain_len,
             "get_draft_vocab_size": draft_vocab_size,
+            # draft->target vocab map (target_id = draft_id + d2t[draft_id]); the
+            # MLX draft_decode returns draft-vocab logits, so a logits-consuming
+            # runner reads this to map proposals back to target ids.
+            "get_d2t": d2t_const,
             "use_kv_cache": True,
             "enable_dynamic_shape": True,
         },