Skip to content

Commit 263bc33

Browse files
committed
up
1 parent eff6821 commit 263bc33

8 files changed

Lines changed: 558 additions & 85 deletions

File tree

Makefile

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx eagle3-cuda eagle3-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -129,6 +129,8 @@ help:
129129
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
130130
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
131131
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
132+
@echo " eagle3-cuda - Build EAGLE-3 speculator runner with CUDA backend"
133+
@echo " eagle3-mlx - Build EAGLE-3 speculator runner with MLX backend"
132134
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
133135
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
134136
@echo " clean - Clean build artifacts"
@@ -457,6 +459,24 @@ gemma4_31b-mlx:
457459
@echo "✓ Build complete!"
458460
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
459461

462+
eagle3-cuda:
463+
@echo "==> Building and installing ExecuTorch with CUDA..."
464+
cmake --workflow --preset llm-release-cuda
465+
@echo "==> Building EAGLE-3 speculator runner with CUDA..."
466+
cd examples/models/eagle3 && cmake --workflow --preset eagle3-cuda
467+
@echo ""
468+
@echo "✓ Build complete!"
469+
@echo " Binary: cmake-out/examples/models/eagle3/eagle3_speculator_runner"
470+
471+
eagle3-mlx:
472+
@echo "==> Building and installing ExecuTorch with MLX..."
473+
cmake --workflow --preset mlx-release
474+
@echo "==> Building EAGLE-3 speculator runner with MLX..."
475+
cd examples/models/eagle3 && cmake --workflow --preset eagle3-mlx
476+
@echo ""
477+
@echo "✓ Build complete!"
478+
@echo " Binary: cmake-out/examples/models/eagle3/eagle3_speculator_runner"
479+
460480
qwen3_5_moe-metal:
461481
@echo "==> Building and installing ExecuTorch with Metal..."
462482
cmake --workflow --preset llm-release-metal

examples/models/eagle3/CMakeLists.txt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,19 @@ list(
4242
extension_flat_tensor
4343
)
4444

45-
# Backend: CUDA (AOTI). The EAGLE-3 speculator export is CUDA-only.
45+
# Backend: CUDA (AOTI) or MLX (one is required; CUDA wins if both exist). CUDA returns greedy ids;
46+
# MLX returns logits and the runner argmaxes + maps draft ids via d2t on the
47+
# host.
4648
if(EXECUTORCH_BUILD_CUDA)
4749
find_package(CUDAToolkit REQUIRED)
4850
list(APPEND link_libraries aoti_cuda_backend)
4951
executorch_target_link_options_shared_lib(aoti_cuda_backend)
5052
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
53+
elseif(TARGET mlxdelegate)
54+
list(APPEND link_libraries mlxdelegate mlx)
55+
executorch_target_link_options_shared_lib(mlxdelegate)
5156
else()
52-
message(FATAL_ERROR "EAGLE-3 speculator runner requires EXECUTORCH_BUILD_CUDA=ON")
57+
message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON")
5358
endif()
5459

5560
# Tokenizer (HuggingFace tokenizer.json)
@@ -67,3 +72,7 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
6772
target_link_options(eagle3_speculator_runner PRIVATE "LINKER:-s")
6873
endif()
6974
endif()
75+
76+
if(TARGET mlxdelegate)
77+
executorch_target_copy_mlx_metallib(eagle3_speculator_runner)
78+
endif()

examples/models/eagle3/CMakePresets.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,21 @@
1616
"string": "${hostSystemName}",
1717
"list": ["Linux", "Windows"]
1818
}
19+
},
20+
{
21+
"name": "eagle3-mlx",
22+
"displayName": "EAGLE-3 speculator runner (MLX)",
23+
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/eagle3",
24+
"cacheVariables": {
25+
"CMAKE_BUILD_TYPE": "Release",
26+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
27+
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
28+
},
29+
"condition": {
30+
"type": "equals",
31+
"lhs": "${hostSystemName}",
32+
"rhs": "Darwin"
33+
}
1934
}
2035
],
2136
"buildPresets": [
@@ -24,6 +39,30 @@
2439
"displayName": "Build EAGLE-3 speculator runner (CUDA)",
2540
"configurePreset": "eagle3-cuda",
2641
"targets": ["eagle3_speculator_runner"]
42+
},
43+
{
44+
"name": "eagle3-mlx",
45+
"displayName": "Build EAGLE-3 speculator runner (MLX)",
46+
"configurePreset": "eagle3-mlx",
47+
"targets": ["eagle3_speculator_runner"]
48+
}
49+
],
50+
"workflowPresets": [
51+
{
52+
"name": "eagle3-cuda",
53+
"displayName": "Configure and build EAGLE-3 speculator runner (CUDA)",
54+
"steps": [
55+
{"type": "configure", "name": "eagle3-cuda"},
56+
{"type": "build", "name": "eagle3-cuda"}
57+
]
58+
},
59+
{
60+
"name": "eagle3-mlx",
61+
"displayName": "Configure and build EAGLE-3 speculator runner (MLX)",
62+
"steps": [
63+
{"type": "configure", "name": "eagle3-mlx"},
64+
{"type": "build", "name": "eagle3-mlx"}
65+
]
2766
}
2867
]
2968
}

examples/models/eagle3/export.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,9 +364,10 @@ def _export_mlx(
364364
strict=True,
365365
)
366366

367+
# Capture d2t before freeing the speculator; baked in as get_d2t below.
368+
d2t_const = spec.draft.d2t.to(torch.long).cpu().contiguous()
367369
del spec
368370
gc.collect()
369-
370371
print("Lowering to ExecuTorch with MLX backend...")
371372
et_prog = to_edge_transform_and_lower(
372373
{"target_forward": target_ep, "draft_decode": draft_ep},
@@ -387,6 +388,10 @@ def _export_mlx(
387388
"get_min_prefill_chunk": 1,
388389
"get_chain_len": chain_len,
389390
"get_draft_vocab_size": draft_vocab_size,
391+
# draft->target vocab map (target_id = draft_id + d2t[draft_id]); the
392+
# MLX draft_decode returns draft-vocab logits, so a logits-consuming
393+
# runner reads this to map proposals back to target ids.
394+
"get_d2t": d2t_const,
390395
"use_kv_cache": True,
391396
"enable_dynamic_shape": True,
392397
},

0 commit comments

Comments
 (0)