Skip to content

Commit def40a8

Browse files
committed
up
1 parent b4203aa commit def40a8

5 files changed

Lines changed: 147 additions & 31 deletions

File tree

Makefile

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -131,6 +131,7 @@ help:
131131
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
132132
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
133133
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
134+
@echo " qwen3_5_moe-mlx - Build Qwen3.5 MoE runner with MLX backend"
134135
@echo " clean - Clean build artifacts"
135136

136137
voxtral-cuda:
@@ -467,6 +468,15 @@ qwen3_5_moe-metal:
467468
@echo "✓ Build complete!"
468469
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
469470

471+
qwen3_5_moe-mlx:
472+
@echo "==> Building and installing ExecuTorch with MLX..."
473+
cmake --workflow --preset mlx-release
474+
@echo "==> Building Qwen3.5 MoE runner with MLX..."
475+
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx
476+
@echo ""
477+
@echo "✓ Build complete!"
478+
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
479+
470480
clean:
471481
rm -rf cmake-out \
472482
extension/llm/tokenizers/build \

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,14 @@ elseif(EXECUTORCH_BUILD_CUDA)
5454
list(APPEND link_libraries aoti_cuda_backend)
5555
executorch_target_link_options_shared_lib(aoti_cuda_backend)
5656
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
57+
elseif(TARGET mlxdelegate)
58+
list(APPEND link_libraries mlxdelegate mlx)
59+
executorch_target_link_options_shared_lib(mlxdelegate)
60+
add_compile_definitions(EXECUTORCH_BUILD_MLX)
5761
else()
5862
message(
59-
FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
63+
FATAL_ERROR
64+
"Set EXECUTORCH_BUILD_CUDA=ON, EXECUTORCH_BUILD_METAL=ON, or EXECUTORCH_BUILD_MLX=ON"
6065
)
6166
endif()
6267

@@ -74,6 +79,10 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
7479
target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
7580
endif()
7681

82+
if(TARGET mlxdelegate)
83+
executorch_target_copy_mlx_metallib(qwen3_5_moe_runner)
84+
endif()
85+
7786
if(EXECUTORCH_BUILD_CUDA)
7887
enable_testing()
7988
add_executable(

examples/models/qwen3_5_moe/CMakePresets.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,17 @@
3636
"type": "equals",
3737
"rhs": "Darwin"
3838
}
39+
},
40+
{
41+
"name": "qwen3-5-moe-mlx",
42+
"displayName": "Qwen3.5 MoE runner (MLX)",
43+
"inherits": ["qwen3-5-moe-base"],
44+
"cacheVariables": {},
45+
"condition": {
46+
"type": "equals",
47+
"lhs": "${hostSystemName}",
48+
"rhs": "Darwin"
49+
}
3950
}
4051
],
4152
"buildPresets": [
@@ -50,6 +61,12 @@
5061
"displayName": "Build Qwen3.5 MoE runner (Metal)",
5162
"configurePreset": "qwen3-5-moe-metal",
5263
"targets": ["qwen3_5_moe_runner"]
64+
},
65+
{
66+
"name": "qwen3-5-moe-mlx",
67+
"displayName": "Build Qwen3.5 MoE runner (MLX)",
68+
"configurePreset": "qwen3-5-moe-mlx",
69+
"targets": ["qwen3_5_moe_runner"]
5370
}
5471
],
5572
"workflowPresets": [
@@ -80,6 +97,20 @@
8097
"name": "qwen3-5-moe-metal"
8198
}
8299
]
100+
},
101+
{
102+
"name": "qwen3-5-moe-mlx",
103+
"displayName": "Configure and build Qwen3.5 MoE runner (MLX)",
104+
"steps": [
105+
{
106+
"type": "configure",
107+
"name": "qwen3-5-moe-mlx"
108+
},
109+
{
110+
"type": "build",
111+
"name": "qwen3-5-moe-mlx"
112+
}
113+
]
83114
}
84115
]
85116
}

examples/models/qwen3_5_moe/README.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,38 @@ python export.py \
211211
| `--qembedding` | (none) | Embedding quantization: `8w` |
212212
| `--tiny-test` | off | Build tiny model with random weights for CI testing |
213213

214-
### Run (MLX)
214+
### Build (MLX)
215+
216+
Like the CUDA/Metal builds, the `make` target first builds ExecuTorch core with
217+
the MLX backend and then the runner binary. Requires macOS (Darwin) on Apple Silicon.
218+
219+
```bash
220+
make qwen3_5_moe-mlx
221+
```
222+
223+
This builds ExecuTorch with MLX support, then the runner binary at
224+
`cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` (with `mlx.metallib`
225+
copied next to it). Unlike CUDA, the MLX `.pte` is self-contained — no `.ptd`
226+
data file is produced or needed.
227+
228+
### Run (MLX, C++ runner)
229+
230+
The C++ runner takes the MLX `.pte` and a local HuggingFace `tokenizer.json`;
231+
no `--data_path` is needed:
232+
233+
```bash
234+
cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
235+
--model_path ./qwen35_moe_mlx/model.pte \
236+
--tokenizer_path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
237+
--prompt "What is the capital of France?" \
238+
--max_new_tokens 50
239+
```
240+
241+
The MLX export emits a single `forward` method with a dynamic sequence length;
242+
the runner loads it and calls it for both prefill and decode (sampling on the
243+
host), matching the Python runner. See the [Run](#run) section above for the full flag list.
244+
245+
### Run (MLX, Python)
215246

216247
```bash
217248
python -m executorch.examples.models.qwen3_5_moe.run \

examples/models/qwen3_5_moe/qwen35_moe_engine.cpp

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include <cmath>
2020
#include <cstring>
2121

22+
#include <algorithm>
23+
2224
#ifdef EXECUTORCH_BUILD_CUDA
2325
#include <cuda_runtime.h>
2426
#include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
@@ -39,6 +41,20 @@ using SizesType = executorch::aten::SizesType;
3941

4042
namespace {
4143

44+
#ifdef EXECUTORCH_BUILD_MLX
45+
// The MLX export emits a single dynamic-seq `forward` method that handles both
46+
// prefill (T>=2) and decode (T=1). Mirror gemma4_31b's MLX runner, which loads
47+
// and calls `forward` for both phases.
48+
constexpr const char* kPrefillMethod = "forward";
49+
constexpr const char* kDecodeMethod = "forward";
50+
// Prefill is chunked on MLX to cap peak memory and the compiled prefill shape.
51+
constexpr int64_t kPrefillChunkSize = 1024;
52+
#else
53+
// CUDA/Metal exports emit two separate methods.
54+
constexpr const char* kPrefillMethod = "prefill";
55+
constexpr const char* kDecodeMethod = "decode";
56+
#endif
57+
4258
Result<uint64_t> read_sampled_token(
4359
const executorch::aten::Tensor& output,
4460
float temperature) {
@@ -98,8 +114,10 @@ Result<std::unique_ptr<Module>> build_qwen_module(
98114
}
99115
#endif
100116

101-
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("prefill"));
102-
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("decode"));
117+
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod));
118+
if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) {
119+
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod));
120+
}
103121
return module;
104122
}
105123

@@ -240,34 +258,51 @@ class Qwen35MoESession : public LLMSession {
240258
}
241259

242260
stop_.store(false, std::memory_order_relaxed);
243-
std::vector<int64_t> token_data(tokens.begin(), tokens.end());
244-
std::vector<int64_t> pos_data(T);
245-
for (int64_t i = 0; i < T; ++i) {
246-
pos_data[i] = pos_ + i;
247-
}
248-
auto tokens_tensor = from_blob(
249-
token_data.data(),
250-
{1, static_cast<SizesType>(T)},
251-
executorch::aten::ScalarType::Long);
252-
auto pos_tensor = from_blob(
253-
pos_data.data(),
254-
{static_cast<SizesType>(T)},
255-
executorch::aten::ScalarType::Long);
256-
257-
const char* method = (T >= 2) ? "prefill" : "decode";
258-
std::vector<EValue> inputs;
259-
inputs.push_back(tokens_tensor);
260-
inputs.push_back(pos_tensor);
261+
262+
// On MLX, run prefill in fixed-size chunks (caps peak memory and the
263+
// compiled prefill shape). Other backends prefill the whole prompt in one
264+
// pass. Only the final chunk's sampled token is kept; the recurrence/KV
265+
// state from earlier chunks persists via pos_ advancement.
266+
#ifdef EXECUTORCH_BUILD_MLX
267+
const int64_t chunk_size = kPrefillChunkSize;
268+
#else
269+
const int64_t chunk_size = T;
270+
#endif
271+
272+
uint64_t sampled_token = 0;
273+
for (int64_t off = 0; off < T; off += chunk_size) {
274+
const int64_t len = std::min(chunk_size, T - off);
275+
std::vector<int64_t> token_data(
276+
tokens.begin() + off, tokens.begin() + off + len);
277+
std::vector<int64_t> pos_data(len);
278+
for (int64_t i = 0; i < len; ++i) {
279+
pos_data[i] = pos_ + i;
280+
}
281+
auto tokens_tensor = from_blob(
282+
token_data.data(),
283+
{1, static_cast<SizesType>(len)},
284+
executorch::aten::ScalarType::Long);
285+
auto pos_tensor = from_blob(
286+
pos_data.data(),
287+
{static_cast<SizesType>(len)},
288+
executorch::aten::ScalarType::Long);
289+
290+
const char* method = (len >= 2) ? kPrefillMethod : kDecodeMethod;
291+
std::vector<EValue> inputs;
292+
inputs.push_back(tokens_tensor);
293+
inputs.push_back(pos_tensor);
261294
#ifdef EXECUTORCH_BUILD_CUDA
262-
set_temp(first_token_temp);
263-
inputs.push_back(EValue(temp_tensor_));
295+
set_temp(first_token_temp);
296+
inputs.push_back(EValue(temp_tensor_));
264297
#endif
265-
auto sampled =
266-
run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
267-
ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
268-
pending_ = sampled.get();
298+
auto sampled =
299+
run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
300+
ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
301+
sampled_token = sampled.get();
302+
pos_ += len;
303+
}
304+
pending_ = sampled_token;
269305
prev_decode_token_.reset();
270-
pos_ += T;
271306
return Error::Ok;
272307
}
273308

@@ -334,7 +369,7 @@ class Qwen35MoESession : public LLMSession {
334369
inputs.push_back(EValue(temp_tensor_));
335370
#endif
336371
auto sampled =
337-
run_locked("decode", inputs, temperature_, /*sync_after=*/false);
372+
run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false);
338373
ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
339374
pending_ = sampled.get();
340375
prev_decode_token_ = token;

0 commit comments

Comments
 (0)