up

metascroy · metascroy · commit def40a86148c · 2026-06-17T18:05:35.000-07:00
diff --git a/Makefile b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -131,6 +131,7 @@ help:
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
+	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
 	@echo "  clean               - Clean build artifacts"
 
 voxtral-cuda:
@@ -467,6 +468,15 @@ qwen3_5_moe-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
+qwen3_5_moe-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building Qwen3.5 MoE runner with MLX..."
+	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+
 clean:
 	rm -rf cmake-out \
 	       extension/llm/tokenizers/build \
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -54,9 +54,14 @@ elseif(EXECUTORCH_BUILD_CUDA)
   list(APPEND link_libraries aoti_cuda_backend)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
+elseif(TARGET mlxdelegate)
+  list(APPEND link_libraries mlxdelegate mlx)
+  executorch_target_link_options_shared_lib(mlxdelegate)
+  add_compile_definitions(EXECUTORCH_BUILD_MLX)
 else()
   message(
-    FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
+    FATAL_ERROR
+    "Set EXECUTORCH_BUILD_CUDA=ON, EXECUTORCH_BUILD_METAL=ON, or EXECUTORCH_BUILD_MLX=ON"
   )
 endif()
 
@@ -74,6 +79,10 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
 endif()
 
+if(TARGET mlxdelegate)
+  executorch_target_copy_mlx_metallib(qwen3_5_moe_runner)
+endif()
+
 if(EXECUTORCH_BUILD_CUDA)
   enable_testing()
   add_executable(
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
@@ -36,6 +36,17 @@
                 "type": "equals",
                 "rhs": "Darwin"
             }
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Qwen3.5 MoE runner (MLX)",
+            "inherits": ["qwen3-5-moe-base"],
+            "cacheVariables": {},
+            "condition": {
+                "type": "equals",
+                "lhs": "${hostSystemName}",
+                "rhs": "Darwin"
+            }
         }
     ],
     "buildPresets": [
@@ -50,6 +61,12 @@
             "displayName": "Build Qwen3.5 MoE runner (Metal)",
             "configurePreset": "qwen3-5-moe-metal",
             "targets": ["qwen3_5_moe_runner"]
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Build Qwen3.5 MoE runner (MLX)",
+            "configurePreset": "qwen3-5-moe-mlx",
+            "targets": ["qwen3_5_moe_runner"]
         }
     ],
     "workflowPresets": [
@@ -80,6 +97,20 @@
                     "name": "qwen3-5-moe-metal"
                 }
             ]
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Configure and build Qwen3.5 MoE runner (MLX)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "qwen3-5-moe-mlx"
+                },
+                {
+                    "type": "build",
+                    "name": "qwen3-5-moe-mlx"
+                }
+            ]
         }
     ]
 }
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -211,7 +211,38 @@ python export.py \
 | `--qembedding` | (none) | Embedding quantization: `8w` |
 | `--tiny-test` | off | Build tiny model with random weights for CI testing |
 
-### Run (MLX)
+### Build (MLX)
+
+Like the CUDA/Metal builds, the `make` target builds both ExecuTorch core (with
+the MLX backend) and the runner binary. Requires macOS (Darwin) on Apple Silicon.
+
+```bash
+make qwen3_5_moe-mlx
+```
+
+This builds ExecuTorch with MLX support, then the runner binary at
+`cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` (with `mlx.metallib`
+copied next to it). Unlike CUDA, the MLX `.pte` is self-contained — no `.ptd`
+data file is produced or needed.
+
+### Run (MLX, C++ runner)
+
+The C++ runner needs only the MLX `.pte` and a local HuggingFace
+`tokenizer.json`; do not pass `--data_path`:
+
+```bash
+cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
+    --model_path ./qwen35_moe_mlx/model.pte \
+    --tokenizer_path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --prompt "What is the capital of France?" \
+    --max_new_tokens 50
+```
+
+The MLX export emits a single dynamic-seq `forward` method; the runner loads and
+calls it for both prefill and decode (sampling on host), matching the Python
+runner. See the [Run](#run) section above for the full flag list.
+
+### Run (MLX, Python)
 
 ```bash
 python -m executorch.examples.models.qwen3_5_moe.run \
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -19,6 +19,8 @@
 #include <cmath>
 #include <cstring>
 
+#include <algorithm>
+
 #ifdef EXECUTORCH_BUILD_CUDA
 #include <cuda_runtime.h>
 #include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
@@ -39,6 +41,20 @@ using SizesType = executorch::aten::SizesType;
 
 namespace {
 
+#ifdef EXECUTORCH_BUILD_MLX
+// The MLX export emits a single dynamic-seq `forward` method that handles both
+// prefill (T>=2) and decode (T=1). Mirror gemma4_31b's MLX runner, which loads
+// and calls `forward` for both phases.
+constexpr const char* kPrefillMethod = "forward";
+constexpr const char* kDecodeMethod = "forward";
+// Prefill is chunked on MLX to cap peak memory and the compiled prefill shape.
+constexpr int64_t kPrefillChunkSize = 1024;
+#else
+// CUDA/Metal exports emit two separate methods.
+constexpr const char* kPrefillMethod = "prefill";
+constexpr const char* kDecodeMethod = "decode";
+#endif
+
 Result<uint64_t> read_sampled_token(
     const executorch::aten::Tensor& output,
     float temperature) {
@@ -98,8 +114,10 @@ Result<std::unique_ptr<Module>> build_qwen_module(
   }
 #endif
 
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("prefill"));
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("decode"));
+  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod));
+  if (std::string(kDecodeMethod) != std::string(kPrefillMethod)) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod));
+  }
   return module;
 }
 
@@ -240,34 +258,51 @@ class Qwen35MoESession : public LLMSession {
     }
 
     stop_.store(false, std::memory_order_relaxed);
-    std::vector<int64_t> token_data(tokens.begin(), tokens.end());
-    std::vector<int64_t> pos_data(T);
-    for (int64_t i = 0; i < T; ++i) {
-      pos_data[i] = pos_ + i;
-    }
-    auto tokens_tensor = from_blob(
-        token_data.data(),
-        {1, static_cast<SizesType>(T)},
-        executorch::aten::ScalarType::Long);
-    auto pos_tensor = from_blob(
-        pos_data.data(),
-        {static_cast<SizesType>(T)},
-        executorch::aten::ScalarType::Long);
-
-    const char* method = (T >= 2) ? "prefill" : "decode";
-    std::vector<EValue> inputs;
-    inputs.push_back(tokens_tensor);
-    inputs.push_back(pos_tensor);
+
+    // On MLX, run prefill in fixed-size chunks (caps peak memory and the
+    // compiled prefill shape). Other backends prefill the whole prompt in one
+    // pass. Only the final chunk's sampled token is kept; the recurrence/KV
+    // state from earlier chunks persists via pos_ advancement.
+#ifdef EXECUTORCH_BUILD_MLX
+    const int64_t chunk_size = kPrefillChunkSize;
+#else
+    const int64_t chunk_size = T;
+#endif
+
+    uint64_t sampled_token = 0;
+    for (int64_t off = 0; off < T; off += chunk_size) {
+      const int64_t len = std::min(chunk_size, T - off);
+      std::vector<int64_t> token_data(
+          tokens.begin() + off, tokens.begin() + off + len);
+      std::vector<int64_t> pos_data(len);
+      for (int64_t i = 0; i < len; ++i) {
+        pos_data[i] = pos_ + i;
+      }
+      auto tokens_tensor = from_blob(
+          token_data.data(),
+          {1, static_cast<SizesType>(len)},
+          executorch::aten::ScalarType::Long);
+      auto pos_tensor = from_blob(
+          pos_data.data(),
+          {static_cast<SizesType>(len)},
+          executorch::aten::ScalarType::Long);
+
+      const char* method = (len >= 2) ? kPrefillMethod : kDecodeMethod;
+      std::vector<EValue> inputs;
+      inputs.push_back(tokens_tensor);
+      inputs.push_back(pos_tensor);
 #ifdef EXECUTORCH_BUILD_CUDA
-    set_temp(first_token_temp);
-    inputs.push_back(EValue(temp_tensor_));
+      set_temp(first_token_temp);
+      inputs.push_back(EValue(temp_tensor_));
 #endif
-    auto sampled =
-        run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
-    ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
-    pending_ = sampled.get();
+      auto sampled =
+          run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
+      ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+      sampled_token = sampled.get();
+      pos_ += len;
+    }
+    pending_ = sampled_token;
     prev_decode_token_.reset();
-    pos_ += T;
     return Error::Ok;
   }
 
@@ -334,7 +369,7 @@ class Qwen35MoESession : public LLMSession {
     inputs.push_back(EValue(temp_tensor_));
 #endif
     auto sampled =
-        run_locked("decode", inputs, temperature_, /*sync_after=*/false);
+        run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false);
     ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
     pending_ = sampled.get();
     prev_decode_token_ = token;