Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
a3a42e4
Update
manuelcandales Apr 14, 2026
1c965c6
Update
manuelcandales Apr 14, 2026
1be53ab
Update
manuelcandales Apr 14, 2026
47cbe76
Update
manuelcandales Apr 14, 2026
805a09d
Update
manuelcandales Apr 14, 2026
5306c5a
Update
manuelcandales Apr 14, 2026
638edaa
Update
manuelcandales Apr 14, 2026
ca524a8
Update
manuelcandales Apr 14, 2026
958712e
Update
manuelcandales Apr 14, 2026
eba74c4
Update
manuelcandales Apr 14, 2026
c9ecdde
Update
manuelcandales Apr 14, 2026
c222005
Update
manuelcandales Apr 14, 2026
982d0d9
Update
manuelcandales Apr 14, 2026
e7a7acc
Update
manuelcandales Apr 14, 2026
5530242
Update
manuelcandales Apr 14, 2026
59f88db
Update
manuelcandales Apr 14, 2026
1fbb94f
Update
manuelcandales Apr 14, 2026
60ca500
Update
manuelcandales Apr 14, 2026
d70d646
Update
manuelcandales Apr 14, 2026
d80da37
Update
manuelcandales Apr 14, 2026
598c58f
Update
manuelcandales Apr 14, 2026
f8ff857
Update
manuelcandales Apr 16, 2026
ae7a13e
Update
manuelcandales Apr 16, 2026
4632a83
Update
manuelcandales Apr 20, 2026
98d2f81
Update
manuelcandales Apr 20, 2026
95fb7f9
Update
manuelcandales Apr 20, 2026
440f7fc
Update
manuelcandales Apr 20, 2026
525e67b
Update
manuelcandales Apr 20, 2026
33ce3f5
Update
manuelcandales Apr 20, 2026
f4f616e
Update
manuelcandales Apr 20, 2026
b8e1201
Update
manuelcandales Apr 20, 2026
9ce837a
Update
manuelcandales Apr 20, 2026
bd12247
Update
manuelcandales Apr 20, 2026
9bf4c74
Update
manuelcandales Apr 20, 2026
248115a
Update
manuelcandales Apr 20, 2026
ee865c3
Update
manuelcandales Apr 20, 2026
36d45ef
Update
manuelcandales Apr 20, 2026
08a9fa2
Update
manuelcandales Apr 20, 2026
88e24e1
Update
manuelcandales Apr 20, 2026
9000488
Update
manuelcandales Apr 20, 2026
a060d19
Update
manuelcandales Apr 20, 2026
01c3ce5
Update
manuelcandales Apr 20, 2026
0c1a88b
Update
manuelcandales Apr 20, 2026
2c56804
Update
manuelcandales Apr 20, 2026
7b480b3
Update
manuelcandales Apr 20, 2026
68672d8
Update
manuelcandales Apr 20, 2026
933122c
Update
manuelcandales Apr 20, 2026
9def0ed
Update
manuelcandales Apr 20, 2026
01ecf6a
Update
manuelcandales Apr 20, 2026
1766789
Update
manuelcandales Apr 20, 2026
21057d6
Update
manuelcandales Apr 20, 2026
b2b34e5
Update
manuelcandales Apr 20, 2026
7423226
Update
manuelcandales Apr 20, 2026
4b791ea
Update
manuelcandales Apr 20, 2026
ff92256
Update
manuelcandales Apr 20, 2026
b9b75e3
Update
manuelcandales Apr 20, 2026
d761fdb
Update
manuelcandales Apr 20, 2026
f8ebcfb
Update
manuelcandales Apr 21, 2026
4cf31c8
Update
manuelcandales Apr 21, 2026
ba0e56e
Update
manuelcandales Apr 21, 2026
3285bb2
Update
manuelcandales Apr 21, 2026
187e4f5
Update
manuelcandales Apr 21, 2026
23bec62
Update
manuelcandales Apr 21, 2026
f031916
Update
manuelcandales Apr 21, 2026
c53ecc6
Update
manuelcandales Apr 21, 2026
3b7f7ce
Update
manuelcandales Apr 21, 2026
f697e84
Update
manuelcandales Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu qwen3_5_moe-cuda qwen3_5_moe-metal clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
Expand Down Expand Up @@ -125,6 +125,7 @@ help:
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
@echo " clean - Clean build artifacts"

voxtral-cuda:
Expand Down Expand Up @@ -404,6 +405,15 @@ qwen3_5_moe-cuda:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"

# qwen3_5_moe-metal: two-step build of the Qwen3.5 MoE runner for the Metal backend.
#   1. Run the top-level `llm-release-metal` CMake workflow preset (announced
#      below as building and installing ExecuTorch with Metal).
#   2. Run the `qwen3-5-moe-metal` CMake workflow preset from
#      examples/models/qwen3_5_moe to build the runner itself.
# The trailing echo lines print the path where the runner binary is expected.
qwen3_5_moe-metal:
@echo "==> Building and installing ExecuTorch with Metal..."
cmake --workflow --preset llm-release-metal
@echo "==> Building Qwen3.5 MoE runner with Metal..."
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-metal
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"

clean:
rm -rf cmake-out \
extension/llm/tokenizers/build \
Expand Down
18 changes: 14 additions & 4 deletions examples/models/qwen3_5_moe/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,20 @@ list(
extension_flat_tensor
)

# CUDA backend (required)
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda_backend)
executorch_target_link_options_shared_lib(aoti_cuda_backend)
# Backend selection
# Exactly one branch is taken. EXECUTORCH_BUILD_METAL is tested first, so Metal
# wins if both EXECUTORCH_BUILD_METAL and EXECUTORCH_BUILD_CUDA are set.
if(EXECUTORCH_BUILD_METAL)
list(APPEND link_libraries metal_backend)
executorch_target_link_options_shared_lib(metal_backend)
elseif(EXECUTORCH_BUILD_CUDA)
# The CUDA toolkit is only required when the CUDA backend is selected.
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda_backend)
executorch_target_link_options_shared_lib(aoti_cuda_backend)
# Expose the choice to the compiler as a preprocessor definition so sources
# can guard CUDA-only code with #ifdef EXECUTORCH_BUILD_CUDA.
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
else()
# Neither backend was requested: abort configuration with a hint.
message(
FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
)
endif()

# Tokenizer
list(APPEND link_libraries tokenizers::tokenizers)
Expand Down
33 changes: 33 additions & 0 deletions examples/models/qwen3_5_moe/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@
"string": "${hostSystemName}",
"list": ["Linux", "Windows"]
}
},
{
"name": "qwen3-5-moe-metal",
"displayName": "Qwen3.5 MoE runner (Metal)",
"inherits": ["qwen3-5-moe-base"],
"cacheVariables": {
"EXECUTORCH_BUILD_METAL": "ON"
},
"condition": {
"lhs": "${hostSystemName}",
"type": "equals",
"rhs": "Darwin"
}
}
],
"buildPresets": [
Expand All @@ -31,6 +44,12 @@
"displayName": "Build Qwen3.5 MoE runner (CUDA)",
"configurePreset": "qwen3-5-moe-cuda",
"targets": ["qwen3_5_moe_runner"]
},
{
"name": "qwen3-5-moe-metal",
"displayName": "Build Qwen3.5 MoE runner (Metal)",
"configurePreset": "qwen3-5-moe-metal",
"targets": ["qwen3_5_moe_runner"]
}
],
"workflowPresets": [
Expand All @@ -47,6 +66,20 @@
"name": "qwen3-5-moe-cuda"
}
]
},
{
"name": "qwen3-5-moe-metal",
"displayName": "Configure and build Qwen3.5 MoE runner (Metal)",
"steps": [
{
"type": "configure",
"name": "qwen3-5-moe-metal"
},
{
"type": "build",
"name": "qwen3-5-moe-metal"
}
]
}
]
}
14 changes: 12 additions & 2 deletions examples/models/qwen3_5_moe/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include <string>
#include <vector>

#ifdef EXECUTORCH_BUILD_CUDA
#include <cuda_runtime.h>
#endif

DEFINE_string(model_path, "", "Model .pte file path.");
DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend.");
Expand Down Expand Up @@ -130,7 +132,13 @@ int main(int argc, char** argv) {
uint64_t cur_token = 0;
auto prefill_start = std::chrono::steady_clock::now();

// Chunked prefill
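// In dual-method mode, run the prompt through the prefill method when it has
// two or more tokens (T>=2) and through the decode method when it has exactly
// one token (T=1), because prefill was exported with a minimum seq_len of 2.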
std::string run_method = prefill_method;
if (dual_method && num_prompt_tokens == 1) {
run_method = "decode";
}

std::vector<int64_t> pos_data(num_prompt_tokens);
for (int64_t i = 0; i < num_prompt_tokens; i++) {
pos_data[i] = i;
Expand All @@ -149,7 +157,7 @@ int main(int argc, char** argv) {
prefill_inputs.push_back(tokens_tensor);
prefill_inputs.push_back(pos_tensor);

auto prefill_result = module->execute(prefill_method, prefill_inputs);
auto prefill_result = module->execute(run_method, prefill_inputs);
if (prefill_result.error() != Error::Ok) {
ET_LOG(Error, "Prefill failed");
return 1;
Expand All @@ -171,10 +179,12 @@ int main(int argc, char** argv) {
prefill_ms,
num_prompt_tokens * 1000.0 / prefill_ms);

#ifdef EXECUTORCH_BUILD_CUDA
// Synchronize CUDA device to ensure prefill's writes to shared mutable
// buffers (KV cache, conv_state, recurrent_state) are visible to the
// decode method, which may run on a different CUDA stream.
cudaDeviceSynchronize();
#endif

if (!dual_method) {
printf("Single-method mode: skipping decode\n");
Expand Down
Loading