Skip to content

Commit 4e35b60

Browse files
author
Github Executorch
committed
Support multimethod in export_llama_lib
Pull Request resolved: #17231

Note: multimethod export is currently limited to:
- the XNNPACK backend or the portable ops library
- LoRA only (arbitrary nn.Modules per method are not supported)
- if quantization is enabled, all LoRA models must share the same quantization scheme at source-transformation time
- no PT2E quantization, because each model could produce slightly different results after calibration

Changes:
1. Add MultimethodLoraConfig to the yaml config.
2. Deep-copy the yaml config and move the method's lora_config into base.lora_config.
3. Create and export the model.
4. Repeat steps 2 and 3 for each method.
5. Pass a dict of {method_name: exported program} to `to_edge_transform_and_lower`.

ghstack-source-id: 341235800
@exported-using-ghexport

Differential Revision: [D92315602](https://our.internmc.facebook.com/intern/diff/D92315602/)
1 parent 1c4e293 commit 4e35b60

7 files changed

Lines changed: 331 additions & 22 deletions

File tree

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
# shellcheck source=/dev/null
10+
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
11+
12+
# Remove any previous cmake-out directory, then run the `llm-release` CMake
# workflow to build and install the ExecuTorch libraries.
cmake_install_executorch_libraries() {
  printf '%s\n' "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
  rm -rf cmake-out
  cmake --workflow llm-release
}
17+
18+
# Initialize the tokenizers submodule from inside its directory, then build the
# llama runner with `make llama-cpu` from the original working directory.
cmake_build_llama_runner() {
  printf '%s\n' "Building llama runner"
  pushd extension/llm/tokenizers
  printf '%s\n' "Updating tokenizers submodule"
  git submodule update --init
  popd
  make llama-cpu
}
26+
27+
# Delete the downloaded model/adapter directories and the generated .pte and
# result files from the current directory.
cleanup_files() {
  echo "Deleting downloaded and generated files"
  # ${VAR:?} aborts with an error instead of expanding to "/" when the path
  # variable is unset or empty, so this can never become `rm -rf "/"`.
  rm -rf "${HF_QWEN_PATH:?}/"
  rm -rf "${HF_ADAPTER_PATH:?}/"
  # The "./" prefix keeps file names starting with "-" from being parsed as
  # rm options.
  rm -rf ./*.pte
  rm -f ./result*.txt
}
34+
35+
# Download LoRA adapter.
# Use the same interpreter as the export step so huggingface_hub is installed
# into (and imported from) the environment that actually runs the export.
"${PYTHON_EXECUTABLE}" -m pip install -q huggingface_hub
HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
HF_ADAPTER_PATH=$(
  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
    --model_id "${HF_ADAPTER_REPO}" \
    --files "adapter_config.json" "adapter_model.safetensors"
)

# Download base model (for tokenizer path).
HF_QWEN_PATH=$("${PYTHON_EXECUTABLE}" -c "from huggingface_hub import snapshot_download; print(snapshot_download('unsloth/Qwen3-0.6B'))")
echo "Model downloaded to: $HF_QWEN_PATH"
47+
48+
### EXPORT MULTIMETHOD PTE ###
# Set environment variables for OmegaConf interpolation in yaml.
export LORA_ADAPTER_CHECKPOINT="${HF_ADAPTER_PATH}/adapter_model.safetensors"
export LORA_ADAPTER_CONFIG="${HF_ADAPTER_PATH}/adapter_config.json"

# Quote the interpreter path so it is not word-split or glob-expanded.
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_multimethod.yaml

### BUILD LLAMA RUNNER ###
cmake_install_executorch_libraries
cmake_build_llama_runner
59+
60+
# Runner constants.
# Kept as an array so each argument is passed as exactly one word, even if the
# tokenizer path contains spaces.
RUNTIME_ARGS=(
  "--tokenizer_path=${HF_QWEN_PATH}/"
  "--temperature=0"
  "--seq_len=100"
  "--warmup=1"
)
PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"

# Expected outputs.
EXPECTED_LORA_PREFIX="
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80"

EXPECTED_BASE_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant:
<think>
Okay, so I need to calculate 15% of 80."

# Run one method of multimethod_qwen.pte through llama_main and check that the
# captured output starts with the expected prefix.
#   $1: test number used in log messages
#   $2: method name passed to --method_name
#   $3: expected output prefix
#   $4: file that receives the runner's stdout
# On mismatch, prints the expected and actual output, cleans up, and exits 1.
run_method_test() {
  local test_id="$1"
  local method_name="$2"
  local expected_prefix="$3"
  local result_file="$4"
  local now result

  now=$(date +"%H:%M:%S")
  echo "Test ${test_id}: Multimethod ${method_name}. Starting at ${now}"
  cmake-out/examples/models/llama/llama_main \
    --model_path=multimethod_qwen.pte \
    --method_name="${method_name}" \
    --prompt="${PROMPT}" \
    "${RUNTIME_ARGS[@]}" > "${result_file}"
  now=$(date +"%H:%M:%S")
  echo "Finished at ${now}"

  result=$(<"${result_file}")
  if [[ "${result}" == "${expected_prefix}"* ]]; then
    echo "Test ${test_id} (${method_name}): Success"
  else
    echo "Test ${test_id} (${method_name}): Failure"
    echo "Expected result prefix: ${expected_prefix}"
    echo "Actual result: ${result}"
    cleanup_files
    exit 1
  fi
}

### TEST 1: Run lora_forward method ###
run_method_test 1 lora_forward "${EXPECTED_LORA_PREFIX}" result_lora.txt

### TEST 2: Run base_forward method ###
run_method_test 2 base_forward "${EXPECTED_BASE_PREFIX}" result_base.txt

echo "Multimethod tests passed!"
cleanup_files

.github/workflows/pull.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,33 @@ jobs:
826826
# run llama runner in eager mode
827827
PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora.sh
828828
829+
test-lora-multimethod-linux:
830+
name: test-lora-multimethod-linux
831+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
832+
permissions:
833+
id-token: write
834+
contents: read
835+
strategy:
836+
fail-fast: false
837+
with:
838+
runner: linux.24xlarge
839+
docker-image: ci-image:executorch-ubuntu-22.04-clang12
840+
submodules: 'recursive'
841+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
842+
timeout: 90
843+
script: |
844+
# The generic Linux job chooses to use base env, not the one setup by the image
845+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
846+
conda activate "${CONDA_ENV}"
847+
848+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
849+
850+
# Install llama requirements
851+
bash examples/models/llama/install_requirements.sh
852+
853+
# Export the multimethod LoRA .pte and run each method with the llama runner
854+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora_multimethod.sh
855+
829856
test-mediatek-models-linux:
830857
name: test-mediatek-models-linux
831858
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

examples/models/llama/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ fbcode_target(_kind = runtime.python_library,
148148
fbcode_target(_kind = runtime.python_library,
149149
name = "export_library",
150150
srcs = [
151+
"convert_weights.py",
151152
"export_llama.py",
152153
"export_llama_lib.py",
153154
"model.py",

examples/models/llama/export_llama_lib.py

Lines changed: 149 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,19 @@
1616
import re
1717
import shlex
1818
from functools import partial
19-
2019
from importlib import resources as _resources
2120
from json import JSONDecodeError
2221
from pathlib import Path
23-
from typing import Callable, List, Optional, Union
22+
from typing import Callable, Dict, List, Optional, Union
2423

2524
import torch
26-
2725
from executorch.devtools.backend_debug import print_delegation_info
2826
from executorch.devtools.etrecord import generate_etrecord as generate_etrecord_func
2927
from executorch.examples.models.llama.hf_download import (
3028
download_and_convert_hf_checkpoint,
3129
)
30+
from executorch.exir import to_edge_transform_and_lower
31+
from executorch.exir.backend.partitioner import Partitioner
3232
from executorch.exir.passes.init_mutable_pass import InitializedMutableBufferPass
3333
from executorch.extension.llm.export.builder import DType, LLMEdgeManager
3434
from executorch.extension.llm.export.config.llm_config import LlmConfig
@@ -52,6 +52,7 @@
5252
)
5353
from executorch.util.activation_memory_profiler import generate_memory_trace
5454
from omegaconf import DictConfig
55+
from torch.export import ExportedProgram
5556

5657
from ..model_factory import EagerModelFactory
5758
from .source_transformation.apply_spin_quant_r1_r2 import (
@@ -852,6 +853,28 @@ def _validate_args(llm_config):
852853
"Shared embedding is only supported with torchao quantization."
853854
)
854855

856+
if llm_config.multimethod_lora.enabled:
    # The multimethod export path assigns each method's LoRA config to
    # base.lora_config, so a user-supplied base.lora_config is rejected.
    if llm_config.base.lora_config is not None:
        raise ValueError(
            "Cannot use both base.lora_config and multimethod_lora.methods. "
            "Use multimethod_lora.methods for all LoRA variants."
        )
    if llm_config.quantization.pt2e_quantize is not None:
        raise ValueError(
            "PT2E quantization is not supported with multimethod_lora export."
        )
    if (
        llm_config.backend.coreml.enabled
        or llm_config.backend.vulkan.enabled
        or llm_config.backend.qnn.enabled
        or llm_config.backend.mps.enabled
        or llm_config.backend.openvino.enabled
    ):
        # Adjacent literals are concatenated verbatim, so the first one must
        # end with ". " to keep the two sentences separated.
        raise ValueError(
            "multimethod_lora export only supports XNNPACK backend or portable ops. "
            "Please disable other backends (coreml, vulkan, qnn, mps, openvino)."
        )
877+
855878

856879
def _to_edge_and_lower_llama_xnnpack(
857880
builder_exported,
@@ -946,7 +969,6 @@ def _to_edge_and_lower_llama_tosa(
946969
tosa_spec,
947970
verbose: bool = False,
948971
) -> LLMEdgeManager:
949-
950972
logging.info("Lowering model using TOSA partitioner")
951973

952974
partitioners = []
@@ -1141,9 +1163,126 @@ def _to_edge_and_lower_llama( # noqa: C901
11411163
return builder
11421164

11431165

1166+
def _get_xnnpack_partitioners(llm_config: LlmConfig) -> Optional[List[Partitioner]]:
    """Build the XNNPACK partitioner list for multimethod_lora export.

    Args:
        llm_config: Export configuration; only ``backend.xnnpack`` is read.

    Returns:
        The selected partitioners, or ``None`` when none were selected.
    """
    xnnpack_config = llm_config.backend.xnnpack
    if not xnnpack_config.enabled:
        return None

    # The dynamic-quant-only partitioner always comes first; the general
    # partitioner is appended only when extended ops are requested.
    selected = [get_xnnpack_partitioner(dynamic_quant_only_partitioner=True)]
    if xnnpack_config.extended_ops:
        selected.append(get_xnnpack_partitioner(dynamic_quant_only_partitioner=False))
    return selected
1180+
1181+
1182+
def _get_output_filename(
    llm_config: LlmConfig, modelname: str, output_dir: str, dtype: DType
) -> str:
    """Return the path of the .pte file to write.

    An explicit ``llm_config.export.output_name`` takes precedence: it is
    returned unchanged when it already ends in ".pte", otherwise it is placed
    under ``output_dir`` with ".pte" appended. Without an explicit name the
    result is ``<output_dir>/<modelname>.pte``, where "_h" is appended to the
    model name for fp16.
    """
    requested = llm_config.export.output_name
    if requested:
        if requested.endswith(".pte"):
            return requested
        return f"{output_dir}/{requested}.pte"

    stem = f"{modelname}_h" if dtype == DType.fp16 else modelname
    return f"{output_dir}/{stem}.pte"
1197+
1198+
1199+
def _export_llama_multimethod(llm_config: LlmConfig) -> LLMEdgeManager:
    """
    Export multiple methods (base + LoRA variants) to a single .pte file.

    For each method in llm_config.multimethod_lora.methods:
    - If LoraConfig is None: use base model
    - If LoraConfig is provided: create model with LoRA weights

    Limitations:
    - Only the XNNPACK backend or portable ops are supported for
      multimethod_lora export.
    - PT2E quantization is not supported.
    - Each method is exported separately; export time scales linearly
      with the number of methods.
    - The final .pte file deduplicates shared weights automatically.

    Args:
        llm_config: Export configuration. ``multimethod_lora.methods`` maps
            each method name to its LoRA config.

    Returns:
        The builder created for the first method, after the combined edge
        program has been attached to it, converted to ExecuTorch, and saved.

    Raises:
        ValueError: If ``llm_config.multimethod_lora.methods`` is empty.
    """
    num_methods = len(llm_config.multimethod_lora.methods)
    if num_methods == 0:
        # Explicit check rather than `assert`, which is stripped under
        # `python -O` and would surface later as an AttributeError on None.
        raise ValueError("No methods to export")

    logging.info(
        f"multimethod_lora export: exporting {num_methods} method(s). "
        "Each method requires separate model instantiation and export."
    )

    additional_passes = []
    if llm_config.base.model_class.value in TORCHTUNE_DEFINED_MODELS:
        additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]

    # Build dict of exported programs
    method_to_program: Dict[str, ExportedProgram] = {}
    first_builder = None

    for method_name, lora_config in llm_config.multimethod_lora.methods.items():
        logging.info(f"Exporting method: {method_name}")

        # Work on a deep copy so the caller's config is never mutated.
        method_config = copy.deepcopy(llm_config)
        method_config.base.lora_config = lora_config
        # Disable multimethod_lora to avoid infinite recursion
        method_config.multimethod_lora.methods = {}

        # Load and prepare model for this method
        builder = _prepare_for_llama_export(method_config)
        builder = builder.export()
        builder.run_canonical_optimizations()

        # Get the exported program
        exported_program = builder._export(builder.pre_autograd_graph_module)
        method_to_program[method_name] = exported_program

        # The first builder supplies the edge config, metadata, and output
        # naming for the combined program.
        if first_builder is None:
            first_builder = builder

    # Get partitioners based on backend config
    partitioners = _get_xnnpack_partitioners(llm_config)

    # Lower all methods together using multimethod_lora API
    edge_config = first_builder._get_edge_config()
    edge_manager = to_edge_transform_and_lower(
        method_to_program,
        partitioner=partitioners,
        compile_config=edge_config,
        constant_methods=first_builder.metadata,
        generate_etrecord=llm_config.debug.generate_etrecord,
    )

    # Convert to executorch and save
    first_builder.edge_manager = edge_manager
    first_builder = first_builder.to_executorch(passes=additional_passes)

    output_file = _get_output_filename(
        llm_config,
        first_builder.modelname,
        first_builder.output_dir,
        first_builder.dtype,
    )
    first_builder.save_to_pte(output_file)

    return first_builder
1277+
1278+
11441279
def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
11451280
_validate_args(llm_config)
11461281

1282+
# Check for multimethod_lora export
1283+
if llm_config.multimethod_lora.enabled:
1284+
return _export_llama_multimethod(llm_config)
1285+
11471286
pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(
11481287
llm_config
11491288
)
@@ -1247,23 +1386,12 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
12471386
if llm_config.debug.profile_memory:
12481387
generate_memory_trace(builder.export_program, "memory_profile.json")
12491388

1250-
if builder.dtype == DType.fp16:
1251-
modelname = f"{modelname}_h"
1252-
1253-
if llm_config.export.output_name:
1254-
modelname = llm_config.export.output_name
1255-
if modelname.endswith(".pte"):
1256-
output_file = modelname
1257-
modelname = modelname[:-4]
1258-
print(f"modelname: {modelname}")
1259-
print(f"output_file: {output_file}")
1260-
else:
1261-
output_file = f"{builder.output_dir}/{modelname}.pte"
1262-
print(f"modelname: {modelname}")
1263-
print(f"output_file: {output_file}")
1264-
else:
1265-
output_file = f"{builder.output_dir}/{modelname}.pte"
1266-
1389+
output_file = _get_output_filename(
1390+
llm_config,
1391+
modelname,
1392+
builder.output_dir,
1393+
builder.dtype,
1394+
)
12671395
builder.save_to_pte(output_file)
12681396
return builder
12691397

examples/models/llama/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def define_common_targets():
4747
"//executorch/examples/models/llama/tokenizer:tiktoken",
4848
"//pytorch/tokenizers:llama2c_tokenizer",
4949
"//pytorch/tokenizers:hf_tokenizer",
50+
"//pytorch/tokenizers:regex_lookahead",
5051
] + (_get_operator_lib(aten)) + ([
5152
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
5253
# Therefore enable it explicitly for now to avoid failing tests

0 commit comments

Comments
 (0)