pytorch · JacobSzwejbka · Jun 10, 2026 · Jun 10, 2026
@@ -1882,6 +1882,7 @@ def __init__(
         self._named_data: Optional[NamedDataStoreOutput] = named_data
 
         backend_config = backend_config or ExecutorchBackendConfig()
+        self._backend_config = backend_config
 
         # Emit methods
         self._emitter_output: EmitterOutput = emit_program(

@@ -27,6 +27,7 @@ fbcode_target(_kind = runtime.python_library,
         "partitioner_lib.py",
         "quantize.py",
         "quantizer_lib.py",
+        "tokenizer_delegate.py",
     ],
     _is_external_target = True,
     base_module = "executorch.extension.llm.export",

@@ -34,6 +34,9 @@
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
 
 from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+from executorch.extension.llm.export.tokenizer_delegate import (
+    append_tokenizer_delegate_method,
+)
 from pytorch_tokenizers import get_tokenizer
 from torch.export import export, ExportedProgram
 from torch.nn.attention import SDPBackend
@@ -519,6 +522,14 @@ def to_executorch(
                 external_constants=external_constants_tag,
             )
         )
+        if self.tokenizer_path is not None:
+            append_tokenizer_delegate_method(
+                self.export_program,
+                tokenizer_path=self.tokenizer_path,
+                max_context_length=int(
+                    self.metadata.get("get_max_context_len", self.max_seq_len)
+                ),
+            )
         logging.info(
             "Required memory for activation in bytes: {}".format(
                 self.export_program._emitter_output.program.execution_plan[

@@ -17,3 +17,11 @@ fbcode_target(_kind = runtime.python_test,
         "//caffe2:torch",
     ],
 )
+
+# Python unit test target that runs test_tokenizer_delegate.py against the
+# LLM export library target (export_lib).
+fbcode_target(_kind = runtime.python_test,
+    name = "test_tokenizer_delegate",
+    srcs = ["test_tokenizer_delegate.py"],
+    deps = [
+        "//executorch/extension/llm/export:export_lib",
+    ],
+)
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import tempfile
+import unittest
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+from unittest.mock import patch
+
+from executorch.exir.scalar_type import ScalarType
+from executorch.exir.schema import (
+    DataLocation,
+    DelegateCall,
+    Program,
+    String,
+    SubsegmentOffsets,
+    Tensor,
+    TensorShapeDynamism,
+)
+from executorch.extension.llm.export.tokenizer_delegate import (
+    append_tokenizer_delegate_method,
+    TOKENIZER_BACKEND_ID,
+    TOKENIZER_METHOD_NAME,
+)
+
+
+class TestTokenizerDelegate(unittest.TestCase):
+    """Tests for append_tokenizer_delegate_method using a stubbed manager."""
+
+    def _make_program_manager(self) -> Any:
+        """Return a stand-in program manager wrapping an empty Program.
+
+        Only the private attributes that append_tokenizer_delegate_method
+        reads or writes are provided; `_buffer` starts out non-None so tests
+        can observe whether it was reset.
+        """
+        program = Program(
+            version=0,
+            execution_plan=[],
+            constant_buffer=[],
+            backend_delegate_data=[],
+            segments=[],
+            constant_segment=SubsegmentOffsets(segment_index=0, offsets=[]),
+        )
+        return SimpleNamespace(
+            _emitter_output=SimpleNamespace(program=program),
+            _backend_config=object(),
+            _data_serializer=None,
+            _named_data=None,
+            _pte_data=None,
+            _tensor_data=None,
+            _buffer=b"stale",
+        )
+
+    def test_appends_tokenizer_execution_plan(self) -> None:
+        """A successful call appends one delegate-backed plan and reserializes."""
+        tokenizer_bytes = b"llama-stories-tokenizer-bytes"
+        manager = self._make_program_manager()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tokenizer_path = Path(tmpdir) / "tokenizer.model"
+            tokenizer_path.write_bytes(tokenizer_bytes)
+            # Serialization is stubbed out: the manager here is not a real
+            # ExecutorchProgramManager, only the program mutation is under test.
+            with patch(
+                "executorch.extension.llm.export.tokenizer_delegate"
+                ".serialize_for_executorch",
+                return_value=(b"serialized", {"tensor": b"data"}),
+            ) as serialize:
+                append_tokenizer_delegate_method(
+                    manager,
+                    tokenizer_path=str(tokenizer_path),
+                    max_context_length=16,
+                )
+
+        program = manager._emitter_output.program
+        self.assertEqual(manager._pte_data, b"serialized")
+        self.assertEqual(manager._tensor_data, {"tensor": b"data"})
+        self.assertIsNone(manager._buffer)
+        serialize.assert_called_once()
+
+        self.assertEqual(len(program.backend_delegate_data), 1)
+        self.assertEqual(program.backend_delegate_data[0].data, tokenizer_bytes)
+        self.assertEqual(len(program.execution_plan), 1)
+
+        plan = program.execution_plan[0]
+        self.assertEqual(plan.name, TOKENIZER_METHOD_NAME)
+        self.assertEqual(plan.inputs, [0])
+        self.assertEqual(plan.outputs, [1])
+        self.assertEqual(plan.non_const_buffer_sizes, [0, 16 * 8])
+
+        self.assertIsInstance(plan.values[0].val, String)
+        self.assertEqual(plan.values[0].val.string_val, "")
+        self.assertIsInstance(plan.values[1].val, Tensor)
+        token_tensor = plan.values[1].val
+        self.assertEqual(token_tensor.scalar_type, ScalarType.LONG)
+        self.assertEqual(token_tensor.sizes, [16])
+        self.assertEqual(
+            token_tensor.shape_dynamism, TensorShapeDynamism.DYNAMIC_BOUND
+        )
+
+        self.assertEqual(len(plan.delegates), 1)
+        delegate = plan.delegates[0]
+        self.assertEqual(delegate.id, TOKENIZER_BACKEND_ID)
+        self.assertEqual(delegate.processed.location, DataLocation.INLINE)
+        self.assertEqual(delegate.processed.index, 0)
+        self.assertEqual(
+            {spec.key: spec.value for spec in delegate.compile_specs},
+            {
+                "max_context_length": b"16",
+                "bos": b"0",
+                "eos": b"0",
+            },
+        )
+
+        self.assertEqual(len(plan.chains), 1)
+        delegate_call = plan.chains[0].instructions[0].instr_args
+        self.assertIsInstance(delegate_call, DelegateCall)
+        self.assertEqual(delegate_call.delegate_index, 0)
+        self.assertEqual(delegate_call.args, [0, 1])
+
+    def test_rejects_duplicate_method_name(self) -> None:
+        """A second call with the same method name raises and adds nothing."""
+        manager = self._make_program_manager()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tokenizer_path = Path(tmpdir) / "tokenizer.model"
+            tokenizer_path.write_bytes(b"tokenizer")
+            with patch(
+                "executorch.extension.llm.export.tokenizer_delegate"
+                ".serialize_for_executorch",
+                return_value=(b"serialized", {}),
+            ) as serialize:
+                append_tokenizer_delegate_method(
+                    manager,
+                    tokenizer_path=str(tokenizer_path),
+                    max_context_length=16,
+                )
+                with self.assertRaises(ValueError):
+                    append_tokenizer_delegate_method(
+                        manager,
+                        tokenizer_path=str(tokenizer_path),
+                        max_context_length=16,
+                    )
+
+        program = manager._emitter_output.program
+        # Only the first call may have serialized or touched the program.
+        serialize.assert_called_once()
+        self.assertEqual(len(program.execution_plan), 1)
+        self.assertEqual(len(program.backend_delegate_data), 1)
+
+    def test_rejects_non_positive_max_context_length(self) -> None:
+        """A zero context length raises before any plan is added or serialized."""
+        manager = self._make_program_manager()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tokenizer_path = Path(tmpdir) / "tokenizer.model"
+            tokenizer_path.write_bytes(b"tokenizer")
+            with patch(
+                "executorch.extension.llm.export.tokenizer_delegate"
+                ".serialize_for_executorch",
+                return_value=(b"serialized", {}),
+            ) as serialize:
+                with self.assertRaises(ValueError):
+                    append_tokenizer_delegate_method(
+                        manager,
+                        tokenizer_path=str(tokenizer_path),
+                        max_context_length=0,
+                    )
+
+        serialize.assert_not_called()
+        self.assertEqual(manager._emitter_output.program.execution_plan, [])
+        # The cached buffer is only reset after a successful reserialization.
+        self.assertEqual(manager._buffer, b"stale")
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -0,0 +1,144 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from executorch.exir._serialize._serialize import serialize_for_executorch
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.schema import (
+    AllocationDetails,
+    BackendDelegate,
+    BackendDelegateDataReference,
+    BackendDelegateInlineData,
+    Chain,
+    ContainerMetadata,
+    DataLocation,
+    DelegateCall,
+    EValue,
+    ExecutionPlan,
+    Instruction,
+    String,
+    Tensor,
+    TensorShapeDynamism,
+)
+from executorch.exir.scalar_type import ScalarType
+
+
+TOKENIZER_BACKEND_ID = "TokenizerBackend"
+TOKENIZER_METHOD_NAME = "tokenize"
+
+
+def _allocation_info(memory_id: int, memory_offset: int) -> AllocationDetails:
+    """Build AllocationDetails with the offset split into two 32-bit halves.
+
+    The offset is stored as a low word (bits 0-31) and a high part (the
+    remaining upper bits) alongside the given memory id.
+    """
+    # divmod by 2**32 yields the same values as `>> 32` and `& 0xFFFFFFFF`.
+    offset_high, offset_low = divmod(memory_offset, 1 << 32)
+    return AllocationDetails(
+        memory_id=memory_id,
+        memory_offset_low=offset_low,
+        memory_offset_high=offset_high,
+    )
+
+
+def _make_token_tensor(max_context_length: int) -> Tensor:
+    """Describe the 1-D int64 output tensor that holds the token ids.
+
+    The tensor is sized to `max_context_length` elements, marked
+    DYNAMIC_BOUND, and placed at offset 0 of memory id 1.
+
+    Raises:
+        ValueError: if `max_context_length` is zero or negative.
+    """
+    if max_context_length <= 0:
+        message = f"max_context_length must be positive, got {max_context_length}"
+        raise ValueError(message)
+
+    planned_storage = _allocation_info(memory_id=1, memory_offset=0)
+    token_tensor = Tensor(
+        scalar_type=ScalarType.LONG,
+        storage_offset=0,
+        sizes=[max_context_length],
+        dim_order=[0],
+        requires_grad=False,
+        layout=0,
+        data_buffer_idx=0,
+        allocation_info=planned_storage,
+        shape_dynamism=TensorShapeDynamism.DYNAMIC_BOUND,
+    )
+    return token_tensor
+
+
+def append_tokenizer_delegate_method(
+    executorch_program_manager: Any,
+    tokenizer_path: str,
+    max_context_length: int,
+    method_name: str = TOKENIZER_METHOD_NAME,
+    bos: int = 0,
+    eos: int = 0,
+) -> None:
+    """
+    Add a tokenizer entry point directly to an ExecuTorch program.
+
+    The method takes one string EValue and returns one int64 token tensor. The
+    tensor is memory planned to the model's max context length and resized by
+    the runtime tokenizer delegate to the actual token count.
+
+    All argument validation happens before the program is mutated, so a
+    rejected call does not leave a stray tokenizer blob in the program.
+
+    Args:
+        executorch_program_manager: Object exposing `_emitter_output`,
+            `_backend_config`, `_data_serializer` and `_named_data`; its
+            `_pte_data`, `_tensor_data` and `_buffer` attributes are
+            overwritten.
+        tokenizer_path: Path of the tokenizer file whose raw bytes are
+            embedded as inline delegate data.
+        max_context_length: Number of int64 elements planned for the output
+            tensor; must be positive.
+        method_name: Name of the execution plan to add.
+        bos: Value stored in the "bos" compile spec.
+        eos: Value stored in the "eos" compile spec.
+
+    Raises:
+        OSError: If `tokenizer_path` cannot be read.
+        ValueError: If the program already has a method named `method_name`,
+            or if `max_context_length` is not positive.
+    """
+    tokenizer_bytes = Path(tokenizer_path).read_bytes()
+    program = executorch_program_manager._emitter_output.program
+
+    if any(plan.name == method_name for plan in program.execution_plan):
+        raise ValueError(f"Program already has a method named {method_name}")
+
+    # Build the output tensor first: this validates max_context_length before
+    # anything is appended to the program.
+    token_tensor = _make_token_tensor(max_context_length)
+
+    delegate_data_index = len(program.backend_delegate_data)
+    program.backend_delegate_data.append(
+        BackendDelegateInlineData(data=tokenizer_bytes)
+    )
+
+    delegate = BackendDelegate(
+        id=TOKENIZER_BACKEND_ID,
+        processed=BackendDelegateDataReference(
+            location=DataLocation.INLINE,
+            index=delegate_data_index,
+        ),
+        compile_specs=[
+            CompileSpec("max_context_length", str(max_context_length).encode()),
+            CompileSpec("bos", str(bos).encode()),
+            CompileSpec("eos", str(eos).encode()),
+        ],
+    )
+
+    # Indices into the plan's `values` list below.
+    input_id = 0
+    output_id = 1
+    plan = ExecutionPlan(
+        name=method_name,
+        values=[
+            EValue(String("")),
+            EValue(token_tensor),
+        ],
+        inputs=[input_id],
+        outputs=[output_id],
+        chains=[
+            Chain(
+                inputs=[input_id],
+                outputs=[output_id],
+                instructions=[
+                    Instruction(
+                        DelegateCall(
+                            # Index into this plan's own `delegates` list.
+                            delegate_index=0,
+                            args=[input_id, output_id],
+                        )
+                    )
+                ],
+                stacktrace=None,
+            )
+        ],
+        operators=[],
+        delegates=[delegate],
+        # One entry per memory id: id 0 is empty, id 1 holds the int64 tokens
+        # (8 bytes per element).
+        non_const_buffer_sizes=[0, max_context_length * 8],
+        container_meta_type=ContainerMetadata("", ""),
+    )
+    program.execution_plan.append(plan)
+
+    # Reserialize so the stored program data includes the new method, and
+    # reset `_buffer` so it no longer holds data from before the change.
+    executorch_program_manager._pte_data, executorch_program_manager._tensor_data = (
+        serialize_for_executorch(
+            executorch_program_manager._emitter_output,
+            executorch_program_manager._backend_config,
+            executorch_program_manager._data_serializer,
+            executorch_program_manager._named_data,
+        )
+    )
+    executorch_program_manager._buffer = None