Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions exir/program/_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,7 @@ def __init__(
self._named_data: Optional[NamedDataStoreOutput] = named_data

backend_config = backend_config or ExecutorchBackendConfig()
self._backend_config = backend_config

# Emit methods
self._emitter_output: EmitterOutput = emit_program(
Expand Down
1 change: 1 addition & 0 deletions extension/llm/export/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ fbcode_target(_kind = runtime.python_library,
"partitioner_lib.py",
"quantize.py",
"quantizer_lib.py",
"tokenizer_delegate.py",
],
_is_external_target = True,
base_module = "executorch.extension.llm.export",
Expand Down
11 changes: 11 additions & 0 deletions extension/llm/export/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
from executorch.extension.export_util.utils import export_to_edge, save_pte_program

from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
from executorch.extension.llm.export.tokenizer_delegate import (
append_tokenizer_delegate_method,
)
from pytorch_tokenizers import get_tokenizer
from torch.export import export, ExportedProgram
from torch.nn.attention import SDPBackend
Expand Down Expand Up @@ -519,6 +522,14 @@ def to_executorch(
external_constants=external_constants_tag,
)
)
if self.tokenizer_path is not None:
append_tokenizer_delegate_method(
self.export_program,
tokenizer_path=self.tokenizer_path,
max_context_length=int(
self.metadata.get("get_max_context_len", self.max_seq_len)
),
)
logging.info(
"Required memory for activation in bytes: {}".format(
self.export_program._emitter_output.program.execution_plan[
Expand Down
8 changes: 8 additions & 0 deletions extension/llm/export/test/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,11 @@ fbcode_target(_kind = runtime.python_test,
"//caffe2:torch",
],
)

# Python test target for test_tokenizer_delegate.py; it depends on the
# export_lib target under //executorch/extension/llm/export.
fbcode_target(_kind = runtime.python_test,
name = "test_tokenizer_delegate",
srcs = ["test_tokenizer_delegate.py"],
deps = [
"//executorch/extension/llm/export:export_lib",
],
)
119 changes: 119 additions & 0 deletions extension/llm/export/test/test_tokenizer_delegate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from typing import Any
from unittest.mock import patch

from executorch.exir.scalar_type import ScalarType
from executorch.exir.schema import (
DataLocation,
DelegateCall,
Program,
String,
SubsegmentOffsets,
Tensor,
TensorShapeDynamism,
)
from executorch.extension.llm.export.tokenizer_delegate import (
append_tokenizer_delegate_method,
TOKENIZER_BACKEND_ID,
TOKENIZER_METHOD_NAME,
)


class TestTokenizerDelegate(unittest.TestCase):
    """Checks the execution plan emitted by append_tokenizer_delegate_method."""

    def _make_program_manager(self) -> Any:
        """Return a stand-in program manager wrapping an empty Program.

        Only the private attributes that the test inspects or that
        append_tokenizer_delegate_method is given are provided. ``_buffer``
        starts out non-None so the test can observe it being reset.
        """
        empty_program = Program(
            version=0,
            execution_plan=[],
            constant_buffer=[],
            backend_delegate_data=[],
            segments=[],
            constant_segment=SubsegmentOffsets(segment_index=0, offsets=[]),
        )
        fake_manager = SimpleNamespace(
            _emitter_output=SimpleNamespace(program=empty_program),
            _backend_config=object(),
            _buffer=b"stale",
        )
        for unset_attr in (
            "_data_serializer",
            "_named_data",
            "_pte_data",
            "_tensor_data",
        ):
            setattr(fake_manager, unset_attr, None)
        return fake_manager

    def test_appends_tokenizer_execution_plan(self) -> None:
        """A single tokenizer plan, delegate and data blob are appended."""
        raw_tokenizer = b"llama-stories-tokenizer-bytes"
        fake_manager = self._make_program_manager()
        serialize_target = (
            "executorch.extension.llm.export.tokenizer_delegate"
            ".serialize_for_executorch"
        )
        fake_serialized = (b"serialized", {"tensor": b"data"})

        with tempfile.TemporaryDirectory() as scratch_dir:
            model_file = Path(scratch_dir) / "tokenizer.model"
            model_file.write_bytes(raw_tokenizer)
            # Serialization is stubbed out; only the schema objects built by
            # the function under test are checked here.
            with patch(
                serialize_target, return_value=fake_serialized
            ) as fake_serialize:
                append_tokenizer_delegate_method(
                    fake_manager,
                    tokenizer_path=str(model_file),
                    max_context_length=16,
                )

        # Manager state: serialized outputs stored, cached buffer dropped.
        emitted = fake_manager._emitter_output.program
        self.assertEqual(fake_manager._pte_data, b"serialized")
        self.assertEqual(fake_manager._tensor_data, {"tensor": b"data"})
        self.assertIsNone(fake_manager._buffer)
        fake_serialize.assert_called_once()

        # Program-level additions.
        self.assertEqual(len(emitted.backend_delegate_data), 1)
        self.assertEqual(emitted.backend_delegate_data[0].data, raw_tokenizer)
        self.assertEqual(len(emitted.execution_plan), 1)

        # Plan header.
        tokenize_plan = emitted.execution_plan[0]
        self.assertEqual(tokenize_plan.name, TOKENIZER_METHOD_NAME)
        self.assertEqual(tokenize_plan.inputs, [0])
        self.assertEqual(tokenize_plan.outputs, [1])
        self.assertEqual(tokenize_plan.non_const_buffer_sizes, [0, 16 * 8])

        # Values: string input followed by the bounded token tensor.
        text_value, tokens_value = tokenize_plan.values[0], tokenize_plan.values[1]
        self.assertIsInstance(text_value.val, String)
        self.assertEqual(text_value.val.string_val, "")
        self.assertIsInstance(tokens_value.val, Tensor)
        tokens = tokens_value.val
        self.assertEqual(tokens.scalar_type, ScalarType.LONG)
        self.assertEqual(tokens.sizes, [16])
        self.assertEqual(tokens.shape_dynamism, TensorShapeDynamism.DYNAMIC_BOUND)

        # Delegate record and its compile specs.
        self.assertEqual(len(tokenize_plan.delegates), 1)
        backend = tokenize_plan.delegates[0]
        self.assertEqual(backend.id, TOKENIZER_BACKEND_ID)
        self.assertEqual(backend.processed.location, DataLocation.INLINE)
        self.assertEqual(backend.processed.index, 0)
        observed_specs = {spec.key: spec.value for spec in backend.compile_specs}
        expected_specs = {
            "max_context_length": b"16",
            "bos": b"0",
            "eos": b"0",
        }
        self.assertEqual(observed_specs, expected_specs)

        # The single chain holds one delegate call wiring input 0 to output 1.
        self.assertEqual(len(tokenize_plan.chains), 1)
        call = tokenize_plan.chains[0].instructions[0].instr_args
        self.assertIsInstance(call, DelegateCall)
        self.assertEqual(call.delegate_index, 0)
        self.assertEqual(call.args, [0, 1])


if __name__ == "__main__":
    unittest.main()
144 changes: 144 additions & 0 deletions extension/llm/export/tokenizer_delegate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

from __future__ import annotations

from pathlib import Path
from typing import Any

from executorch.exir._serialize._serialize import serialize_for_executorch
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.schema import (
AllocationDetails,
BackendDelegate,
BackendDelegateDataReference,
BackendDelegateInlineData,
Chain,
ContainerMetadata,
DataLocation,
DelegateCall,
EValue,
ExecutionPlan,
Instruction,
String,
Tensor,
TensorShapeDynamism,
)
from executorch.exir.scalar_type import ScalarType


# Backend id stamped on the BackendDelegate built by
# append_tokenizer_delegate_method.
TOKENIZER_BACKEND_ID = "TokenizerBackend"
# Default name of the execution plan added by append_tokenizer_delegate_method.
TOKENIZER_METHOD_NAME = "tokenize"


def _allocation_info(memory_id: int, memory_offset: int) -> AllocationDetails:
    """Build the AllocationDetails for a buffer placed at ``memory_offset``.

    The offset is stored as two 32-bit halves, ``memory_offset_low`` and
    ``memory_offset_high``.

    Args:
        memory_id: Value stored in ``AllocationDetails.memory_id``.
        memory_offset: Offset to split; must satisfy
            ``0 <= memory_offset < 2**64``.

    Returns:
        An AllocationDetails carrying ``memory_id`` and the split offset.

    Raises:
        ValueError: If ``memory_offset`` is negative or needs more than 64
            bits. Without this check, such values would silently produce
            halves that do not fit in 32 bits.
    """
    if not 0 <= memory_offset < (1 << 64):
        raise ValueError(
            f"memory_offset must be in [0, 2**64), got {memory_offset}"
        )
    low_mask = (1 << 32) - 1
    return AllocationDetails(
        memory_id=memory_id,
        memory_offset_low=memory_offset & low_mask,
        memory_offset_high=memory_offset >> 32,
    )


def _make_token_tensor(max_context_length: int) -> Tensor:
    """Describe the one-dimensional token output tensor.

    The tensor has ``ScalarType.LONG`` elements, a single dimension of size
    ``max_context_length``, ``DYNAMIC_BOUND`` shape dynamism, and an
    allocation at offset 0 of memory id 1.

    Args:
        max_context_length: Size of the tensor's only dimension; must be
            positive.

    Returns:
        The Tensor schema object for the token output.

    Raises:
        ValueError: If ``max_context_length`` is zero or negative.
    """
    if max_context_length <= 0:
        message = f"max_context_length must be positive, got {max_context_length}"
        raise ValueError(message)

    upper_bound_shape = [max_context_length]
    planned_slot = _allocation_info(memory_id=1, memory_offset=0)
    return Tensor(
        sizes=upper_bound_shape,
        dim_order=[0],
        scalar_type=ScalarType.LONG,
        shape_dynamism=TensorShapeDynamism.DYNAMIC_BOUND,
        allocation_info=planned_slot,
        storage_offset=0,
        data_buffer_idx=0,
        layout=0,
        requires_grad=False,
    )


def append_tokenizer_delegate_method(
    executorch_program_manager: Any,
    tokenizer_path: str,
    max_context_length: int,
    method_name: str = TOKENIZER_METHOD_NAME,
    bos: int = 0,
    eos: int = 0,
) -> None:
    """
    Add a tokenizer entry point directly to an ExecuTorch program.

    The method takes one string EValue and returns one int64 token tensor. The
    tensor is memory planned to the model's max context length and resized by
    the runtime tokenizer delegate to the actual token count.

    Argument validation and the tokenizer file read both happen before the
    program is modified, so a call rejected for bad arguments or an unreadable
    file leaves the program untouched.

    Args:
        executorch_program_manager: Object exposing ``_emitter_output``,
            ``_backend_config``, ``_data_serializer`` and ``_named_data``. Its
            ``_pte_data``, ``_tensor_data`` and ``_buffer`` attributes are
            overwritten.
        tokenizer_path: Path of the tokenizer file. Its raw bytes are embedded
            as inline delegate data.
        max_context_length: Positive upper bound on the token tensor length.
        method_name: Name of the execution plan to add.
        bos: Forwarded to the backend as the ``bos`` compile spec.
        eos: Forwarded to the backend as the ``eos`` compile spec.

    Raises:
        ValueError: If ``max_context_length`` is not positive or the program
            already has a method called ``method_name``.
        OSError: If ``tokenizer_path`` cannot be read.
    """
    # Build the output tensor first: this validates max_context_length before
    # anything is appended, so a bad value cannot leave orphaned delegate data
    # behind in the program.
    token_tensor = _make_token_tensor(max_context_length)

    program = executorch_program_manager._emitter_output.program
    if any(plan.name == method_name for plan in program.execution_plan):
        raise ValueError(f"Program already has a method named {method_name}")

    # Read the file only once the cheap checks have passed, and still before
    # any mutation.
    tokenizer_bytes = Path(tokenizer_path).read_bytes()

    delegate_data_index = len(program.backend_delegate_data)
    program.backend_delegate_data.append(
        BackendDelegateInlineData(data=tokenizer_bytes)
    )

    # Compile spec values are the decimal text of each integer, as bytes.
    delegate = BackendDelegate(
        id=TOKENIZER_BACKEND_ID,
        processed=BackendDelegateDataReference(
            location=DataLocation.INLINE,
            index=delegate_data_index,
        ),
        compile_specs=[
            CompileSpec("max_context_length", str(max_context_length).encode()),
            CompileSpec("bos", str(bos).encode()),
            CompileSpec("eos", str(eos).encode()),
        ],
    )

    # Indices into the plan's ``values`` list.
    input_id = 0
    output_id = 1
    # The token tensor holds up to max_context_length ScalarType.LONG elements
    # of 8 bytes each.
    int64_nbytes = 8
    plan = ExecutionPlan(
        name=method_name,
        values=[
            EValue(String("")),
            EValue(token_tensor),
        ],
        inputs=[input_id],
        outputs=[output_id],
        chains=[
            Chain(
                inputs=[input_id],
                outputs=[output_id],
                instructions=[
                    Instruction(
                        DelegateCall(
                            # Index into this plan's own ``delegates`` list.
                            delegate_index=0,
                            args=[input_id, output_id],
                        )
                    )
                ],
                stacktrace=None,
            )
        ],
        operators=[],
        delegates=[delegate],
        # The token tensor is allocated with memory_id=1, so slot 1 is sized
        # for it; slot 0 is left empty.
        non_const_buffer_sizes=[0, max_context_length * int64_nbytes],
        container_meta_type=ContainerMetadata("", ""),
    )
    program.execution_plan.append(plan)

    # Re-serialize so the manager's outputs include the new method.
    pte_data, tensor_data = serialize_for_executorch(
        executorch_program_manager._emitter_output,
        executorch_program_manager._backend_config,
        executorch_program_manager._data_serializer,
        executorch_program_manager._named_data,
    )
    executorch_program_manager._pte_data = pte_data
    executorch_program_manager._tensor_data = tensor_data
    # Clear the stored buffer, which predates the new method.
    executorch_program_manager._buffer = None
Loading
Loading