diff --git a/exir/program/_program.py b/exir/program/_program.py index 950e203c86c..f49e44fbf68 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1882,6 +1882,7 @@ def __init__( self._named_data: Optional[NamedDataStoreOutput] = named_data backend_config = backend_config or ExecutorchBackendConfig() + self._backend_config = backend_config # Emit methods self._emitter_output: EmitterOutput = emit_program( diff --git a/extension/llm/export/BUCK b/extension/llm/export/BUCK index 17b3b951642..e89e8e3174a 100644 --- a/extension/llm/export/BUCK +++ b/extension/llm/export/BUCK @@ -27,6 +27,7 @@ fbcode_target(_kind = runtime.python_library, "partitioner_lib.py", "quantize.py", "quantizer_lib.py", + "tokenizer_delegate.py", ], _is_external_target = True, base_module = "executorch.extension.llm.export", diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index c25c1190990..05b54119235 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -34,6 +34,9 @@ from executorch.extension.export_util.utils import export_to_edge, save_pte_program from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes +from executorch.extension.llm.export.tokenizer_delegate import ( + append_tokenizer_delegate_method, +) from pytorch_tokenizers import get_tokenizer from torch.export import export, ExportedProgram from torch.nn.attention import SDPBackend @@ -519,6 +522,14 @@ def to_executorch( external_constants=external_constants_tag, ) ) + if self.tokenizer_path is not None: + append_tokenizer_delegate_method( + self.export_program, + tokenizer_path=self.tokenizer_path, + max_context_length=int( + self.metadata.get("get_max_context_len", self.max_seq_len) + ), + ) logging.info( "Required memory for activation in bytes: {}".format( self.export_program._emitter_output.program.execution_plan[ diff --git a/extension/llm/export/test/BUCK b/extension/llm/export/test/BUCK index 
5537a8c5f29..dfe6b28984d 100644 --- a/extension/llm/export/test/BUCK +++ b/extension/llm/export/test/BUCK @@ -17,3 +17,11 @@ fbcode_target(_kind = runtime.python_test, "//caffe2:torch", ], ) + +fbcode_target(_kind = runtime.python_test, + name = "test_tokenizer_delegate", + srcs = ["test_tokenizer_delegate.py"], + deps = [ + "//executorch/extension/llm/export:export_lib", + ], +) diff --git a/extension/llm/export/test/test_tokenizer_delegate.py b/extension/llm/export/test/test_tokenizer_delegate.py new file mode 100644 index 00000000000..8b3cc1a93dc --- /dev/null +++ b/extension/llm/export/test/test_tokenizer_delegate.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import tempfile +import unittest +from pathlib import Path +from types import SimpleNamespace +from typing import Any +from unittest.mock import patch + +from executorch.exir.scalar_type import ScalarType +from executorch.exir.schema import ( + DataLocation, + DelegateCall, + Program, + String, + SubsegmentOffsets, + Tensor, + TensorShapeDynamism, +) +from executorch.extension.llm.export.tokenizer_delegate import ( + append_tokenizer_delegate_method, + TOKENIZER_BACKEND_ID, + TOKENIZER_METHOD_NAME, +) + + +class TestTokenizerDelegate(unittest.TestCase): + def _make_program_manager(self) -> Any: + program = Program( + version=0, + execution_plan=[], + constant_buffer=[], + backend_delegate_data=[], + segments=[], + constant_segment=SubsegmentOffsets(segment_index=0, offsets=[]), + ) + return SimpleNamespace( + _emitter_output=SimpleNamespace(program=program), + _backend_config=object(), + _data_serializer=None, + _named_data=None, + _pte_data=None, + _tensor_data=None, + _buffer=b"stale", + ) + + def test_appends_tokenizer_execution_plan(self) -> None: + tokenizer_bytes = 
b"llama-stories-tokenizer-bytes" + manager = self._make_program_manager() + + with tempfile.TemporaryDirectory() as tmpdir: + tokenizer_path = Path(tmpdir) / "tokenizer.model" + tokenizer_path.write_bytes(tokenizer_bytes) + with patch( + "executorch.extension.llm.export.tokenizer_delegate" + ".serialize_for_executorch", + return_value=(b"serialized", {"tensor": b"data"}), + ) as serialize: + append_tokenizer_delegate_method( + manager, + tokenizer_path=str(tokenizer_path), + max_context_length=16, + ) + + program = manager._emitter_output.program + self.assertEqual(manager._pte_data, b"serialized") + self.assertEqual(manager._tensor_data, {"tensor": b"data"}) + self.assertIsNone(manager._buffer) + serialize.assert_called_once() + + self.assertEqual(len(program.backend_delegate_data), 1) + self.assertEqual(program.backend_delegate_data[0].data, tokenizer_bytes) + self.assertEqual(len(program.execution_plan), 1) + + plan = program.execution_plan[0] + self.assertEqual(plan.name, TOKENIZER_METHOD_NAME) + self.assertEqual(plan.inputs, [0]) + self.assertEqual(plan.outputs, [1]) + self.assertEqual(plan.non_const_buffer_sizes, [0, 16 * 8]) + + self.assertIsInstance(plan.values[0].val, String) + self.assertEqual(plan.values[0].val.string_val, "") + self.assertIsInstance(plan.values[1].val, Tensor) + token_tensor = plan.values[1].val + self.assertEqual(token_tensor.scalar_type, ScalarType.LONG) + self.assertEqual(token_tensor.sizes, [16]) + self.assertEqual( + token_tensor.shape_dynamism, TensorShapeDynamism.DYNAMIC_BOUND + ) + + self.assertEqual(len(plan.delegates), 1) + delegate = plan.delegates[0] + self.assertEqual(delegate.id, TOKENIZER_BACKEND_ID) + self.assertEqual(delegate.processed.location, DataLocation.INLINE) + self.assertEqual(delegate.processed.index, 0) + self.assertEqual( + {spec.key: spec.value for spec in delegate.compile_specs}, + { + "max_context_length": b"16", + "bos": b"0", + "eos": b"0", + }, + ) + + self.assertEqual(len(plan.chains), 1) + 
TOKENIZER_BACKEND_ID = "TokenizerBackend"
TOKENIZER_METHOD_NAME = "tokenize"

# The runtime TokenizerBackend parses the "bos"/"eos" compile specs as
# non-negative decimals that must fit in an int8_t, so larger or negative
# values would only fail when the program is loaded.
_INT8_MAX = 127
# The token tensor is declared as ScalarType.LONG (int64): 8 bytes per element.
_BYTES_PER_TOKEN = 8


def _allocation_info(memory_id: int, memory_offset: int) -> "AllocationDetails":
    """Build an ``AllocationDetails`` for ``memory_offset`` in ``memory_id``.

    The offset is stored as two fields: the low 32 bits and everything above
    them.
    """
    return AllocationDetails(
        memory_id=memory_id,
        memory_offset_low=memory_offset & ((1 << 32) - 1),
        memory_offset_high=memory_offset >> 32,
    )


def _make_token_tensor(max_context_length: int) -> "Tensor":
    """Describe the rank-1 int64 tensor that receives the token ids.

    The tensor is sized to ``max_context_length`` with ``DYNAMIC_BOUND`` shape
    dynamism and is placed at offset 0 of memory id 1.

    Raises:
        ValueError: If ``max_context_length`` is not positive.
    """
    if max_context_length <= 0:
        raise ValueError(
            f"max_context_length must be positive, got {max_context_length}"
        )
    return Tensor(
        scalar_type=ScalarType.LONG,
        storage_offset=0,
        sizes=[max_context_length],
        dim_order=[0],
        requires_grad=False,
        layout=0,
        data_buffer_idx=0,
        allocation_info=_allocation_info(memory_id=1, memory_offset=0),
        shape_dynamism=TensorShapeDynamism.DYNAMIC_BOUND,
    )


def append_tokenizer_delegate_method(
    executorch_program_manager: Any,
    tokenizer_path: str,
    max_context_length: int,
    method_name: str = TOKENIZER_METHOD_NAME,
    bos: int = 0,
    eos: int = 0,
) -> None:
    """
    Add a tokenizer entry point directly to an ExecuTorch program.

    The method takes one string EValue and returns one int64 token tensor. The
    tensor is memory planned to the model's max context length and resized by
    the runtime tokenizer delegate to the actual token count.

    All arguments are validated before the program is modified, and the
    program is restored if re-serialization fails, so a failed call leaves
    ``executorch_program_manager`` as it was.

    Args:
        executorch_program_manager: Manager whose emitted program receives the
            new method; its ``_pte_data`` / ``_tensor_data`` are re-serialized
            and its cached ``_buffer`` is cleared.
        tokenizer_path: File whose raw bytes are bundled as inline delegate
            data.
        max_context_length: Upper bound on the number of tokens produced; must
            be positive.
        method_name: Name of the execution plan to add.
        bos: Value of the "bos" compile spec; must be in ``[0, 127]``.
        eos: Value of the "eos" compile spec; must be in ``[0, 127]``.

    Raises:
        ValueError: If ``max_context_length`` is not positive, ``bos``/``eos``
            are out of range, the tokenizer file is empty, or the program
            already has a method named ``method_name``.
        OSError: If the tokenizer file cannot be read.
    """
    # Reject bad arguments up front: the runtime backend would refuse them at
    # load time, and failing here keeps the program untouched.
    if max_context_length <= 0:
        raise ValueError(
            f"max_context_length must be positive, got {max_context_length}"
        )
    for spec_name, spec_value in (("bos", bos), ("eos", eos)):
        if not 0 <= spec_value <= _INT8_MAX:
            raise ValueError(
                f"{spec_name} must be in [0, {_INT8_MAX}], got {spec_value}"
            )

    tokenizer_bytes = Path(tokenizer_path).read_bytes()
    if not tokenizer_bytes:
        # The runtime backend requires non-empty bundled tokenizer data.
        raise ValueError(f"Tokenizer file {tokenizer_path} is empty")

    program = executorch_program_manager._emitter_output.program
    if any(plan.name == method_name for plan in program.execution_plan):
        raise ValueError(f"Program already has a method named {method_name}")

    # Index the tokenizer blob will occupy once appended below.
    delegate_data_index = len(program.backend_delegate_data)
    delegate = BackendDelegate(
        id=TOKENIZER_BACKEND_ID,
        processed=BackendDelegateDataReference(
            location=DataLocation.INLINE,
            index=delegate_data_index,
        ),
        compile_specs=[
            CompileSpec("max_context_length", str(max_context_length).encode()),
            CompileSpec("bos", str(bos).encode()),
            CompileSpec("eos", str(eos).encode()),
        ],
    )

    input_id = 0
    output_id = 1
    plan = ExecutionPlan(
        name=method_name,
        values=[
            EValue(String("")),
            EValue(_make_token_tensor(max_context_length)),
        ],
        inputs=[input_id],
        outputs=[output_id],
        chains=[
            Chain(
                inputs=[input_id],
                outputs=[output_id],
                instructions=[
                    Instruction(
                        DelegateCall(
                            # Index into this plan's own `delegates` list.
                            delegate_index=0,
                            args=[input_id, output_id],
                        )
                    )
                ],
                stacktrace=None,
            )
        ],
        operators=[],
        delegates=[delegate],
        # Entry 1 backs the token tensor (memory_id=1). Entry 0 is left at 0,
        # presumably reserved by the schema -- TODO confirm.
        non_const_buffer_sizes=[0, max_context_length * _BYTES_PER_TOKEN],
        container_meta_type=ContainerMetadata("", ""),
    )

    # Everything is built; only now mutate the program.
    program.backend_delegate_data.append(
        BackendDelegateInlineData(data=tokenizer_bytes)
    )
    program.execution_plan.append(plan)
    try:
        pte_data, tensor_data = serialize_for_executorch(
            executorch_program_manager._emitter_output,
            executorch_program_manager._backend_config,
            executorch_program_manager._data_serializer,
            executorch_program_manager._named_data,
        )
    except Exception:
        # Undo the appends so the program still matches the previously
        # serialized data held by the manager.
        program.execution_plan.pop()
        program.backend_delegate_data.pop()
        raise

    executorch_program_manager._pte_data = pte_data
    executorch_program_manager._tensor_data = tensor_data
    # Drop the cached buffer; it was produced from the old program.
    executorch_program_manager._buffer = None
std::move(special_tokens), + bos_token_index, + eos_token_index); + } else { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(); + } + if (tiktoken_tokenizer->load_from_buffer(data, size) == + ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded TikToken tokenizer from buffer"); + return tiktoken_tokenizer; + } + + auto sp_tokenizer = std::make_unique<::tokenizers::SPTokenizer>(); + if (sp_tokenizer->load_from_buffer(data, size) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded Sentencepiece tokenizer from buffer"); + return sp_tokenizer; + } + + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); + if (bpe_tokenizer->load_from_buffer(data, size) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded BPE tokenizer from buffer"); + return bpe_tokenizer; + } + + return nullptr; +} + +namespace { + +constexpr const char* kTokenizerBackendId = "TokenizerBackend"; +constexpr const char* kMaxContextLengthSpec = "max_context_length"; +constexpr const char* kBosSpec = "bos"; +constexpr const char* kEosSpec = "eos"; + +struct TokenizerDelegateHandle final { + std::unique_ptr<::tokenizers::Tokenizer> tokenizer; + size_t max_context_length = 0; + int8_t bos = 0; + int8_t eos = 0; +}; + +Error parse_size_compile_spec( + executorch::runtime::ArrayRef + compile_specs, + const char* key, + bool required, + size_t* out) { + for (size_t i = 0; i < compile_specs.size(); ++i) { + const auto& spec = compile_specs[i]; + if (std::strcmp(spec.key, key) != 0) { + continue; + } + std::string value( + static_cast(spec.value.buffer), spec.value.nbytes); + errno = 0; + char* end = nullptr; + const unsigned long long parsed = std::strtoull(value.c_str(), &end, 10); + ET_CHECK_OR_RETURN_ERROR( + !value.empty() && value[0] != '-' && errno != ERANGE && + end == value.c_str() + value.size(), + InvalidProgram, + "Invalid TokenizerBackend compile spec %s=%s", + key, + value.c_str()); + ET_CHECK_OR_RETURN_ERROR( + parsed <= std::numeric_limits::max(), + 
InvalidProgram, + "TokenizerBackend compile spec %s is too large", + key); + *out = static_cast(parsed); + return Error::Ok; + } + ET_CHECK_OR_RETURN_ERROR( + !required, + InvalidProgram, + "Missing TokenizerBackend compile spec %s", + key); + return Error::Ok; +} + +Error parse_i8_compile_spec( + executorch::runtime::ArrayRef + compile_specs, + const char* key, + int8_t* out) { + size_t parsed = static_cast(*out); + ET_CHECK_OK_OR_RETURN_ERROR( + parse_size_compile_spec(compile_specs, key, /*required=*/false, &parsed)); + ET_CHECK_OR_RETURN_ERROR( + parsed <= static_cast(std::numeric_limits::max()), + InvalidProgram, + "TokenizerBackend compile spec %s is too large for int8_t", + key); + *out = static_cast(parsed); + return Error::Ok; +} + +class TokenizerBackend final : public executorch::runtime::BackendInterface { + public: + bool is_available() const override { + return true; + } + + executorch::runtime::Result init( + executorch::runtime::BackendInitContext& context, + executorch::runtime::FreeableBuffer* processed, + executorch::runtime::ArrayRef + compile_specs) const override { + ET_CHECK_OR_RETURN_ERROR( + processed != nullptr && processed->data() != nullptr && + processed->size() > 0, + InvalidProgram, + "TokenizerBackend requires non-empty bundled tokenizer data"); + + size_t max_context_length = 0; + ET_CHECK_OK_OR_RETURN_ERROR(parse_size_compile_spec( + compile_specs, + kMaxContextLengthSpec, + /*required=*/true, + &max_context_length)); + ET_CHECK_OR_RETURN_ERROR( + max_context_length > 0, + InvalidProgram, + "TokenizerBackend max_context_length must be positive"); + + int8_t bos = 0; + int8_t eos = 0; + ET_CHECK_OK_OR_RETURN_ERROR( + parse_i8_compile_spec(compile_specs, kBosSpec, &bos)); + ET_CHECK_OK_OR_RETURN_ERROR( + parse_i8_compile_spec(compile_specs, kEosSpec, &eos)); + + auto* handle = context.get_runtime_allocator() + ->allocateInstance(); + ET_CHECK_OR_RETURN_ERROR( + handle != nullptr, + MemoryAllocationFailed, + "Failed to allocate 
TokenizerBackend handle"); + new (handle) TokenizerDelegateHandle(); + handle->max_context_length = max_context_length; + handle->bos = bos; + handle->eos = eos; + handle->tokenizer = load_tokenizer_from_buffer( + processed->data(), processed->size()); + if (handle->tokenizer == nullptr) { + handle->~TokenizerDelegateHandle(); + ET_LOG(Error, "Failed to load bundled tokenizer"); + return Error::InvalidProgram; + } + return reinterpret_cast(handle); + } + + Error execute( + executorch::runtime::BackendExecutionContext&, + executorch::runtime::DelegateHandle* handle, + executorch::runtime::Span args) + const override { + ET_CHECK_OR_RETURN_ERROR( + handle != nullptr, + DelegateInvalidHandle, + "TokenizerBackend handle is null"); + ET_CHECK_OR_RETURN_ERROR( + args.size() == 2, + InvalidProgram, + "TokenizerBackend expects 2 arguments, got %zu", + args.size()); + + auto* tokenizer_handle = + reinterpret_cast(handle); + auto* input = args[0]; + auto* output_value = args[1]; + ET_CHECK_OR_RETURN_ERROR( + input != nullptr && input->isString(), + InvalidArgument, + "TokenizerBackend input must be a string"); + ET_CHECK_OR_RETURN_ERROR( + output_value != nullptr && output_value->isTensor(), + InvalidArgument, + "TokenizerBackend output must be a tensor"); + + const std::string_view prompt = input->toString(); + auto tokens_result = tokenizer_handle->tokenizer->encode( + std::string(prompt), tokenizer_handle->bos, tokenizer_handle->eos); + if (!tokens_result.ok()) { + ET_LOG(Error, "Bundled tokenizer failed to encode input"); + return Error::InvalidArgument; + } + const auto& tokens = tokens_result.get(); + ET_CHECK_OR_RETURN_ERROR( + tokens.size() <= tokenizer_handle->max_context_length, + InvalidArgument, + "Tokenizer output length %zu exceeds max context length %zu", + tokens.size(), + tokenizer_handle->max_context_length); + + auto& output = output_value->toTensor(); + ET_CHECK_OR_RETURN_ERROR( + output.scalar_type() == executorch::aten::ScalarType::Long, + 
InvalidArgument, + "TokenizerBackend output tensor must be int64"); + ET_CHECK_OR_RETURN_ERROR( + output.dim() == 1, + InvalidArgument, + "TokenizerBackend output tensor must be rank 1"); + executorch::aten::SizesType output_size = + static_cast(tokens.size()); + ET_CHECK_OK_OR_RETURN_ERROR( + executorch::runtime::resize_tensor( + output, + executorch::aten::ArrayRef( + &output_size, 1))); + auto* output_data = output.mutable_data_ptr(); + for (size_t i = 0; i < tokens.size(); ++i) { + output_data[i] = static_cast(tokens[i]); + } + return Error::Ok; + } + + void destroy(executorch::runtime::DelegateHandle* handle) const override { + if (handle != nullptr) { + reinterpret_cast(handle) + ->~TokenizerDelegateHandle(); + } + } +}; + +TokenizerBackend tokenizer_backend; +executorch::runtime::Backend tokenizer_backend_registration{ + kTokenizerBackendId, + &tokenizer_backend}; +static auto tokenizer_backend_registration_status = + executorch::runtime::register_backend(tokenizer_backend_registration); + +} // namespace + ::executorch::runtime::Result> get_llm_metadata(tokenizers::Tokenizer* tokenizer, Module* module) { // Initialize metadata with default values diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h index b4c7c59806d..697f6162171 100644 --- a/extension/llm/runner/llm_runner_helper.h +++ b/extension/llm/runner/llm_runner_helper.h @@ -51,6 +51,21 @@ ET_EXPERIMENTAL std::unique_ptr load_tokenizer( size_t bos_token_index = 0, size_t eos_token_index = 1); +/** + * @brief Loads a tokenizer from an in-memory model buffer + * + * This mirrors load_tokenizer(), but consumes bytes bundled inside an + * ExecuTorch program, such as tokenizer delegate inline data. 
+ */ +ET_EXPERIMENTAL std::unique_ptr +load_tokenizer_from_buffer( + const void* data, + size_t size, + std::unique_ptr> special_tokens = nullptr, + std::optional pattern = std::nullopt, + size_t bos_token_index = 0, + size_t eos_token_index = 1); + /** * @brief Gets LLM metadata from the model and tokenizer * diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp index 9afaab0b97e..49d6d67f4c0 100644 --- a/extension/llm/runner/multimodal_runner.cpp +++ b/extension/llm/runner/multimodal_runner.cpp @@ -12,8 +12,8 @@ #include #include #include -#include #include +#include #ifdef CUDA_AVAILABLE #include diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index 160b254460a..94a186e3065 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -15,9 +15,9 @@ #include #include #include +#include #include #include -#include #include namespace executorch::extension::llm { diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index b642403834a..95eb49cec72 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a +Subproject commit 95eb49cec721b23e40b96ca2560225b169c5ffe4 diff --git a/extension/wasm/tokenizers/tokenizers.cpp b/extension/wasm/tokenizers/tokenizers.cpp index b1558464f20..a015921955b 100644 --- a/extension/wasm/tokenizers/tokenizers.cpp +++ b/extension/wasm/tokenizers/tokenizers.cpp @@ -9,9 +9,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 1610804586d..58c71fdfc45 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -1290,14 +1290,9 @@ Method::set_input(const EValue& input_evalue, size_t input_idx) { lhs, rhs); } else if (e.isString()) { - ET_CHECK_OR_RETURN_ERROR( - e.toString() == 
input_evalue.toString(), - InvalidArgument, - "The %" ET_PRIsize_t - "-th input of method should have the same value as the input_evalue, but get %s and %s", - input_idx, - e.toString().data(), - input_evalue.toString().data()); + // Strings are runtime inputs. The EValue references caller-owned storage, + // which must outlive this execution, matching non-memory-planned tensors. + mutable_value(get_input_index(input_idx)) = input_evalue; } else { #if ET_LOG_ENABLED std::array tag_name;