diff --git a/exir/program/_program.py b/exir/program/_program.py
index 950e203c86c..f49e44fbf68 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -1882,6 +1882,7 @@ def __init__(
         self._named_data: Optional[NamedDataStoreOutput] = named_data
 
         backend_config = backend_config or ExecutorchBackendConfig()
+        self._backend_config = backend_config
 
         # Emit methods
         self._emitter_output: EmitterOutput = emit_program(
diff --git a/extension/llm/export/BUCK b/extension/llm/export/BUCK
index 17b3b951642..e89e8e3174a 100644
--- a/extension/llm/export/BUCK
+++ b/extension/llm/export/BUCK
@@ -27,6 +27,7 @@ fbcode_target(_kind = runtime.python_library,
         "partitioner_lib.py",
         "quantize.py",
         "quantizer_lib.py",
+        "tokenizer_delegate.py",
     ],
     _is_external_target = True,
     base_module = "executorch.extension.llm.export",
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index c25c1190990..05b54119235 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -34,6 +34,9 @@
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
 
 from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+from executorch.extension.llm.export.tokenizer_delegate import (
+    append_tokenizer_delegate_method,
+)
 from pytorch_tokenizers import get_tokenizer
 from torch.export import export, ExportedProgram
 from torch.nn.attention import SDPBackend
@@ -519,6 +522,14 @@ def to_executorch(
                 external_constants=external_constants_tag,
             )
         )
+        if self.tokenizer_path is not None:
+            append_tokenizer_delegate_method(
+                self.export_program,
+                tokenizer_path=self.tokenizer_path,
+                max_context_length=int(
+                    self.metadata.get("get_max_context_len", self.max_seq_len)
+                ),
+            )
         logging.info(
             "Required memory for activation in bytes: {}".format(
                 self.export_program._emitter_output.program.execution_plan[
diff --git a/extension/llm/export/test/BUCK b/extension/llm/export/test/BUCK
index 5537a8c5f29..dfe6b28984d 100644
--- a/extension/llm/export/test/BUCK
+++ b/extension/llm/export/test/BUCK
@@ -17,3 +17,11 @@ fbcode_target(_kind = runtime.python_test,
         "//caffe2:torch",
     ],
 )
+
+fbcode_target(_kind = runtime.python_test,
+    name = "test_tokenizer_delegate",
+    srcs = ["test_tokenizer_delegate.py"],
+    deps = [
+        "//executorch/extension/llm/export:export_lib",
+    ],
+)
diff --git a/extension/llm/export/test/test_tokenizer_delegate.py b/extension/llm/export/test/test_tokenizer_delegate.py
new file mode 100644
index 00000000000..8b3cc1a93dc
--- /dev/null
+++ b/extension/llm/export/test/test_tokenizer_delegate.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import tempfile
+import unittest
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+from unittest.mock import patch
+
+from executorch.exir.scalar_type import ScalarType
+from executorch.exir.schema import (
+    DataLocation,
+    DelegateCall,
+    Program,
+    String,
+    SubsegmentOffsets,
+    Tensor,
+    TensorShapeDynamism,
+)
+from executorch.extension.llm.export.tokenizer_delegate import (
+    append_tokenizer_delegate_method,
+    TOKENIZER_BACKEND_ID,
+    TOKENIZER_METHOD_NAME,
+)
+
+
+class TestTokenizerDelegate(unittest.TestCase):
+    def _make_program_manager(self) -> Any:
+        program = Program(
+            version=0,
+            execution_plan=[],
+            constant_buffer=[],
+            backend_delegate_data=[],
+            segments=[],
+            constant_segment=SubsegmentOffsets(segment_index=0, offsets=[]),
+        )
+        return SimpleNamespace(  # duck-typed stand-in for the program manager
+            _emitter_output=SimpleNamespace(program=program),
+            _backend_config=object(),
+            _data_serializer=None,
+            _named_data=None,
+            _pte_data=None,
+            _tensor_data=None,
+            _buffer=b"stale",  # asserted below to be reset to None
+        )
+
+    def test_appends_tokenizer_execution_plan(self) -> None:
+        tokenizer_bytes = b"llama-stories-tokenizer-bytes"
+        manager = self._make_program_manager()
+        # Write a fake tokenizer file and stub out the real serializer.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tokenizer_path = Path(tmpdir) / "tokenizer.model"
+            tokenizer_path.write_bytes(tokenizer_bytes)
+            with patch(
+                "executorch.extension.llm.export.tokenizer_delegate"
+                ".serialize_for_executorch",
+                return_value=(b"serialized", {"tensor": b"data"}),
+            ) as serialize:
+                append_tokenizer_delegate_method(
+                    manager,
+                    tokenizer_path=str(tokenizer_path),
+                    max_context_length=16,
+                )
+        # The serializer outputs are stored and the stale buffer is cleared.
+        program = manager._emitter_output.program
+        self.assertEqual(manager._pte_data, b"serialized")
+        self.assertEqual(manager._tensor_data, {"tensor": b"data"})
+        self.assertIsNone(manager._buffer)
+        serialize.assert_called_once()
+        # The tokenizer file bytes become the program's only delegate blob.
+        self.assertEqual(len(program.backend_delegate_data), 1)
+        self.assertEqual(program.backend_delegate_data[0].data, tokenizer_bytes)
+        self.assertEqual(len(program.execution_plan), 1)
+
+        plan = program.execution_plan[0]
+        self.assertEqual(plan.name, TOKENIZER_METHOD_NAME)
+        self.assertEqual(plan.inputs, [0])
+        self.assertEqual(plan.outputs, [1])
+        self.assertEqual(plan.non_const_buffer_sizes, [0, 16 * 8])  # 8 B / int64
+        # Value 0 is the string input, value 1 the bounded int64 token tensor.
+        self.assertIsInstance(plan.values[0].val, String)
+        self.assertEqual(plan.values[0].val.string_val, "")
+        self.assertIsInstance(plan.values[1].val, Tensor)
+        token_tensor = plan.values[1].val
+        self.assertEqual(token_tensor.scalar_type, ScalarType.LONG)
+        self.assertEqual(token_tensor.sizes, [16])
+        self.assertEqual(
+            token_tensor.shape_dynamism, TensorShapeDynamism.DYNAMIC_BOUND
+        )
+        # The delegate references blob 0 inline and carries byte-encoded specs.
+        self.assertEqual(len(plan.delegates), 1)
+        delegate = plan.delegates[0]
+        self.assertEqual(delegate.id, TOKENIZER_BACKEND_ID)
+        self.assertEqual(delegate.processed.location, DataLocation.INLINE)
+        self.assertEqual(delegate.processed.index, 0)
+        self.assertEqual(
+            {spec.key: spec.value for spec in delegate.compile_specs},
+            {
+                "max_context_length": b"16",
+                "bos": b"0",
+                "eos": b"0",
+            },
+        )
+        # A single DelegateCall passes value 0 (input) and value 1 (output).
+        self.assertEqual(len(plan.chains), 1)
+        delegate_call = plan.chains[0].instructions[0].instr_args
+        self.assertIsInstance(delegate_call, DelegateCall)
+        self.assertEqual(delegate_call.delegate_index, 0)
+        self.assertEqual(delegate_call.args, [0, 1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/extension/llm/export/tokenizer_delegate.py b/extension/llm/export/tokenizer_delegate.py
new file mode 100644
index 00000000000..bb4e881fc0a
--- /dev/null
+++ b/extension/llm/export/tokenizer_delegate.py
@@ -0,0 +1,144 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from executorch.exir._serialize._serialize import serialize_for_executorch
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.schema import (
+    AllocationDetails,
+    BackendDelegate,
+    BackendDelegateDataReference,
+    BackendDelegateInlineData,
+    Chain,
+    ContainerMetadata,
+    DataLocation,
+    DelegateCall,
+    EValue,
+    ExecutionPlan,
+    Instruction,
+    String,
+    Tensor,
+    TensorShapeDynamism,
+)
+from executorch.exir.scalar_type import ScalarType
+
+
+TOKENIZER_BACKEND_ID = "TokenizerBackend"
+TOKENIZER_METHOD_NAME = "tokenize"
+
+
+def _allocation_info(memory_id: int, memory_offset: int) -> AllocationDetails:
+    # AllocationDetails stores the offset as its low 32 bits plus the rest.
+    low_word, high_word = memory_offset & 0xFFFFFFFF, memory_offset >> 32
+    return AllocationDetails(
+        memory_id=memory_id, memory_offset_low=low_word, memory_offset_high=high_word
+    )
+
+
+def _make_token_tensor(max_context_length: int) -> Tensor:
+    """Build the int64 output tensor spec, upper-bounded at max_context_length."""
+    if max_context_length <= 0:
+        message = f"max_context_length must be positive, got {max_context_length}"
+        raise ValueError(message)
+    return Tensor(
+        scalar_type=ScalarType.LONG,
+        storage_offset=0,
+        sizes=[max_context_length],  # upper bound, see shape_dynamism
+        dim_order=[0],
+        requires_grad=False,
+        layout=0,
+        data_buffer_idx=0,
+        allocation_info=_allocation_info(memory_id=1, memory_offset=0),
+        shape_dynamism=TensorShapeDynamism.DYNAMIC_BOUND,
+    )
+
+
+def append_tokenizer_delegate_method(
+    executorch_program_manager: Any,
+    tokenizer_path: str,
+    max_context_length: int,
+    method_name: str = TOKENIZER_METHOD_NAME,
+    bos: int = 0,
+    eos: int = 0,
+) -> None:
+    """
+    Add a tokenizer entry point directly to an ExecuTorch program.
+
+    The method takes one string EValue and returns one int64 token tensor. The
+    tensor is memory planned to the model's max context length and resized by
+    the runtime tokenizer delegate to the actual token count.
+
+    Raises ValueError, before the program is modified, if max_context_length
+    is not positive, bos/eos fall outside [0, 127], or method_name is taken.
+    """
+    for spec_key, count in (("bos", bos), ("eos", eos)):
+        if not 0 <= count <= 127:  # the runtime backend stores these in int8_t
+            raise ValueError(f"{spec_key} must be in [0, 127], got {count}")
+    # Built first: it validates max_context_length, and nothing may be appended
+    # to the program until every argument has been checked.
+    token_tensor = _make_token_tensor(max_context_length)
+    tokenizer_bytes = Path(tokenizer_path).read_bytes()
+    program = executorch_program_manager._emitter_output.program
+    if any(plan.name == method_name for plan in program.execution_plan):
+        raise ValueError(f"Program already has a method named {method_name}")
+
+    delegate_data_index = len(program.backend_delegate_data)
+    program.backend_delegate_data.append(
+        BackendDelegateInlineData(data=tokenizer_bytes)
+    )
+    delegate = BackendDelegate(
+        id=TOKENIZER_BACKEND_ID,
+        processed=BackendDelegateDataReference(
+            location=DataLocation.INLINE,
+            index=delegate_data_index,
+        ),
+        compile_specs=[
+            CompileSpec("max_context_length", str(max_context_length).encode()),
+            CompileSpec("bos", str(bos).encode()),
+            CompileSpec("eos", str(eos).encode()),
+        ],
+    )
+
+    input_id, output_id = 0, 1  # indices into `values` below
+    plan = ExecutionPlan(
+        name=method_name,
+        values=[EValue(String("")), EValue(token_tensor)],
+        inputs=[input_id],
+        outputs=[output_id],
+        chains=[
+            Chain(
+                inputs=[input_id],
+                outputs=[output_id],
+                instructions=[
+                    Instruction(
+                        DelegateCall(delegate_index=0, args=[input_id, output_id])
+                    )
+                ],
+                stacktrace=None,
+            )
+        ],
+        operators=[],
+        delegates=[delegate],
+        non_const_buffer_sizes=[0, max_context_length * 8],  # 8 bytes per int64
+        container_meta_type=ContainerMetadata("", ""),
+    )
+    program.execution_plan.append(plan)
+
+    executorch_program_manager._pte_data, executorch_program_manager._tensor_data = (
+        serialize_for_executorch(
+            executorch_program_manager._emitter_output,
+            executorch_program_manager._backend_config,
+            executorch_program_manager._data_serializer,
+            executorch_program_manager._named_data,
+        )
+    )
+    executorch_program_manager._buffer = None  # discard any earlier buffer
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index 4d34fd716e3..0e36ce3fbc0 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -18,14 +18,24 @@
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
 #include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/runtime.h>
+#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
-#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/tekken.h>
 #include <pytorch/tokenizers/tiktoken.h>
 
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <string_view>
+
 namespace executorch::extension::llm {
 
 using ::executorch::extension::Module;
@@ -88,6 +98,269 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
   return nullptr;
 }
 
+std::unique_ptr<tokenizers::Tokenizer> load_tokenizer_from_buffer(
+    const void* data,
+    size_t size,
+    std::unique_ptr<std::vector<std::string>> special_tokens,
+    std::optional<std::string> pattern,
+    size_t bos_token_index,
+    size_t eos_token_index) {
+  // Probes each supported tokenizer format in a fixed order and returns the
+  // first one that loads from the buffer, or nullptr when none does.
+  runtime::runtime_init();
+  auto tekken = std::make_unique<tokenizers::Tekken>();
+  if (tekken->load_from_buffer(data, size) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded tekken tokenizer from buffer");
+    return tekken;
+  }
+
+  auto hf_json = std::make_unique<tokenizers::HFTokenizer>();
+  if (hf_json->load_from_buffer(data, size) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded json tokenizer from buffer");
+    return hf_json;
+  }
+  // Tiktoken is the only format that consumes the optional arguments.
+  std::unique_ptr<::tokenizers::Tiktoken> tiktoken;
+  if (special_tokens == nullptr) {
+    tiktoken = std::make_unique<::tokenizers::Tiktoken>();
+  } else if (pattern.has_value()) {
+    tiktoken = std::make_unique<::tokenizers::Tiktoken>(
+        pattern.value(),
+        std::move(special_tokens),
+        bos_token_index,
+        eos_token_index);
+  } else {
+    tiktoken = std::make_unique<::tokenizers::Tiktoken>(
+        std::move(special_tokens), bos_token_index, eos_token_index);
+  }
+  if (tiktoken->load_from_buffer(data, size) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded TikToken tokenizer from buffer");
+    return tiktoken;
+  }
+
+  auto sentencepiece = std::make_unique<::tokenizers::SPTokenizer>();
+  if (sentencepiece->load_from_buffer(data, size) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded Sentencepiece tokenizer from buffer");
+    return sentencepiece;
+  }
+
+  auto llama2c = std::make_unique<::tokenizers::Llama2cTokenizer>();
+  if (llama2c->load_from_buffer(data, size) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded BPE tokenizer from buffer");
+    return llama2c;
+  }
+
+  return nullptr;
+}
+
+namespace {
+
+constexpr auto kTokenizerBackendId = "TokenizerBackend"; // registered name
+constexpr auto kMaxContextLengthSpec = "max_context_length"; // required spec
+constexpr auto kBosSpec = "bos"; // optional spec
+constexpr auto kEosSpec = "eos"; // optional spec
+// Per-delegate state: built in init(), read in execute(), ended in destroy().
+struct TokenizerDelegateHandle final {
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer;
+  size_t max_context_length{0};
+  int8_t bos{0};
+  int8_t eos{0};
+};
+
+// Parses compile spec `key` (decimal digits only, no sign/space) into *out.
+// Missing key: InvalidProgram if `required`, else *out is left unchanged.
+Error parse_size_compile_spec(
+    executorch::runtime::ArrayRef<executorch::runtime::CompileSpec>
+        compile_specs,
+    const char* key,
+    bool required,
+    size_t* out) {
+  for (const auto& spec : compile_specs) {
+    if (std::strcmp(spec.key, key) != 0) {
+      continue;
+    }
+    std::string value(
+        static_cast<const char*>(spec.value.buffer), spec.value.nbytes);
+    errno = 0;
+    char* end = nullptr;
+    const unsigned long long parsed = std::strtoull(value.c_str(), &end, 10);
+    ET_CHECK_OR_RETURN_ERROR(
+        !value.empty() && value[0] >= '0' && value[0] <= '9' &&
+            errno != ERANGE && end == value.c_str() + value.size(),
+        InvalidProgram,
+        "Invalid TokenizerBackend compile spec %s=%s",
+        key,
+        value.c_str());
+    ET_CHECK_OR_RETURN_ERROR(
+        parsed <= std::numeric_limits<size_t>::max(),
+        InvalidProgram,
+        "TokenizerBackend compile spec %s is too large",
+        key);
+    *out = static_cast<size_t>(parsed);
+    return Error::Ok;
+  }
+  if (required) {
+    ET_LOG(Error, "Missing TokenizerBackend compile spec %s", key);
+    return Error::InvalidProgram;
+  }
+  return Error::Ok;
+}
+
+Error parse_i8_compile_spec(
+    executorch::runtime::ArrayRef<executorch::runtime::CompileSpec>
+        compile_specs,
+    const char* key,
+    int8_t* out) {
+  auto wide = static_cast<size_t>(*out); // kept when the key is absent
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      parse_size_compile_spec(compile_specs, key, /*required=*/false, &wide));
+  ET_CHECK_OR_RETURN_ERROR(
+      wide <= static_cast<size_t>(INT8_MAX),
+      InvalidProgram,
+      "TokenizerBackend compile spec %s is too large for int8_t",
+      key);
+  *out = static_cast<int8_t>(wide);
+  return Error::Ok;
+}
+
+class TokenizerBackend final : public executorch::runtime::BackendInterface {
+ public:
+  bool is_available() const override {
+    return true;
+  }
+  // Parses compile specs, constructs the handle and loads the tokenizer.
+  executorch::runtime::Result<executorch::runtime::DelegateHandle*> init(
+      executorch::runtime::BackendInitContext& context,
+      executorch::runtime::FreeableBuffer* processed,
+      executorch::runtime::ArrayRef<executorch::runtime::CompileSpec>
+          compile_specs) const override {
+    ET_CHECK_OR_RETURN_ERROR(
+        processed != nullptr && processed->data() != nullptr &&
+            processed->size() > 0,
+        InvalidProgram,
+        "TokenizerBackend requires non-empty bundled tokenizer data");
+
+    size_t max_context_length = 0;
+    ET_CHECK_OK_OR_RETURN_ERROR(parse_size_compile_spec(
+        compile_specs,
+        kMaxContextLengthSpec,
+        /*required=*/true,
+        &max_context_length));
+    ET_CHECK_OR_RETURN_ERROR(
+        max_context_length > 0,
+        InvalidProgram,
+        "TokenizerBackend max_context_length must be positive");
+    // bos/eos are optional specs; both stay 0 when absent.
+    int8_t bos = 0;
+    int8_t eos = 0;
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        parse_i8_compile_spec(compile_specs, kBosSpec, &bos));
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        parse_i8_compile_spec(compile_specs, kEosSpec, &eos));
+    // The handle lives in runtime-allocator memory and is placement-built.
+    auto* handle = context.get_runtime_allocator()
+                       ->allocateInstance<TokenizerDelegateHandle>();
+    ET_CHECK_OR_RETURN_ERROR(
+        handle != nullptr,
+        MemoryAllocationFailed,
+        "Failed to allocate TokenizerBackend handle");
+    new (handle) TokenizerDelegateHandle();
+    handle->max_context_length = max_context_length;
+    handle->bos = bos;
+    handle->eos = eos;
+    handle->tokenizer = load_tokenizer_from_buffer(
+        processed->data(), processed->size());
+    if (handle->tokenizer == nullptr) {
+      handle->~TokenizerDelegateHandle(); // undo the placement new above
+      ET_LOG(Error, "Failed to load bundled tokenizer");
+      return Error::InvalidProgram;
+    }
+    return reinterpret_cast<executorch::runtime::DelegateHandle*>(handle);
+  }
+  // Encodes args[0] (string) and writes the ids to args[1] (1-D int64 tensor).
+  Error execute(
+      executorch::runtime::BackendExecutionContext&,
+      executorch::runtime::DelegateHandle* handle,
+      executorch::runtime::Span<executorch::runtime::EValue*> args)
+      const override {
+    ET_CHECK_OR_RETURN_ERROR(
+        handle != nullptr,
+        DelegateInvalidHandle,
+        "TokenizerBackend handle is null");
+    ET_CHECK_OR_RETURN_ERROR(
+        args.size() == 2,
+        InvalidProgram,
+        "TokenizerBackend expects 2 arguments, got %zu",
+        args.size());
+
+    auto* tokenizer_handle =
+        reinterpret_cast<TokenizerDelegateHandle*>(handle);
+    auto* input = args[0];
+    auto* output_value = args[1];
+    ET_CHECK_OR_RETURN_ERROR(
+        input != nullptr && input->isString(),
+        InvalidArgument,
+        "TokenizerBackend input must be a string");
+    ET_CHECK_OR_RETURN_ERROR(
+        output_value != nullptr && output_value->isTensor(),
+        InvalidArgument,
+        "TokenizerBackend output must be a tensor");
+    // Encode first; the output tensor is validated and resized afterwards.
+    const std::string_view prompt = input->toString();
+    auto tokens_result = tokenizer_handle->tokenizer->encode(
+        std::string(prompt), tokenizer_handle->bos, tokenizer_handle->eos);
+    if (!tokens_result.ok()) {
+      ET_LOG(Error, "Bundled tokenizer failed to encode input");
+      return Error::InvalidArgument;
+    }
+    const auto& tokens = tokens_result.get();
+    ET_CHECK_OR_RETURN_ERROR(
+        tokens.size() <= tokenizer_handle->max_context_length,
+        InvalidArgument,
+        "Tokenizer output length %zu exceeds max context length %zu",
+        tokens.size(),
+        tokenizer_handle->max_context_length);
+
+    auto& output = output_value->toTensor();
+    ET_CHECK_OR_RETURN_ERROR(
+        output.scalar_type() == executorch::aten::ScalarType::Long,
+        InvalidArgument,
+        "TokenizerBackend output tensor must be int64");
+    ET_CHECK_OR_RETURN_ERROR(
+        output.dim() == 1,
+        InvalidArgument,
+        "TokenizerBackend output tensor must be rank 1");
+    executorch::aten::SizesType output_size =
+        static_cast<executorch::aten::SizesType>(tokens.size());
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        executorch::runtime::resize_tensor(
+            output,
+            executorch::aten::ArrayRef<executorch::aten::SizesType>(
+                &output_size, 1)));
+    auto* output_data = output.mutable_data_ptr<int64_t>();
+    for (size_t i = 0; i < tokens.size(); ++i) {
+      output_data[i] = static_cast<int64_t>(tokens[i]);
+    }
+    return Error::Ok;
+  }
+  // Runs the handle's destructor only; its memory is not freed here.
+  void destroy(executorch::runtime::DelegateHandle* handle) const override {
+    if (handle != nullptr) {
+      reinterpret_cast<TokenizerDelegateHandle*>(handle)
+          ->~TokenizerDelegateHandle();
+    }
+  }
+};
+
+// Self-registers at static-init time; the status is stored, never checked.
+TokenizerBackend tokenizer_backend;
+executorch::runtime::Backend tokenizer_backend_entry{
+    kTokenizerBackendId, &tokenizer_backend};
+static auto tokenizer_backend_registered =
+    executorch::runtime::register_backend(tokenizer_backend_entry);
+
+} // namespace
+
 ::executorch::runtime::Result<std::unordered_map<std::string, int64_t>>
 get_llm_metadata(tokenizers::Tokenizer* tokenizer, Module* module) {
   // Initialize metadata with default values
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index b4c7c59806d..697f6162171 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -51,6 +51,21 @@ ET_EXPERIMENTAL std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     size_t bos_token_index = 0,
     size_t eos_token_index = 1);
 
+/**
+ * @brief Loads a tokenizer from an in-memory buffer, e.g. bytes bundled in an
+ * ExecuTorch program as tokenizer delegate inline data. Formats are tried in
+ * order: Tekken, HF JSON, Tiktoken, SentencePiece, Llama2c. The optional
+ * arguments only affect the Tiktoken attempt. Returns nullptr if none loads.
+ */
+ET_EXPERIMENTAL std::unique_ptr<tokenizers::Tokenizer>
+load_tokenizer_from_buffer(
+    const void* data,
+    size_t size,
+    std::unique_ptr<std::vector<std::string>> special_tokens = nullptr,
+    std::optional<std::string> pattern = std::nullopt,
+    size_t bos_token_index = 0,
+    size_t eos_token_index = 1);
+
 /**
  * @brief Gets LLM metadata from the model and tokenizer
  *
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 9afaab0b97e..49d6d67f4c0 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -12,8 +12,8 @@
 #include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <executorch/extension/llm/runner/util.h>
 #include <executorch/runtime/platform/runtime.h>
-#include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
 
 #ifdef CUDA_AVAILABLE
 #include <executorch/backends/cuda/runtime/memory_tracker.h>
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index 160b254460a..94a186e3065 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -15,9 +15,9 @@
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/util.h>
 #include <executorch/runtime/platform/runtime.h>
+#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
-#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/tiktoken.h>
 
 namespace executorch::extension::llm {
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index b642403834a..95eb49cec72 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a
+Subproject commit 95eb49cec721b23e40b96ca2560225b169c5ffe4
diff --git a/extension/wasm/tokenizers/tokenizers.cpp b/extension/wasm/tokenizers/tokenizers.cpp
index b1558464f20..a015921955b 100644
--- a/extension/wasm/tokenizers/tokenizers.cpp
+++ b/extension/wasm/tokenizers/tokenizers.cpp
@@ -9,9 +9,9 @@
 #include <emscripten.h>
 #include <emscripten/bind.h>
 #include <executorch/runtime/platform/compiler.h>
+#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
-#include <pytorch/tokenizers/sentencepiece.h>
 #include <pytorch/tokenizers/tekken.h>
 #include <pytorch/tokenizers/tiktoken.h>
 #include <cstdio>
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index 1610804586d..58c71fdfc45 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -1290,14 +1290,9 @@ Method::set_input(const EValue& input_evalue, size_t input_idx) {
         lhs,
         rhs);
   } else if (e.isString()) {
-    ET_CHECK_OR_RETURN_ERROR(
-        e.toString() == input_evalue.toString(),
-        InvalidArgument,
-        "The %" ET_PRIsize_t
-        "-th input of method should have the same value as the input_evalue, but get %s and %s",
-        input_idx,
-        e.toString().data(),
-        input_evalue.toString().data());
+    // Strings are runtime inputs. The EValue references caller-owned storage,
+    // which must outlive this execution, matching non-memory-planned tensors.
+    mutable_value(get_input_index(input_idx)) = input_evalue;
   } else {
 #if ET_LOG_ENABLED
     std::array<char, kTagNameBufferSize> tag_name;