From 0ba85256d7aa34ff2f3459d91b2c689e75f2d40d Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 3 Jun 2025 16:08:00 -0700
Subject: [PATCH 1/2] [llm] Add a generic text only LLM runner

Introduce `text_llm_runner`, a generic runner that can be used to run all text-only, decoder-only LLM models supported by ExecuTorch.

* Metadata is read from the .pte file and used to construct the runner object.
* examples/models/llama/runner/runner.h[.cpp] now only contains a simple wrapper around `text_llm_runner.h[.cpp]`.


In follow-up PRs I will move examples/models/phi-3-mini/runner to use the generic runner.

I will look into the QNN and MediaTek runners as well.

Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/)

[ghstack-poisoned]
---
 .../LLaMARunner/Exported/LLaMARunner.mm       |   2 +-
 examples/models/llama/main.cpp                |   9 +-
 examples/models/llama/runner/runner.cpp       | 338 +---------------
 examples/models/llama/runner/runner.h         |  72 +---
 examples/models/llama/runner/targets.bzl      |   9 +-
 .../models/llama/tokenizer/llama_tiktoken.cpp |   8 +-
 .../models/llama/tokenizer/llama_tiktoken.h   |   2 +
 extension/android/jni/jni_layer_llama.cpp     |   4 +-
 .../apple/Benchmark/Tests/LLaMA/LLaMATests.mm |   2 +-
 extension/llm/runner/targets.bzl              |   5 +
 extension/llm/runner/text_llm_runner.cpp      | 382 ++++++++++++++++++
 extension/llm/runner/text_llm_runner.h        | 109 +++++
 12 files changed, 534 insertions(+), 408 deletions(-)
 create mode 100644 extension/llm/runner/text_llm_runner.cpp
 create mode 100644 extension/llm/runner/text_llm_runner.h

diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm
index c2f01bf17b1..e9ff473c2cd 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm
@@ -31,7 +31,7 @@ - (instancetype)initWithModelPath:(NSString*)modelPath
   self = [super init];
   if (self) {
     [ExecuTorchLog.sharedLog addSink:self];
-    _runner = example::Runner::create(
+    _runner = example::create_llama_runner(
         modelPath.UTF8String, tokenizerPath.UTF8String);
   }
   return self;
diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp
index 1c1b6f62dc1..38009dd59ec 100644
--- a/examples/models/llama/main.cpp
+++ b/examples/models/llama/main.cpp
@@ -81,8 +81,13 @@ int32_t main(int32_t argc, char** argv) {
   }
 #endif
   // create llama runner
-  std::unique_ptr<example::Runner> runner =
-      example::Runner::create(model_path, tokenizer_path, data_path);
+  std::unique_ptr<::executorch::extension::llm::TextLLMRunner> runner =
+      example::create_llama_runner(model_path, tokenizer_path, data_path);
+
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create llama runner");
+    return 1;
+  }
 
   if (warmup) {
     runner->warmup(prompt, /*max_new_tokens=*/seq_len);
diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp
index 119eedc704e..2ba2fdf9941 100644
--- a/examples/models/llama/runner/runner.cpp
+++ b/examples/models/llama/runner/runner.cpp
@@ -11,8 +11,7 @@
 // The module takes in a string as input and emits a string as output.
 
 #include <executorch/examples/models/llama/runner/runner.h>
-
-#include <executorch/extension/llm/runner/util.h>
+#include <executorch/extension/module/module.h>
 
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
@@ -26,41 +25,14 @@ using ::executorch::runtime::Result;
 
 namespace llm = ::executorch::extension::llm;
 
-namespace {
-static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
-static constexpr auto kBosId = "get_bos_id";
-static constexpr auto kEosIds = "get_eos_ids";
-static constexpr auto kMaxSeqLen = "get_max_seq_len";
-static constexpr auto kMaxContextLen = "get_max_context_len";
-static constexpr auto kVocabSize = "get_vocab_size";
-static constexpr auto kUseKVCache = "use_kv_cache";
-static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
-
-std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer(
-    const std::string& tokenizer_path) {
-  auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
-  if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
-    ET_LOG(Info, "Loaded json tokenizer");
-    return json_tokenizer;
-  }
-
-  auto tiktoken_tokenizer = get_tiktoken_for_llama();
-  if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
-    ET_LOG(Info, "Loaded TikToken tokenizer");
-    return tiktoken_tokenizer;
-  }
-
-  auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>();
-  if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
-    ET_LOG(Info, "Loaded BPE tokenizer");
-    return bpe_tokenizer;
-  }
-
-  return nullptr;
+std::unique_ptr<::tokenizers::Tokenizer> load_llama_tokenizer(
+    const std::string& tokenizer_path,
+    Version version) {
+  auto special_tokens = get_special_tokens(version);
+  return llm::load_tokenizer(tokenizer_path, std::move(special_tokens));
 }
-} // namespace
 
-std::unique_ptr<Runner> Runner::create(
+std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
     std::optional<const std::string> data_path,
@@ -71,29 +43,10 @@ std::unique_ptr<Runner> Runner::create(
       model_path.c_str(),
       tokenizer_path.c_str());
 
-  // Create the Module
-  std::unique_ptr<Module> module;
-  if (data_path.has_value()) {
-    module = std::make_unique<Module>(
-        model_path, data_path.value(), Module::LoadMode::File);
-  } else {
-    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
-  }
-
-  // Initialize metadata with default values
-  std::unordered_map<std::string, int64_t> metadata({
-      {kEnableDynamicShape, false},
-      {kMaxSeqLen, 128},
-      {kMaxContextLen, 128},
-      {kUseKVCache, true},
-      {kUseSDPAWithKVCache, false},
-  });
-
   // Create and load tokenizer
   std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
-      load_tokenizer(tokenizer_path);
+      load_llama_tokenizer(tokenizer_path, Version::Default);
 
-  // Fallback to BPE tokenizer if tiktoken fails
   if (tokenizer == nullptr) {
     ET_LOG(
         Info,
@@ -101,279 +54,8 @@ std::unique_ptr<Runner> Runner::create(
         tokenizer_path.c_str());
     return nullptr;
   }
-
-  ET_LOG(Info, "Reading metadata from model");
-
-  // Set tokenizer-related metadata
-  metadata[kBosId] = tokenizer->bos_tok();
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
-      std::unordered_set<uint64_t>{tokenizer->eos_tok()});
-  metadata[kVocabSize] = tokenizer->vocab_size();
-
-  // Read metadata from the model
-  auto method_names_result = module->method_names();
-  if (method_names_result.error() != Error::Ok) {
-    ET_LOG(Error, "Failed reading method names");
-    return nullptr;
-  }
-  const auto method_names = method_names_result.get();
-
-  for (auto& pair : metadata) {
-    const auto& method_name = pair.first;
-    auto& value = pair.second;
-
-    if (method_names.count(method_name)) {
-      auto get_result = module->get(method_name);
-      value = get_result.get().toScalar().to<decltype(metadata)::mapped_type>();
-    } else {
-      ET_LOG(
-          Info,
-          "Method %s not found, using the default value %" PRId64,
-          method_name.c_str(),
-          value);
-    }
-    ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
-  }
-
-  // Get EOS IDs if available
-  if (method_names.count(kEosIds)) {
-    eos_ids->clear();
-    auto execute_result = module->execute(kEosIds);
-    if (execute_result.error() != Error::Ok) {
-      ET_LOG(Error, "Failed to execute %s", kEosIds);
-      return nullptr;
-    }
-    for (const auto& eos_id : execute_result.get()) {
-      auto value = eos_id.toScalar().to<int64_t>();
-      eos_ids->emplace(value);
-      ET_LOG(Info, "eos_id = %" PRId64, value);
-    }
-  }
-
-  // Create text_decoder_runner. Use a shared_ptr so that it can be shared with
-  // TextPrefiller and TextTokenGenerator
-  auto text_decoder_runner = std::make_unique<llm::TextDecoderRunner>(
-      module.get(), metadata.at(kUseKVCache));
-
-  // Create text_prefiller
-  auto text_prefiller = std::make_unique<llm::TextPrefiller>(
-      text_decoder_runner.get(),
-      metadata.at(kUseKVCache),
-      metadata.at(kEnableDynamicShape),
-      metadata.at(kMaxSeqLen));
-
-  // Create text_token_generator with stats
-  auto stats = std::make_unique<llm::Stats>();
-  auto text_token_generator = std::make_unique<llm::TextTokenGenerator>(
-      tokenizer.get(),
-      text_decoder_runner.get(),
-      metadata.at(kUseKVCache),
-      std::move(eos_ids),
-      stats.get());
-
-  // Create and return the Runner instance
-  return std::make_unique<Runner>(
-      std::move(metadata),
-      std::move(tokenizer),
-      std::move(module),
-      std::move(text_decoder_runner),
-      std::move(text_prefiller),
-      std::move(text_token_generator),
-      std::move(stats),
-      temperature);
-}
-
-Runner::Runner(
-    std::unordered_map<std::string, int64_t> metadata,
-    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-    std::unique_ptr<::executorch::extension::Module> module,
-    std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
-        text_decoder_runner,
-    std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller,
-    std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
-        text_token_generator,
-    std::unique_ptr<::executorch::extension::llm::Stats> stats,
-    float temperature)
-    : tokenizer_(std::move(tokenizer)),
-      metadata_(std::move(metadata)),
-      module_(std::move(module)),
-      text_decoder_runner_(std::move(text_decoder_runner)),
-      text_prefiller_(std::move(text_prefiller)),
-      text_token_generator_(std::move(text_token_generator)),
-      stats_(std::move(stats)),
-      temperature_(temperature) {
-  // Note: This constructor assumes that text_prefiller and text_token_generator
-  // already have references to the Module and TextDecoderRunner they need
-}
-
-bool Runner::is_loaded() const {
-  return text_prefiller_->is_loaded() && text_token_generator_->is_loaded();
-}
-
-Error Runner::load() {
-  if (is_loaded()) {
-    return Error::Ok;
-  }
-  ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load());
-  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
-  return Error::Ok;
-}
-
-// Don't print with the same priority during warmup
-#define RUNNER_ET_LOG(warmup, format, ...) \
-  if (warmup) {                            \
-    ET_LOG(Debug, format, __VA_ARGS__);    \
-  } else {                                 \
-    ET_LOG(Info, format, __VA_ARGS__);     \
-  }
-
-Error Runner::generate(
-    const std::string& prompt,
-    const ::executorch::extension::llm::GenerationConfig& config,
-    std::function<void(const std::string&)> token_callback,
-    std::function<void(const llm::Stats&)> stats_callback) {
-  // Prepare the inputs.
-  // Use ones-initialized inputs.
-  ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
-  if (!is_loaded()) {
-    stats_->model_load_start_ms = llm::time_in_ms();
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_->model_load_end_ms = llm::time_in_ms();
-  }
-
-  if (config.warming) {
-    ET_LOG(Info, "Doing a warmup run...");
-  }
-
-  RUNNER_ET_LOG(
-      config.warming,
-      "RSS after loading model: %f MiB (0 if unsupported)",
-      llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  // Wrap the token_callback with print function
-  std::function<void(const std::string&)> wrapped_callback =
-      [token_callback, config](const std::string& piece) {
-        if (!config.warming) {
-          llm::safe_printf(piece.c_str());
-          fflush(stdout);
-        }
-        if (token_callback) {
-          token_callback(piece);
-        }
-      };
-  // First token time only measures the time it takes to encode the prompt and
-  // return a response token.
-
-  stats_->inference_start_ms = llm::time_in_ms();
-  shouldStop_ = false;
-
-  ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
-      prompt,
-      /* bos */ 0,
-      /* eos */ 0);
-
-  ET_CHECK_TK_OK_OR_RETURN_ERROR(
-      encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
-
-  // encode the (string) prompt into tokens sequence
-  std::vector<uint64_t> prompt_tokens = encode_res.get();
-  int num_prompt_tokens = prompt_tokens.size();
-
-  ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token");
-  ET_CHECK_MSG(
-      num_prompt_tokens < metadata_.at(kMaxContextLen),
-      "num_prompt_tokens %d >= max_seq_len_ %" PRId64
-      ", Max seq length exceeded - please increase max seq len value in your export script",
-      num_prompt_tokens,
-      metadata_.at(kMaxContextLen));
-
-  // Determine max_new_tokens using the GenerationConfig's resolve method
-  int max_new_tokens = config.resolve_max_new_tokens(
-      metadata_.at(kMaxContextLen), num_prompt_tokens);
-
-  ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens);
-
-  // Prefill first
-  // Here feed all tokens to the model and get the next predicted token
-  // after the prompt. After that we will enter generate loop.
-
-  // print prompts
-  if (config.echo) {
-    wrapped_callback(prompt);
-  }
-  int64_t pos = 0;
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
-  uint64_t cur_token = prefill_res.get();
-  stats_->first_token_ms = llm::time_in_ms();
-  stats_->prompt_eval_end_ms = llm::time_in_ms();
-
-  // print the first token from prefill. No prev_token so use cur_token for it.
-  wrapped_callback(
-      ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
-  RUNNER_ET_LOG(
-      config.warming,
-      "RSS after prompt prefill: %f MiB (0 if unsupported)",
-      llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  // start the main loop
-  prompt_tokens.push_back(cur_token);
-
-  // Generate max_new_tokens - 1 because prefill already generated 1 token.
-  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
-      prompt_tokens,
-      num_prompt_tokens,
-      max_new_tokens - 1,
-      temperature_ == -1.0f ? config.temperature : temperature_,
-      wrapped_callback));
-
-  stats_->inference_end_ms = llm::time_in_ms();
-  if (!config.warming) {
-    printf("\n");
-  }
-  RUNNER_ET_LOG(
-      config.warming,
-      "RSS after finishing text generation: %f MiB (0 if unsupported)",
-      llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  if (num_generated_tokens == max_new_tokens) {
-    RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
-  }
-
-  stats_->num_prompt_tokens = num_prompt_tokens;
-  stats_->num_generated_tokens = num_generated_tokens;
-
-  if (config.warming) {
-    ET_LOG(Info, "Warmup run finished!");
-  } else {
-    // Do not print report during warmup
-    ::executorch::llm::print_report(*stats_);
-  }
-  if (stats_callback) {
-    stats_callback(*stats_);
-  }
-
-  return Error::Ok;
-}
-
-Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) {
-  // Create a GenerationConfig for warmup
-  llm::GenerationConfig config{
-      .echo = false, .max_new_tokens = max_new_tokens, .warming = true};
-
-  // Call generate with the warmup config
-  Error err = generate(prompt, config);
-
-  // Reset stats after warmup, not resetting the std::unique_ptr!
-  stats_->reset();
-  return err;
+  return llm::create_text_llm_runner(
+      model_path, std::move(tokenizer), data_path);
 }
 
-void Runner::stop() {
-  if (is_loaded()) {
-    text_token_generator_->stop();
-  } else {
-    ET_LOG(Error, "Token generator is not loaded, cannot stop");
-  }
-}
 } // namespace example
diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h
index e4e91db37d5..09a166b0109 100644
--- a/examples/models/llama/runner/runner.h
+++ b/examples/models/llama/runner/runner.h
@@ -19,74 +19,20 @@
 #include <unordered_map>
 
 #include <executorch/extension/llm/runner/irunner.h>
-#include <executorch/extension/llm/runner/stats.h>
-#include <executorch/extension/llm/runner/text_decoder_runner.h>
-#include <executorch/extension/llm/runner/text_prefiller.h>
-#include <executorch/extension/llm/runner/text_token_generator.h>
-#include <executorch/extension/module/module.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
 namespace example {
 
-class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
- public:
-  // Static factory method to create a Runner instance
-  static std::unique_ptr<Runner> create(
-      const std::string& model_path,
-      const std::string& tokenizer_path,
-      std::optional<const std::string> data_path = std::nullopt,
-      float temperature = -1.0f);
+namespace llm = ::executorch::extension::llm;
 
-  // Constructor with dependency injection
-  explicit Runner(
-      std::unordered_map<std::string, int64_t> metadata,
-      std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-      std::unique_ptr<::executorch::extension::Module> module,
-      std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
-          text_decoder_runner,
-      std::unique_ptr<::executorch::extension::llm::TextPrefiller>
-          text_prefiller,
-      std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
-          text_token_generator,
-      std::unique_ptr<::executorch::extension::llm::Stats> stats,
-      float temperature = -1.0f);
+std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    std::optional<const std::string> data_path = std::nullopt,
+    float temperature = -1.0f);
 
-  bool is_loaded() const override;
-  ::executorch::runtime::Error load() override;
-  ::executorch::runtime::Error generate(
-      const std::string& prompt,
-      const ::executorch::extension::llm::GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const ::executorch::extension::llm::Stats&)>
-          stats_callback = {}) override;
-  ::executorch::runtime::Error warmup(
-      const std::string& prompt,
-      int32_t max_new_tokens);
-  void stop() override;
-
- private:
-  bool shouldStop_{false};
-
-  // Components
-  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
-  std::unordered_map<std::string, int64_t> metadata_;
-  std::unique_ptr<::executorch::extension::Module>
-      module_; // Manage module's lifecycle, make sure it outlives
-               // text_decoder_runner_.
-  std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
-      text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make
-                            // sure it outlives text_prefiller_ &
-                            // text_token_generator_.
-  std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_;
-  std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
-      text_token_generator_;
-
-  // Stats
-  std::unique_ptr<::executorch::extension::llm::Stats> stats_;
-
-  // temperature.
-  // Deprecated, we should rely on the temperature in GenerationConfig instead.
-  float temperature_ = -1.0f;
-};
+std::unique_ptr<tokenizers::Tokenizer> load_llama_tokenizer(
+    const std::string& tokenizer_path);
 
 } // namespace example
diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl
index 158202cf55a..9f11eab8c28 100644
--- a/examples/models/llama/runner/targets.bzl
+++ b/examples/models/llama/runner/targets.bzl
@@ -36,14 +36,7 @@ def define_common_targets():
             ],
             exported_deps = [
                 "//executorch/backends/xnnpack:xnnpack_backend",
-                "//executorch/extension/llm/runner:irunner",
-                "//executorch/extension/llm/runner:stats",
-                "//executorch/extension/llm/runner:text_decoder_runner" + aten_suffix,
-                "//executorch/extension/llm/runner:text_prefiller" + aten_suffix,
-                "//executorch/extension/llm/runner:text_token_generator" + aten_suffix,
-                "//executorch/extension/evalue_util:print_evalue" + aten_suffix,
-                "//executorch/extension/module:module" + aten_suffix,
-                "//executorch/extension/tensor:tensor" + aten_suffix,
+                "//executorch/extension/llm/runner:runner_lib" + aten_suffix,
                 "//executorch/kernels/quantized:generated_lib" + aten_suffix,
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
diff --git a/examples/models/llama/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp
index f595de3c4e7..7b98a6ca415 100644
--- a/examples/models/llama/tokenizer/llama_tiktoken.cpp
+++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp
@@ -42,7 +42,9 @@ _get_default_special_tokens() {
   return special_tokens;
 }
 
-std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
+} // namespace
+
+std::unique_ptr<std::vector<std::string>> get_special_tokens(Version version) {
   switch (version) {
     case Version::Multimodal:
       return get_multimodal_special_tokens();
@@ -51,11 +53,9 @@ std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
   }
 }
 
-} // namespace
-
 std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
   return std::make_unique<Tiktoken>(
-      _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
+      get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
 }
 
 std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens() {
diff --git a/examples/models/llama/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h
index a7f65eca29e..01d836ffbe6 100644
--- a/examples/models/llama/tokenizer/llama_tiktoken.h
+++ b/examples/models/llama/tokenizer/llama_tiktoken.h
@@ -20,6 +20,8 @@ enum class Version {
 std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama(
     Version version = Version::Default);
 
+std::unique_ptr<std::vector<std::string>> get_special_tokens(Version version);
+
 std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens();
 
 } // namespace example
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 03e26f089db..ad1c77a92b9 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -168,7 +168,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       std::optional<const std::string> data_path_str = data_path
           ? std::optional<const std::string>{data_path->toStdString()}
           : std::nullopt;
-      runner_ = example::Runner::create(
+      // TODO(larryliu0820): Use the API in text_llm_runner.h to create the
+      // runner.
+      runner_ = example::create_llama_runner(
           model_path->toStdString(),
           tokenizer_path->toStdString(),
           data_path_str);
diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
index e53a457939c..c56f054ae3b 100644
--- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
+++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
@@ -74,7 +74,7 @@ @implementation LLaMATests
   NSString *tokenizerPath = resources[@"tokenizer"];
   return @{
     @"generate" : ^(XCTestCase *testCase){
-      auto __block runner = example::Runner::create(
+      auto __block runner = example::create_llama_runner(
           modelPath.UTF8String, tokenizerPath.UTF8String);
       if (!runner) {
         XCTFail("Failed to create runner");
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index 03b593cacf5..d429514dab7 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -84,12 +84,17 @@ def define_common_targets():
             name = "runner_lib" + aten_suffix,
             exported_headers = [
                 "multimodal_runner.h",
+                "text_llm_runner.h",
+            ],
+            srcs = [
+                "text_llm_runner.cpp",
             ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
             exported_deps = [
                 ":image_prefiller" + aten_suffix,
+                ":irunner",
                 ":text_decoder_runner" + aten_suffix,
                 ":text_prefiller" + aten_suffix,
                 ":text_token_generator" + aten_suffix,
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
new file mode 100644
index 00000000000..f7b5ade6cad
--- /dev/null
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
+ */
+
+// A simple llama2 runner that includes preprocessing and post processing logic.
+// The module takes in a string as input and emits a string as output.
+
+#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
+#include <executorch/extension/llm/runner/util.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+
+namespace executorch::extension::llm {
+
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+TextLLMRunner::TextLLMRunner(
+    std::unordered_map<std::string, int64_t> metadata,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::unique_ptr<::executorch::extension::Module> module,
+    std::unique_ptr<TextDecoderRunner> text_decoder_runner,
+    std::unique_ptr<TextPrefiller> text_prefiller,
+    std::unique_ptr<TextTokenGenerator> text_token_generator,
+    std::unique_ptr<Stats> stats,
+    float temperature)
+    : tokenizer_(std::move(tokenizer)),
+      metadata_(std::move(metadata)),
+      module_(std::move(module)),
+      text_decoder_runner_(std::move(text_decoder_runner)),
+      text_prefiller_(std::move(text_prefiller)),
+      text_token_generator_(std::move(text_token_generator)),
+      stats_(std::move(stats)),
+      temperature_(temperature) {
+  // Note: This constructor assumes that text_prefiller and text_token_generator
+  // already have references to the Module and TextDecoderRunner they need
+}
+
+bool TextLLMRunner::is_loaded() const {
+  return text_prefiller_->is_loaded() && text_token_generator_->is_loaded();
+}
+
+Error TextLLMRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+  return Error::Ok;
+}
+
+// Don't print with the same priority during warmup
+#define RUNNER_ET_LOG(warmup, format, ...) \
+  if (warmup) {                            \
+    ET_LOG(Debug, format, __VA_ARGS__);    \
+  } else {                                 \
+    ET_LOG(Info, format, __VA_ARGS__);     \
+  }
+
+Error TextLLMRunner::generate(
+    const std::string& prompt,
+    const GenerationConfig& config,
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
+  // Prepare the inputs.
+  // Use ones-initialized inputs.
+  ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
+  if (!is_loaded()) {
+    stats_->model_load_start_ms = time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    stats_->model_load_end_ms = time_in_ms();
+  }
+
+  if (config.warming) {
+    ET_LOG(Info, "Doing a warmup run...");
+  }
+
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after loading model: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // Wrap the token_callback with print function
+  std::function<void(const std::string&)> wrapped_callback =
+      [token_callback, config](const std::string& piece) {
+        if (!config.warming) {
+          llm::safe_printf(piece.c_str());
+          fflush(stdout);
+        }
+        if (token_callback) {
+          token_callback(piece);
+        }
+      };
+  // First token time only measures the time it takes to encode the prompt and
+  // return a response token.
+
+  stats_->inference_start_ms = time_in_ms();
+  shouldStop_ = false;
+
+  ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
+      prompt,
+      /* bos */ 0,
+      /* eos */ 0);
+
+  ET_CHECK_TK_OK_OR_RETURN_ERROR(
+      encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
+
+  // encode the (string) prompt into tokens sequence
+  std::vector<uint64_t> prompt_tokens = encode_res.get();
+  int num_prompt_tokens = prompt_tokens.size();
+
+  ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token");
+  ET_CHECK_MSG(
+      num_prompt_tokens < metadata_.at(kMaxContextLen),
+      "num_prompt_tokens %d >= max_seq_len_ %" PRId64
+      ", Max seq length exceeded - please increase max seq len value in your export script",
+      num_prompt_tokens,
+      metadata_.at(kMaxContextLen));
+
+  // Determine max_new_tokens using the GenerationConfig's resolve method
+  int max_new_tokens = config.resolve_max_new_tokens(
+      metadata_.at(kMaxContextLen), num_prompt_tokens);
+
+  ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens);
+
+  // Prefill first
+  // Here feed all tokens to the model and get the next predicted token
+  // after the prompt. After that we will enter generate loop.
+
+  // print prompts
+  if (config.echo) {
+    wrapped_callback(prompt);
+  }
+  int64_t pos = 0;
+  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
+  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
+  uint64_t cur_token = prefill_res.get();
+  stats_->first_token_ms = time_in_ms();
+  stats_->prompt_eval_end_ms = time_in_ms();
+
+  // print the first token from prefill. No prev_token so use cur_token for it.
+  wrapped_callback(
+      ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after prompt prefill: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // start the main loop
+  prompt_tokens.push_back(cur_token);
+
+  // Generate max_new_tokens - 1 because prefill already generated 1 token.
+  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
+      prompt_tokens,
+      num_prompt_tokens,
+      max_new_tokens - 1,
+      temperature_ == -1.0f ? config.temperature : temperature_,
+      wrapped_callback));
+
+  stats_->inference_end_ms = time_in_ms();
+  if (!config.warming) {
+    printf("\n");
+  }
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after finishing text generation: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  if (num_generated_tokens == max_new_tokens) {
+    RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
+  }
+
+  stats_->num_prompt_tokens = num_prompt_tokens;
+  stats_->num_generated_tokens = num_generated_tokens;
+
+  if (config.warming) {
+    ET_LOG(Info, "Warmup run finished!");
+  } else {
+    // Do not print report during warmup
+    print_report(*stats_);
+  }
+  if (stats_callback) {
+    stats_callback(*stats_);
+  }
+
+  return Error::Ok;
+}
+
+Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) {
+  // Warmup is a regular generate() pass with echo disabled and warming set.
+  GenerationConfig warmup_config{
+      .echo = false, .max_new_tokens = max_new_tokens, .warming = true};
+
+  // Token and stats callbacks are left at their default (empty) values.
+  const Error status = generate(prompt, warmup_config);
+
+  // Clear the Stats object in place; the std::unique_ptr itself is kept.
+  stats_->reset();
+  return status;
+}
+
+void TextLLMRunner::stop() {
+  if (!is_loaded()) { // Nothing to stop: is_loaded() reported false.
+    ET_LOG(Error, "Token generator is not loaded, cannot stop");
+    return;
+  }
+  text_token_generator_->stop();
+}
+
+std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
+    const std::string& tokenizer_path,
+    std::unique_ptr<std::vector<std::string>> special_tokens,
+    std::optional<std::string> pattern,
+    size_t bos_token_index,
+    size_t eos_token_index) { // Tries HF JSON, Tiktoken, then Llama2c format.
+  auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
+  if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded json tokenizer");
+    return json_tokenizer;
+  }
+  std::unique_ptr<::tokenizers::Tiktoken> tiktoken_tokenizer;
+  if (special_tokens != nullptr && !pattern.has_value()) {
+    tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(
+        special_tokens, bos_token_index, eos_token_index);
+  } else if (special_tokens != nullptr && pattern.has_value()) {
+    tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(
+        pattern.value(), special_tokens, bos_token_index, eos_token_index);
+  } else {
+    tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>();
+  }
+  if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded TikToken tokenizer");
+    return tiktoken_tokenizer;
+  }
+  // Last resort: the Llama2c tokenizer format.
+  auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>();
+  if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+    ET_LOG(Info, "Loaded BPE tokenizer");
+    return bpe_tokenizer;
+  }
+  // None of the three formats could load tokenizer_path.
+  return nullptr;
+}
+
+std::unordered_map<std::string, int64_t> get_llm_metadata(
+    tokenizers::Tokenizer* tokenizer,
+    Module* module) {
+  // Defaults used for any key the model does not export as a method.
+  std::unordered_map<std::string, int64_t> metadata({
+      {llm::kEnableDynamicShape, false},
+      {llm::kMaxSeqLen, 128},
+      {llm::kMaxContextLen, 128},
+      {llm::kUseKVCache, true},
+      {llm::kUseSDPAWithKVCache, false},
+  });
+
+  // Without the method list nothing can be overridden; return the defaults.
+  auto names_result = module->method_names();
+  if (names_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed reading method names");
+    return metadata;
+  }
+  const auto exported = names_result.get();
+
+  // Each default may be overridden by a same-named method in the model.
+  for (auto& [key, value] : metadata) {
+    if (exported.count(key) == 0) {
+      ET_LOG(
+          Info,
+          "Method %s not found, using the default value %" PRId64,
+          key.c_str(),
+          value);
+    } else {
+      // NOTE(review): the Result is unwrapped without an error check; confirm
+      // that module->get() cannot fail once the name is listed.
+      auto fetched = module->get(key);
+      value = fetched.get().toScalar().to<int64_t>();
+    }
+    ET_LOG(Info, "Metadata: %s = %" PRId64, key.c_str(), value);
+  }
+  // BOS id and vocab size always come from the tokenizer, not the model.
+  metadata[llm::kBosId] = tokenizer->bos_tok();
+  metadata[llm::kVocabSize] = tokenizer->vocab_size();
+  return metadata;
+}
+
+std::unordered_set<uint64_t> get_eos_ids(
+    tokenizers::Tokenizer* tokenizer,
+    Module* module) {
+  std::unordered_set<uint64_t> eos_ids = {tokenizer->eos_tok()};
+  // Default is the tokenizer's EOS; the model's kEosIds method overrides it.
+  auto method_names_result = module->method_names();
+  if (method_names_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed reading method names");
+    return eos_ids;
+  }
+  const auto method_names = method_names_result.get();
+
+  if (method_names.count(llm::kEosIds)) {
+    auto execute_result = module->execute(llm::kEosIds);
+    if (execute_result.error() != Error::Ok) {
+      ET_LOG(Error, "Failed to execute %s", llm::kEosIds);
+      return eos_ids; // Still holds the tokenizer EOS, like the path above.
+    }
+    eos_ids.clear(); // Drop the default only once the model values exist.
+    for (const auto& eos_id : execute_result.get()) {
+      auto value = eos_id.toScalar().to<int64_t>();
+      eos_ids.emplace(value);
+      ET_LOG(Info, "eos_id = %" PRId64, value);
+    }
+  }
+  return eos_ids;
+}
+
+std::unique_ptr<TextLLMRunner> create_text_llm_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path,
+    float temperature) { // Returns nullptr if the tokenizer is unusable.
+  // Sanity check tokenizer; without a loaded tokenizer no runner is built.
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer is null or not loaded");
+    return nullptr;
+  }
+
+  // Create the Module, passing data_path as well when one is given.
+  std::unique_ptr<Module> module;
+  if (data_path.has_value()) {
+    module = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
+
+  // Get metadata (defaults, model overrides, tokenizer ids) and the EOS ids.
+  ET_LOG(Info, "Reading metadata from model");
+  auto metadata = llm::get_llm_metadata(tokenizer.get(), module.get());
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
+      llm::get_eos_ids(tokenizer.get(), module.get()));
+
+  // Create text_decoder_runner. It lives in a unique_ptr; TextPrefiller and
+  // TextTokenGenerator below are only handed its raw pointer.
+  auto text_decoder_runner = std::make_unique<TextDecoderRunner>(
+      module.get(), metadata.at(kUseKVCache));
+
+  // Create text_prefiller
+  auto text_prefiller = std::make_unique<TextPrefiller>(
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      metadata.at(kEnableDynamicShape),
+      metadata.at(kMaxSeqLen));
+
+  // Create text_token_generator; it gets raw pointers to tokenizer and stats.
+  auto stats = std::make_unique<Stats>();
+  auto text_token_generator = std::make_unique<TextTokenGenerator>(
+      tokenizer.get(),
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      std::move(eos_ids),
+      stats.get());
+
+  // Create and return the Runner instance; every unique_ptr is moved into it.
+  return std::make_unique<TextLLMRunner>(
+      std::move(metadata),
+      std::move(tokenizer),
+      std::move(module),
+      std::move(text_decoder_runner),
+      std::move(text_prefiller),
+      std::move(text_token_generator),
+      std::move(stats),
+      temperature);
+}
+
+} // namespace executorch::extension::llm
diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h
new file mode 100644
index 00000000000..f2bd5c29e75
--- /dev/null
+++ b/extension/llm/runner/text_llm_runner.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A generic text LLM runner with preprocessing and post processing logic.
+// The module takes in a string as input and emits a string as output.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include <executorch/extension/llm/runner/irunner.h>
+#include <executorch/extension/llm/runner/stats.h>
+#include <executorch/extension/llm/runner/text_decoder_runner.h>
+#include <executorch/extension/llm/runner/text_prefiller.h>
+#include <executorch/extension/llm/runner/text_token_generator.h>
+#include <executorch/extension/module/module.h>
+#include <pytorch/tokenizers/tokenizer.h>
+
+namespace executorch::extension::llm {
+
+static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
+static constexpr auto kBosId = "get_bos_id"; // set from tokenizer->bos_tok()
+static constexpr auto kEosIds = "get_eos_ids"; // executed in get_eos_ids()
+static constexpr auto kMaxSeqLen = "get_max_seq_len";
+static constexpr auto kMaxContextLen = "get_max_context_len";
+static constexpr auto kVocabSize = "get_vocab_size"; // tokenizer->vocab_size()
+static constexpr auto kUseKVCache = "use_kv_cache";
+static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
+
+class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
+ public:
+  // Constructor with dependency injection; takes ownership of each component.
+  explicit TextLLMRunner(
+      std::unordered_map<std::string, int64_t> metadata,
+      std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+      std::unique_ptr<::executorch::extension::Module> module,
+      std::unique_ptr<TextDecoderRunner> text_decoder_runner,
+      std::unique_ptr<TextPrefiller> text_prefiller,
+      std::unique_ptr<TextTokenGenerator> text_token_generator,
+      std::unique_ptr<Stats> stats,
+      float temperature = -1.0f); // -1.0f: generate() uses config.temperature
+
+  bool is_loaded() const override;
+  ::executorch::runtime::Error load() override;
+  ::executorch::runtime::Error generate(
+      const std::string& prompt,
+      const GenerationConfig& config,
+      std::function<void(const std::string&)> token_callback = {},
+      std::function<void(const Stats&)> stats_callback = {}) override;
+  ::executorch::runtime::Error warmup( // generate() with warming set, no echo
+      const std::string& prompt,
+      int32_t max_new_tokens);
+  void stop() override;
+
+ private:
+  bool shouldStop_{false};
+
+  // Components
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
+  std::unordered_map<std::string, int64_t> metadata_;
+  std::unique_ptr<::executorch::extension::Module>
+      module_; // Manage module's lifecycle, make sure it outlives
+               // text_decoder_runner_.
+  std::unique_ptr<TextDecoderRunner>
+      text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make
+                            // sure it outlives text_prefiller_ &
+                            // text_token_generator_.
+  std::unique_ptr<TextPrefiller> text_prefiller_;
+  std::unique_ptr<TextTokenGenerator> text_token_generator_;
+  // Stats. NOTE(review): declared after text_token_generator_, which receives
+  // a raw Stats pointer in create_text_llm_runner, so it dies first -- confirm.
+  std::unique_ptr<Stats> stats_;
+
+  // temperature.
+  // Deprecated, we should rely on the temperature in GenerationConfig instead.
+  float temperature_ = -1.0f;
+};
+
+std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
+    const std::string& tokenizer_path,
+    std::unique_ptr<std::vector<std::string>> special_tokens = nullptr,
+    std::optional<std::string> pattern = std::nullopt,
+    size_t bos_token_index = 0,
+    size_t eos_token_index = 1); // nullptr if no tokenizer format loads
+// Model metadata: built-in defaults, model-method overrides, tokenizer ids.
+std::unordered_map<std::string, int64_t> get_llm_metadata(
+    tokenizers::Tokenizer* tokenizer,
+    Module* module);
+// EOS ids: the tokenizer's EOS unless the model exports a kEosIds method.
+std::unordered_set<uint64_t> get_eos_ids(
+    tokenizers::Tokenizer* tokenizer,
+    Module* module);
+// Builds a TextLLMRunner for model_path; nullptr if tokenizer is not loaded.
+std::unique_ptr<TextLLMRunner> create_text_llm_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path = std::nullopt,
+    float temperature = -1.0f);
+
+} // namespace executorch::extension::llm

From 652f613ef8b087415942b9cd66613f12b8054872 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 3 Jun 2025 16:21:37 -0700
Subject: [PATCH 2/2] Update on "[llm] Add a generic text only LLM runner"

Introducing `text_llm_runner`. This can be used to run all text only decoder only LLM models supported by ExecuTorch.

* Metadata is being read out from the .pte file and being used to construct the runner object.
* examples/models/llama/runner.h[.cpp] only contains a simple wrapper around `text_llm_runner.h[.cpp]`.


In next PRs I will move examples/models/phi-3-mini/runner to use the generic runner.

Will look into QNN and MediaTek runners as well.

Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/)

[ghstack-poisoned]
---
 extension/llm/runner/text_llm_runner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index f7b5ade6cad..879613549ed 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -231,7 +231,7 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
-        special_tokens, bos_token_index, eos_token_index);
+        std::move(special_tokens), bos_token_index, eos_token_index);
   } else if (special_tokens != nullptr && pattern.has_value()) {
     tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(
-        pattern.value(), special_tokens, bos_token_index, eos_token_index);
+        pattern.value(), std::move(special_tokens), bos_token_index, eos_token_index);
   } else {
     tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>();
   }