From 0ba85256d7aa34ff2f3459d91b2c689e75f2d40d Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 3 Jun 2025 16:08:00 -0700 Subject: [PATCH 1/2] [llm] Add a generic text only LLM runner Introducing `text_llm_runner`. This can be used to run all text only decoder only LLM models supported by ExecuTorch. * Metadata is being read out from the .pte file and being used to construct the runner object. * examples/models/llama/runner.h[.cpp] only contains a simple wrapper around `text_llm_runner.h[.cpp]`. In next PRs I will move examples/models/phi-3-mini/runner to use the generic runner. Will look into QNN and MediaTek runners as well. Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/) [ghstack-poisoned] --- .../LLaMARunner/Exported/LLaMARunner.mm | 2 +- examples/models/llama/main.cpp | 9 +- examples/models/llama/runner/runner.cpp | 338 +--------------- examples/models/llama/runner/runner.h | 72 +--- examples/models/llama/runner/targets.bzl | 9 +- .../models/llama/tokenizer/llama_tiktoken.cpp | 8 +- .../models/llama/tokenizer/llama_tiktoken.h | 2 + extension/android/jni/jni_layer_llama.cpp | 4 +- .../apple/Benchmark/Tests/LLaMA/LLaMATests.mm | 2 +- extension/llm/runner/targets.bzl | 5 + extension/llm/runner/text_llm_runner.cpp | 382 ++++++++++++++++++ extension/llm/runner/text_llm_runner.h | 109 +++++ 12 files changed, 534 insertions(+), 408 deletions(-) create mode 100644 extension/llm/runner/text_llm_runner.cpp create mode 100644 extension/llm/runner/text_llm_runner.h diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index c2f01bf17b1..e9ff473c2cd 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -31,7 +31,7 @@ - 
(instancetype)initWithModelPath:(NSString*)modelPath self = [super init]; if (self) { [ExecuTorchLog.sharedLog addSink:self]; - _runner = example::Runner::create( + _runner = example::create_llama_runner( modelPath.UTF8String, tokenizerPath.UTF8String); } return self; diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 1c1b6f62dc1..38009dd59ec 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -81,8 +81,13 @@ int32_t main(int32_t argc, char** argv) { } #endif // create llama runner - std::unique_ptr runner = - example::Runner::create(model_path, tokenizer_path, data_path); + std::unique_ptr<::executorch::extension::llm::TextLLMRunner> runner = + example::create_llama_runner(model_path, tokenizer_path, data_path); + + if (runner == nullptr) { + ET_LOG(Error, "Failed to create llama runner"); + return 1; + } if (warmup) { runner->warmup(prompt, /*max_new_tokens=*/seq_len); diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 119eedc704e..2ba2fdf9941 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -11,8 +11,7 @@ // The module takes in a string as input and emits a string as output. 
#include - -#include +#include #include #include @@ -26,41 +25,14 @@ using ::executorch::runtime::Result; namespace llm = ::executorch::extension::llm; -namespace { -static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; -static constexpr auto kBosId = "get_bos_id"; -static constexpr auto kEosIds = "get_eos_ids"; -static constexpr auto kMaxSeqLen = "get_max_seq_len"; -static constexpr auto kMaxContextLen = "get_max_context_len"; -static constexpr auto kVocabSize = "get_vocab_size"; -static constexpr auto kUseKVCache = "use_kv_cache"; -static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; - -std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer( - const std::string& tokenizer_path) { - auto json_tokenizer = std::make_unique(); - if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded json tokenizer"); - return json_tokenizer; - } - - auto tiktoken_tokenizer = get_tiktoken_for_llama(); - if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded TikToken tokenizer"); - return tiktoken_tokenizer; - } - - auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); - if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded BPE tokenizer"); - return bpe_tokenizer; - } - - return nullptr; +std::unique_ptr<::tokenizers::Tokenizer> load_llama_tokenizer( + const std::string& tokenizer_path, + Version version) { + auto special_tokens = get_special_tokens(version); + return llm::load_tokenizer(tokenizer_path, std::move(special_tokens)); } -} // namespace -std::unique_ptr Runner::create( +std::unique_ptr create_llama_runner( const std::string& model_path, const std::string& tokenizer_path, std::optional data_path, @@ -71,29 +43,10 @@ std::unique_ptr Runner::create( model_path.c_str(), tokenizer_path.c_str()); - // Create the Module - std::unique_ptr module; - if (data_path.has_value()) { - module = std::make_unique( - 
model_path, data_path.value(), Module::LoadMode::File); - } else { - module = std::make_unique(model_path, Module::LoadMode::File); - } - - // Initialize metadata with default values - std::unordered_map metadata({ - {kEnableDynamicShape, false}, - {kMaxSeqLen, 128}, - {kMaxContextLen, 128}, - {kUseKVCache, true}, - {kUseSDPAWithKVCache, false}, - }); - // Create and load tokenizer std::unique_ptr<::tokenizers::Tokenizer> tokenizer = - load_tokenizer(tokenizer_path); + load_llama_tokenizer(tokenizer_path, Version::Default); - // Fallback to BPE tokenizer if tiktoken fails if (tokenizer == nullptr) { ET_LOG( Info, @@ -101,279 +54,8 @@ std::unique_ptr Runner::create( tokenizer_path.c_str()); return nullptr; } - - ET_LOG(Info, "Reading metadata from model"); - - // Set tokenizer-related metadata - metadata[kBosId] = tokenizer->bos_tok(); - auto eos_ids = std::make_unique>( - std::unordered_set{tokenizer->eos_tok()}); - metadata[kVocabSize] = tokenizer->vocab_size(); - - // Read metadata from the model - auto method_names_result = module->method_names(); - if (method_names_result.error() != Error::Ok) { - ET_LOG(Error, "Failed reading method names"); - return nullptr; - } - const auto method_names = method_names_result.get(); - - for (auto& pair : metadata) { - const auto& method_name = pair.first; - auto& value = pair.second; - - if (method_names.count(method_name)) { - auto get_result = module->get(method_name); - value = get_result.get().toScalar().to(); - } else { - ET_LOG( - Info, - "Method %s not found, using the default value %" PRId64, - method_name.c_str(), - value); - } - ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); - } - - // Get EOS IDs if available - if (method_names.count(kEosIds)) { - eos_ids->clear(); - auto execute_result = module->execute(kEosIds); - if (execute_result.error() != Error::Ok) { - ET_LOG(Error, "Failed to execute %s", kEosIds); - return nullptr; - } - for (const auto& eos_id : execute_result.get()) { - auto value 
= eos_id.toScalar().to(); - eos_ids->emplace(value); - ET_LOG(Info, "eos_id = %" PRId64, value); - } - } - - // Create text_decoder_runner. Use a shared_ptr so that it can be shared with - // TextPrefiller and TextTokenGenerator - auto text_decoder_runner = std::make_unique( - module.get(), metadata.at(kUseKVCache)); - - // Create text_prefiller - auto text_prefiller = std::make_unique( - text_decoder_runner.get(), - metadata.at(kUseKVCache), - metadata.at(kEnableDynamicShape), - metadata.at(kMaxSeqLen)); - - // Create text_token_generator with stats - auto stats = std::make_unique(); - auto text_token_generator = std::make_unique( - tokenizer.get(), - text_decoder_runner.get(), - metadata.at(kUseKVCache), - std::move(eos_ids), - stats.get()); - - // Create and return the Runner instance - return std::make_unique( - std::move(metadata), - std::move(tokenizer), - std::move(module), - std::move(text_decoder_runner), - std::move(text_prefiller), - std::move(text_token_generator), - std::move(stats), - temperature); -} - -Runner::Runner( - std::unordered_map metadata, - std::unique_ptr<::tokenizers::Tokenizer> tokenizer, - std::unique_ptr<::executorch::extension::Module> module, - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner, - std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller, - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator, - std::unique_ptr<::executorch::extension::llm::Stats> stats, - float temperature) - : tokenizer_(std::move(tokenizer)), - metadata_(std::move(metadata)), - module_(std::move(module)), - text_decoder_runner_(std::move(text_decoder_runner)), - text_prefiller_(std::move(text_prefiller)), - text_token_generator_(std::move(text_token_generator)), - stats_(std::move(stats)), - temperature_(temperature) { - // Note: This constructor assumes that text_prefiller and text_token_generator - // already have references to the Module and 
TextDecoderRunner they need -} - -bool Runner::is_loaded() const { - return text_prefiller_->is_loaded() && text_token_generator_->is_loaded(); -} - -Error Runner::load() { - if (is_loaded()) { - return Error::Ok; - } - ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load()); - ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load()); - return Error::Ok; -} - -// Don't print with the same priority during warmup -#define RUNNER_ET_LOG(warmup, format, ...) \ - if (warmup) { \ - ET_LOG(Debug, format, __VA_ARGS__); \ - } else { \ - ET_LOG(Info, format, __VA_ARGS__); \ - } - -Error Runner::generate( - const std::string& prompt, - const ::executorch::extension::llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - // Prepare the inputs. - // Use ones-initialized inputs. - ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - if (!is_loaded()) { - stats_->model_load_start_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_->model_load_end_ms = llm::time_in_ms(); - } - - if (config.warming) { - ET_LOG(Info, "Doing a warmup run..."); - } - - RUNNER_ET_LOG( - config.warming, - "RSS after loading model: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // Wrap the token_callback with print function - std::function wrapped_callback = - [token_callback, config](const std::string& piece) { - if (!config.warming) { - llm::safe_printf(piece.c_str()); - fflush(stdout); - } - if (token_callback) { - token_callback(piece); - } - }; - // First token time only measures the time it takes to encode the prompt and - // return a response token. 
- - stats_->inference_start_ms = llm::time_in_ms(); - shouldStop_ = false; - - ::tokenizers::Result> encode_res = tokenizer_->encode( - prompt, - /* bos */ 0, - /* eos */ 0); - - ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); - - // encode the (string) prompt into tokens sequence - std::vector prompt_tokens = encode_res.get(); - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); - ET_CHECK_MSG( - num_prompt_tokens < metadata_.at(kMaxContextLen), - "num_prompt_tokens %d >= max_seq_len_ %" PRId64 - ", Max seq length exceeded - please increase max seq len value in your export script", - num_prompt_tokens, - metadata_.at(kMaxContextLen)); - - // Determine max_new_tokens using the GenerationConfig's resolve method - int max_new_tokens = config.resolve_max_new_tokens( - metadata_.at(kMaxContextLen), num_prompt_tokens); - - ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); - - // Prefill first - // Here feed all tokens to the model and get the next predicted token - // after the prompt. After that we will enter generate loop. - - // print prompts - if (config.echo) { - wrapped_callback(prompt); - } - int64_t pos = 0; - auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); - uint64_t cur_token = prefill_res.get(); - stats_->first_token_ms = llm::time_in_ms(); - stats_->prompt_eval_end_ms = llm::time_in_ms(); - - // print the first token from prefill. No prev_token so use cur_token for it. - wrapped_callback( - ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); - RUNNER_ET_LOG( - config.warming, - "RSS after prompt prefill: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // start the main loop - prompt_tokens.push_back(cur_token); - - // Generate max_new_tokens - 1 because prefill already generated 1 token. 
- int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - prompt_tokens, - num_prompt_tokens, - max_new_tokens - 1, - temperature_ == -1.0f ? config.temperature : temperature_, - wrapped_callback)); - - stats_->inference_end_ms = llm::time_in_ms(); - if (!config.warming) { - printf("\n"); - } - RUNNER_ET_LOG( - config.warming, - "RSS after finishing text generation: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - if (num_generated_tokens == max_new_tokens) { - RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); - } - - stats_->num_prompt_tokens = num_prompt_tokens; - stats_->num_generated_tokens = num_generated_tokens; - - if (config.warming) { - ET_LOG(Info, "Warmup run finished!"); - } else { - // Do not print report during warmup - ::executorch::llm::print_report(*stats_); - } - if (stats_callback) { - stats_callback(*stats_); - } - - return Error::Ok; -} - -Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) { - // Create a GenerationConfig for warmup - llm::GenerationConfig config{ - .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; - - // Call generate with the warmup config - Error err = generate(prompt, config); - - // Reset stats after warmup, not resetting the std::unique_ptr! 
- stats_->reset(); - return err; + return llm::create_text_llm_runner( + model_path, std::move(tokenizer), data_path); } -void Runner::stop() { - if (is_loaded()) { - text_token_generator_->stop(); - } else { - ET_LOG(Error, "Token generator is not loaded, cannot stop"); - } -} } // namespace example diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index e4e91db37d5..09a166b0109 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -19,74 +19,20 @@ #include #include -#include -#include -#include -#include -#include +#include #include namespace example { -class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { - public: - // Static factory method to create a Runner instance - static std::unique_ptr create( - const std::string& model_path, - const std::string& tokenizer_path, - std::optional data_path = std::nullopt, - float temperature = -1.0f); +namespace llm = ::executorch::extension::llm; - // Constructor with dependency injection - explicit Runner( - std::unordered_map metadata, - std::unique_ptr<::tokenizers::Tokenizer> tokenizer, - std::unique_ptr<::executorch::extension::Module> module, - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner, - std::unique_ptr<::executorch::extension::llm::TextPrefiller> - text_prefiller, - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator, - std::unique_ptr<::executorch::extension::llm::Stats> stats, - float temperature = -1.0f); +std::unique_ptr create_llama_runner( + const std::string& model_path, + const std::string& tokenizer_path, + std::optional data_path = std::nullopt, + float temperature = -1.0f); - bool is_loaded() const override; - ::executorch::runtime::Error load() override; - ::executorch::runtime::Error generate( - const std::string& prompt, - const ::executorch::extension::llm::GenerationConfig& config, - std::function token_callback 
= {}, - std::function - stats_callback = {}) override; - ::executorch::runtime::Error warmup( - const std::string& prompt, - int32_t max_new_tokens); - void stop() override; - - private: - bool shouldStop_{false}; - - // Components - std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; - std::unordered_map metadata_; - std::unique_ptr<::executorch::extension::Module> - module_; // Manage module's lifecycle, make sure it outlives - // text_decoder_runner_. - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make - // sure it outlives text_prefiller_ & - // text_token_generator_. - std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_; - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator_; - - // Stats - std::unique_ptr<::executorch::extension::llm::Stats> stats_; - - // temperature. - // Deprecated, we should rely on the temperature in GenerationConfig instead. 
- float temperature_ = -1.0f; -}; +std::unique_ptr load_llama_tokenizer( + const std::string& tokenizer_path); } // namespace example diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl index 158202cf55a..9f11eab8c28 100644 --- a/examples/models/llama/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -36,14 +36,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", - "//executorch/extension/llm/runner:irunner", - "//executorch/extension/llm/runner:stats", - "//executorch/extension/llm/runner:text_decoder_runner" + aten_suffix, - "//executorch/extension/llm/runner:text_prefiller" + aten_suffix, - "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, - "//executorch/extension/evalue_util:print_evalue" + aten_suffix, - "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/tensor:tensor" + aten_suffix, + "//executorch/extension/llm/runner:runner_lib" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, diff --git a/examples/models/llama/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp index f595de3c4e7..7b98a6ca415 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp @@ -42,7 +42,9 @@ _get_default_special_tokens() { return special_tokens; } -std::unique_ptr> _get_special_tokens(Version version) { +} // namespace + +std::unique_ptr> get_special_tokens(Version version) { switch (version) { case Version::Multimodal: return get_multimodal_special_tokens(); @@ -51,11 +53,9 @@ std::unique_ptr> _get_special_tokens(Version version) { } } -} // namespace - std::unique_ptr get_tiktoken_for_llama(Version version) { return std::make_unique( - _get_special_tokens(version), 
kBOSTokenIndex, kEOSTokenIndex); + get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); } std::unique_ptr> get_multimodal_special_tokens() { diff --git a/examples/models/llama/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h index a7f65eca29e..01d836ffbe6 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.h +++ b/examples/models/llama/tokenizer/llama_tiktoken.h @@ -20,6 +20,8 @@ enum class Version { std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama( Version version = Version::Default); +std::unique_ptr> get_special_tokens(Version version); + std::unique_ptr> get_multimodal_special_tokens(); } // namespace example diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 03e26f089db..ad1c77a92b9 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -168,7 +168,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { std::optional data_path_str = data_path ? std::optional{data_path->toStdString()} : std::nullopt; - runner_ = example::Runner::create( + // TODO(larryliu0820): Use the API in text_llm_runner.h to create the + // runner. 
+ runner_ = example::create_llama_runner( model_path->toStdString(), tokenizer_path->toStdString(), data_path_str); diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index e53a457939c..c56f054ae3b 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -74,7 +74,7 @@ @implementation LLaMATests NSString *tokenizerPath = resources[@"tokenizer"]; return @{ @"generate" : ^(XCTestCase *testCase){ - auto __block runner = example::Runner::create( + auto __block runner = example::create_llama_runner( modelPath.UTF8String, tokenizerPath.UTF8String); if (!runner) { XCTFail("Failed to create runner"); diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 03b593cacf5..d429514dab7 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -84,12 +84,17 @@ def define_common_targets(): name = "runner_lib" + aten_suffix, exported_headers = [ "multimodal_runner.h", + "text_llm_runner.h", + ], + srcs = [ + "text_llm_runner.cpp", ], visibility = [ "@EXECUTORCH_CLIENTS", ], exported_deps = [ ":image_prefiller" + aten_suffix, + ":irunner", ":text_decoder_runner" + aten_suffix, ":text_prefiller" + aten_suffix, ":text_token_generator" + aten_suffix, diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp new file mode 100644 index 00000000000..f7b5ade6cad --- /dev/null +++ b/extension/llm/runner/text_llm_runner.cpp @@ -0,0 +1,382 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. 
+// The module takes in a string as input and emits a string as output. + +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +TextLLMRunner::TextLLMRunner( + std::unordered_map metadata, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unique_ptr<::executorch::extension::Module> module, + std::unique_ptr text_decoder_runner, + std::unique_ptr text_prefiller, + std::unique_ptr text_token_generator, + std::unique_ptr stats, + float temperature) + : tokenizer_(std::move(tokenizer)), + metadata_(std::move(metadata)), + module_(std::move(module)), + text_decoder_runner_(std::move(text_decoder_runner)), + text_prefiller_(std::move(text_prefiller)), + text_token_generator_(std::move(text_token_generator)), + stats_(std::move(stats)), + temperature_(temperature) { + // Note: This constructor assumes that text_prefiller and text_token_generator + // already have references to the Module and TextDecoderRunner they need +} + +bool TextLLMRunner::is_loaded() const { + return text_prefiller_->is_loaded() && text_token_generator_->is_loaded(); +} + +Error TextLLMRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load()); + ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load()); + return Error::Ok; +} + +// Don't print with the same priority during warmup +#define RUNNER_ET_LOG(warmup, format, ...) \ + if (warmup) { \ + ET_LOG(Debug, format, __VA_ARGS__); \ + } else { \ + ET_LOG(Info, format, __VA_ARGS__); \ + } + +Error TextLLMRunner::generate( + const std::string& prompt, + const GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + // Prepare the inputs. + // Use ones-initialized inputs. 
+ ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + if (!is_loaded()) { + stats_->model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_->model_load_end_ms = time_in_ms(); + } + + if (config.warming) { + ET_LOG(Info, "Doing a warmup run..."); + } + + RUNNER_ET_LOG( + config.warming, + "RSS after loading model: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback, config](const std::string& piece) { + if (!config.warming) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + } + if (token_callback) { + token_callback(piece); + } + }; + // First token time only measures the time it takes to encode the prompt and + // return a response token. + + stats_->inference_start_ms = time_in_ms(); + shouldStop_ = false; + + ::tokenizers::Result> encode_res = tokenizer_->encode( + prompt, + /* bos */ 0, + /* eos */ 0); + + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); + + // encode the (string) prompt into tokens sequence + std::vector prompt_tokens = encode_res.get(); + int num_prompt_tokens = prompt_tokens.size(); + + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); + ET_CHECK_MSG( + num_prompt_tokens < metadata_.at(kMaxContextLen), + "num_prompt_tokens %d >= max_seq_len_ %" PRId64 + ", Max seq length exceeded - please increase max seq len value in your export script", + num_prompt_tokens, + metadata_.at(kMaxContextLen)); + + // Determine max_new_tokens using the GenerationConfig's resolve method + int max_new_tokens = config.resolve_max_new_tokens( + metadata_.at(kMaxContextLen), num_prompt_tokens); + + ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); + + // Prefill first + // Here feed all tokens to the model and get the next predicted token + // after the prompt. After that we will enter generate loop. 
+ + // print prompts + if (config.echo) { + wrapped_callback(prompt); + } + int64_t pos = 0; + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + + // print the first token from prefill. No prev_token so use cur_token for it. + wrapped_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); + RUNNER_ET_LOG( + config.warming, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + + // Generate max_new_tokens - 1 because prefill already generated 1 token. + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + prompt_tokens, + num_prompt_tokens, + max_new_tokens - 1, + temperature_ == -1.0f ? config.temperature : temperature_, + wrapped_callback)); + + stats_->inference_end_ms = time_in_ms(); + if (!config.warming) { + printf("\n"); + } + RUNNER_ET_LOG( + config.warming, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + if (num_generated_tokens == max_new_tokens) { + RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); + } + + stats_->num_prompt_tokens = num_prompt_tokens; + stats_->num_generated_tokens = num_generated_tokens; + + if (config.warming) { + ET_LOG(Info, "Warmup run finished!"); + } else { + // Do not print report during warmup + print_report(*stats_); + } + if (stats_callback) { + stats_callback(*stats_); + } + + return Error::Ok; +} + +Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) { + // Create a GenerationConfig for warmup + GenerationConfig config{ + .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; + + // Call generate with the warmup config + Error err = generate(prompt, 
config); + + // Reset stats after warmup, not resetting the std::unique_ptr! + stats_->reset(); + return err; +} + +void TextLLMRunner::stop() { + if (is_loaded()) { + text_token_generator_->stop(); + } else { + ET_LOG(Error, "Token generator is not loaded, cannot stop"); + } +} + +std::unique_ptr load_tokenizer( + const std::string& tokenizer_path, + std::unique_ptr> special_tokens, + std::optional pattern, + size_t bos_token_index, + size_t eos_token_index) { + auto json_tokenizer = std::make_unique(); + if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded json tokenizer"); + return json_tokenizer; + } + std::unique_ptr<::tokenizers::Tiktoken> tiktoken_tokenizer; + if (special_tokens != nullptr && !pattern.has_value()) { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( + special_tokens, bos_token_index, eos_token_index); + } else if (special_tokens != nullptr && pattern.has_value()) { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( + pattern.value(), special_tokens, bos_token_index, eos_token_index); + } else { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(); + } + if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded TikToken tokenizer"); + return tiktoken_tokenizer; + } + + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); + if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded BPE tokenizer"); + return bpe_tokenizer; + } + + return nullptr; +} + +std::unordered_map get_llm_metadata( + tokenizers::Tokenizer* tokenizer, + Module* module) { + // Initialize metadata with default values + std::unordered_map metadata({ + {llm::kEnableDynamicShape, false}, + {llm::kMaxSeqLen, 128}, + {llm::kMaxContextLen, 128}, + {llm::kUseKVCache, true}, + {llm::kUseSDPAWithKVCache, false}, + }); + + // Read metadata from the model + auto method_names_result = module->method_names(); + if 
(method_names_result.error() != Error::Ok) { + ET_LOG(Error, "Failed reading method names"); + return metadata; + } + const auto method_names = method_names_result.get(); + + for (auto& pair : metadata) { + const auto& method_name = pair.first; + auto& value = pair.second; + + if (method_names.count(method_name)) { + auto get_result = module->get(method_name); + value = get_result.get().toScalar().to(); + } else { + ET_LOG( + Info, + "Method %s not found, using the default value %" PRId64, + method_name.c_str(), + value); + } + ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); + } + // Set tokenizer-related metadata + metadata[llm::kBosId] = tokenizer->bos_tok(); + metadata[llm::kVocabSize] = tokenizer->vocab_size(); + return metadata; +} + +std::unordered_set get_eos_ids( + tokenizers::Tokenizer* tokenizer, + Module* module) { + std::unordered_set eos_ids = {tokenizer->eos_tok()}; + // Get EOS IDs if available + auto method_names_result = module->method_names(); + if (method_names_result.error() != Error::Ok) { + ET_LOG(Error, "Failed reading method names"); + return eos_ids; + } + const auto method_names = method_names_result.get(); + + if (method_names.count(llm::kEosIds)) { + eos_ids.clear(); + auto execute_result = module->execute(llm::kEosIds); + if (execute_result.error() != Error::Ok) { + ET_LOG(Error, "Failed to execute %s", llm::kEosIds); + return eos_ids; + } + for (const auto& eos_id : execute_result.get()) { + auto value = eos_id.toScalar().to(); + eos_ids.emplace(value); + ET_LOG(Info, "eos_id = %" PRId64, value); + } + } + return eos_ids; +} + +std::unique_ptr create_text_llm_runner( + const std::string& model_path, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::optional data_path, + float temperature) { + // Sanity check tokenizer + if (!tokenizer || !tokenizer->is_loaded()) { + ET_LOG(Error, "Tokenizer is null or not loaded"); + return nullptr; + } + + // Create the Module + std::unique_ptr module; + if 
(data_path.has_value()) { + module = std::make_unique( + model_path, data_path.value(), Module::LoadMode::File); + } else { + module = std::make_unique(model_path, Module::LoadMode::File); + } + + // Get metadata from Module + ET_LOG(Info, "Reading metadata from model"); + auto metadata = llm::get_llm_metadata(tokenizer.get(), module.get()); + + auto eos_ids = std::make_unique>( + llm::get_eos_ids(tokenizer.get(), module.get())); + + // Create text_decoder_runner. It is owned by a unique_ptr; TextPrefiller and + // TextTokenGenerator only borrow it through the raw pointer from .get() + auto text_decoder_runner = std::make_unique( + module.get(), metadata.at(kUseKVCache)); + + // Create text_prefiller + auto text_prefiller = std::make_unique( + text_decoder_runner.get(), + metadata.at(kUseKVCache), + metadata.at(kEnableDynamicShape), + metadata.at(kMaxSeqLen)); + + // Create text_token_generator with stats + auto stats = std::make_unique(); + auto text_token_generator = std::make_unique( + tokenizer.get(), + text_decoder_runner.get(), + metadata.at(kUseKVCache), + std::move(eos_ids), + stats.get()); + + // Create and return the Runner instance + return std::make_unique( + std::move(metadata), + std::move(tokenizer), + std::move(module), + std::move(text_decoder_runner), + std::move(text_prefiller), + std::move(text_token_generator), + std::move(stats), + temperature); +} + +} // namespace executorch::extension::llm diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h new file mode 100644 index 00000000000..f2bd5c29e75 --- /dev/null +++ b/extension/llm/runner/text_llm_runner.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A generic text-only LLM runner that includes preprocessing and post processing logic. 
+// The module takes in a string as input and emits a string as output. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; +static constexpr auto kBosId = "get_bos_id"; +static constexpr auto kEosIds = "get_eos_ids"; +static constexpr auto kMaxSeqLen = "get_max_seq_len"; +static constexpr auto kMaxContextLen = "get_max_context_len"; +static constexpr auto kVocabSize = "get_vocab_size"; +static constexpr auto kUseKVCache = "use_kv_cache"; +static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; + +class ET_EXPERIMENTAL TextLLMRunner : public IRunner { + public: + // Constructor with dependency injection + explicit TextLLMRunner( + std::unordered_map metadata, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unique_ptr<::executorch::extension::Module> module, + std::unique_ptr text_decoder_runner, + std::unique_ptr text_prefiller, + std::unique_ptr text_token_generator, + std::unique_ptr stats, + float temperature = -1.0f); + + bool is_loaded() const override; + ::executorch::runtime::Error load() override; + ::executorch::runtime::Error generate( + const std::string& prompt, + const GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}) override; + ::executorch::runtime::Error warmup( + const std::string& prompt, + int32_t max_new_tokens); + void stop() override; + + private: + bool shouldStop_{false}; + + // Components + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unordered_map metadata_; + std::unique_ptr<::executorch::extension::Module> + module_; // Manage module's lifecycle, make sure it outlives + // text_decoder_runner_. 
+ std::unique_ptr + text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make + // sure it outlives text_prefiller_ & + // text_token_generator_. + std::unique_ptr text_prefiller_; + std::unique_ptr text_token_generator_; + + // Stats + std::unique_ptr stats_; + + // temperature. + // Deprecated, we should rely on the temperature in GenerationConfig instead. + float temperature_ = -1.0f; +}; + +std::unique_ptr load_tokenizer( + const std::string& tokenizer_path, + std::unique_ptr> special_tokens = nullptr, + std::optional pattern = std::nullopt, + size_t bos_token_index = 0, + size_t eos_token_index = 1); + +std::unordered_map get_llm_metadata( + tokenizers::Tokenizer* tokenizer, + Module* module); + +std::unordered_set get_eos_ids( + tokenizers::Tokenizer* tokenizer, + Module* module); + +std::unique_ptr create_text_llm_runner( + const std::string& model_path, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::optional data_path = std::nullopt, + float temperature = -1.0f); + +} // namespace executorch::extension::llm From 652f613ef8b087415942b9cd66613f12b8054872 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 3 Jun 2025 16:21:37 -0700 Subject: [PATCH 2/2] Update on "[llm] Add a generic text only LLM runner" Introducing `text_llm_runner`. This can be used to run all text only decoder only LLM models supported by ExecuTorch. * Metadata is being read out from the .pte file and being used to construct the runner object. * examples/models/llama/runner.h[.cpp] only contains a simple wrapper around `text_llm_runner.h[.cpp]`. In next PRs I will move examples/models/phi-3-mini/runner to use the generic runner. Will look into QNN and MediaTek runners as well. 
Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/) [ghstack-poisoned] --- extension/llm/runner/text_llm_runner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index f7b5ade6cad..879613549ed 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -231,7 +231,7 @@ std::unique_ptr load_tokenizer( special_tokens, bos_token_index, eos_token_index); } else if (special_tokens != nullptr && pattern.has_value()) { tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( - pattern.value(), special_tokens, bos_token_index, eos_token_index); + pattern.value(), std::move(special_tokens), bos_token_index, eos_token_index); } else { tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(); }