diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 3de47598426..80ece46a1bb 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -77,6 +77,11 @@ DEFINE_string( "etdump.in", "If an etdump path is provided, generate an ETDump file at the specified path for profiling purposes."); +DEFINE_string( + method_name, + "forward", + "Method name to execute in the model (e.g., 'forward', 'lora_forward')."); + // Helper function to parse comma-separated string lists std::vector parseStringList(const std::string& input) { std::vector result; @@ -145,11 +150,11 @@ int32_t main(int32_t argc, char** argv) { data_paths, temperature, #ifdef ET_EVENT_TRACER_ENABLED - std::move(etdump_gen_ptr) + std::move(etdump_gen_ptr), #else - nullptr + nullptr, #endif - ); + FLAGS_method_name); if (runner == nullptr) { ET_LOG(Error, "Failed to create llama runner"); diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index d2db805405e..3e26e5334e3 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -37,7 +37,8 @@ std::unique_ptr create_llama_runner( const std::string& tokenizer_path, std::optional data_path, float temperature, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer) { + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer, + const std::string& method_name) { if (data_path.has_value()) { std::vector data_files; data_files.push_back(data_path.value()); @@ -46,14 +47,16 @@ std::unique_ptr create_llama_runner( tokenizer_path, std::move(data_files), temperature, - std::move(event_tracer)); + std::move(event_tracer), + method_name); } return create_llama_runner( model_path, tokenizer_path, std::vector(), temperature, - std::move(event_tracer)); + std::move(event_tracer), + method_name); } std::unique_ptr create_llama_runner( @@ -61,7 +64,8 @@ std::unique_ptr create_llama_runner( const std::string& tokenizer_path, std::vector data_files, float temperature, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer) { + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer, + const std::string& method_name) { ET_LOG( Info, "Creating LLaMa runner: model_path=%s, tokenizer_path=%s", @@ -84,7 +88,8 @@ std::unique_ptr create_llama_runner( std::move(tokenizer), data_files, temperature, - std::move(event_tracer)); + std::move(event_tracer), + method_name); } } // namespace example diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index 10225fcb81d..00d0832908b 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -29,14 +29,16 @@ std::unique_ptr create_llama_runner( const std::string& tokenizer_path, std::optional data_path, float temperature = -1.0f, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr); + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr, + const std::string& method_name = "forward"); std::unique_ptr create_llama_runner( const std::string& model_path, const std::string& tokenizer_path, std::vector data_files = {}, float temperature = -1.0f, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr); + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr, + const std::string& method_name = "forward"); std::unique_ptr load_llama_tokenizer( const std::string& tokenizer_path, diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp index 13f8d7a9db5..25846a2c5bc 100644 --- a/extension/llm/runner/llm_runner_helper.cpp +++ b/extension/llm/runner/llm_runner_helper.cpp @@ -182,18 +182,26 @@ std::unique_ptr create_text_llm_runner( const std::string& model_path, std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::optional data_path, - float temperature) { + float temperature, + const std::string& method_name) { if (data_path.has_value()) { std::vector data_files; data_files.push_back(data_path.value()); return create_text_llm_runner( - model_path, std::move(tokenizer), std::move(data_files), temperature); + model_path, + std::move(tokenizer), + std::move(data_files), + temperature, + nullptr, + method_name); } return create_text_llm_runner( model_path, std::move(tokenizer), std::vector(), - temperature); + temperature, + nullptr, + method_name); } std::unique_ptr create_text_llm_runner( @@ -201,7 +209,8 @@ std::unique_ptr create_text_llm_runner( std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::vector data_files, float temperature, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer) { + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer, + const std::string& method_name) { // Sanity check tokenizer if (!tokenizer || !tokenizer->is_loaded()) { ET_LOG(Error, "Tokenizer is null or not loaded"); @@ -236,10 +245,10 @@ std::unique_ptr create_text_llm_runner( // Create IOManager std::unique_ptr io_manager = std::make_unique(*module); - // Create text_decoder_runner. Use a shared_ptr so that it can be shared with - // TextPrefiller and TextTokenGenerator - auto text_decoder_runner = - std::make_unique(module.get(), io_manager.get()); + // Create text_decoder_runner + ET_LOG(Info, "Using method: %s", method_name.c_str()); + auto text_decoder_runner = std::make_unique( + module.get(), io_manager.get(), method_name); // Create text_prefiller auto text_prefiller = std::make_unique( diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h index 424567b7c2b..373124d8560 100644 --- a/extension/llm/runner/llm_runner_helper.h +++ b/extension/llm/runner/llm_runner_helper.h @@ -95,6 +95,7 @@ ET_EXPERIMENTAL std::unordered_set get_eos_ids( * @param data_path Optional path to additional data required by the model * @param temperature Optional temperature parameter for controlling randomness * (deprecated) + * @param method_name Name of the method to execute in the model * @return std::unique_ptr Initialized TextLLMRunner instance, or * nullptr on failure */ @@ -102,7 +103,8 @@ ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( const std::string& model_path, std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::optional data_path, - float temperature = -1.0f); + float temperature = -1.0f, + const std::string& method_name = "forward"); /** * @brief Creates a TextLLMRunner instance with dependency injection @@ -116,6 +118,8 @@ ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( * @param data_files Vector of paths to additional data required by the model * @param temperature Optional temperature parameter for controlling randomness * (deprecated) + * @param event_tracer Optional event tracer for profiling + * @param method_name Name of the method to execute in the model * @return std::unique_ptr Initialized TextLLMRunner instance, or * nullptr on failure */ @@ -124,7 +128,8 @@ ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( std::unique_ptr<::tokenizers::Tokenizer> tokenizer, std::vector data_files = {}, float temperature = -1.0f, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr); + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr, + const std::string& method_name = "forward"); /** * @brief Creates a MultimodalRunner instance with dependency injection diff --git a/extension/llm/runner/test/test_text_decoder_runner.cpp b/extension/llm/runner/test/test_text_decoder_runner.cpp index 0001509ec55..917467e31fd 100644 --- a/extension/llm/runner/test/test_text_decoder_runner.cpp +++ b/extension/llm/runner/test/test_text_decoder_runner.cpp @@ -47,6 +47,41 @@ class TextDecoderRunnerTest : public Test { std::unique_ptr io_manager_; }; +// Test that method_name defaults to "forward" +TEST_F(TextDecoderRunnerTest, MethodNameDefaultsToForward) { + EXPECT_EQ(runner_->method_name(), "forward"); +} + +// Test that method_name can be set to a custom value +TEST_F(TextDecoderRunnerTest, MethodNameCustomValue) { + auto custom_runner = std::make_unique( + mock_module_.get(), io_manager_.get(), "encode"); + EXPECT_EQ(custom_runner->method_name(), "encode"); +} + +// Test that load() uses method_name (not hardcoded "forward") +TEST_F(TextDecoderRunnerTest, LoadUsesMethodName) { + // Get an available model + const char* model_path = std::getenv("KVCACHE_CACHE_POS"); + if (!model_path) { + GTEST_SKIP() << "No PTE model environment variable set"; + } + auto module = std::make_unique(model_path); + auto load_result = module->load(); + if (load_result != Error::Ok) { + GTEST_SKIP() << "Failed to load model"; + } + + auto io_mgr = std::make_unique(*module); + + // Create runner with a method name that doesn't exist + TextDecoderRunner runner(module.get(), io_mgr.get(), "nonexistent_method"); + + // load() should fail because "nonexistent_method" doesn't exist + auto result = runner.load(); + EXPECT_NE(result, Error::Ok); +} + // Test logits_to_token() method with Float tensor TEST_F(TextDecoderRunnerTest, LogitsToTokenFloat) { TensorFactory tf_float; diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 8d51736ace5..3eb4e346e05 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -22,8 +22,13 @@ namespace llm { // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. -TextDecoderRunner::TextDecoderRunner(Module* module, IOManager* io_manager) - : module_(module), io_manager_(io_manager) {} +TextDecoderRunner::TextDecoderRunner( + Module* module, + IOManager* io_manager, + std::string method_name) + : module_(module), + io_manager_(io_manager), + method_name_(std::move(method_name)) {} // This function is functional, meaning it shouldn't modify any state of the // input. It should be safe to call multiple times with the same inputs. The @@ -32,7 +37,7 @@ ::executorch::runtime::Result TextDecoderRunner::step( TensorPtr& tokens, int64_t start_pos) { // ET_LOG(Info, "Input token %" PRIu64, input_token); - auto method_meta_result = module_->method_meta("forward"); + auto method_meta_result = module_->method_meta(method_name_); if (!method_meta_result.ok()) { return method_meta_result.error(); } @@ -44,25 +49,31 @@ ::executorch::runtime::Result TextDecoderRunner::step( if (use_kv_cache) { auto start_pos_tensor_result = populate_start_pos_or_cache_position( - module_, start_pos, cache_positions, tokens->numel(), "forward"); + module_, + start_pos, + cache_positions, + tokens->numel(), + method_name_.c_str()); if (!start_pos_tensor_result.ok()) { return start_pos_tensor_result.error(); } auto start_pos_tensor = std::move(*start_pos_tensor_result); std::vector inputs; - auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor); + auto inputs_res = + io_manager_->prepare_decode(tokens, start_pos_tensor, method_name_); ET_CHECK_OK_OR_RETURN_ERROR(inputs_res.error()); inputs = inputs_res.get(); - auto outputs_res = module_->forward(inputs); + auto outputs_res = module_->execute(method_name_, inputs); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); - auto update_err = io_manager_->update_decode(outputs_res.get()); + auto update_err = + io_manager_->update_decode(outputs_res.get(), method_name_); ET_CHECK_OK_OR_RETURN_ERROR(update_err); ET_CHECK_MSG( outputs_res.get().size() == 1, - "More then one output returned from executing LLM."); + "More than one output returned from executing LLM."); ET_CHECK_MSG( outputs_res.get()[0].isTensor(), "Non Tensor Output returned from executing LLM"); @@ -72,11 +83,12 @@ ::executorch::runtime::Result TextDecoderRunner::step( } else { // no kv cache (void)start_pos; // unused - auto outputs_res = module_->forward(tokens); + std::vector inputs{tokens}; + auto outputs_res = module_->execute(method_name_, inputs); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, - "More then one output returned from executing LLM."); + "More than one output returned from executing LLM."); ET_CHECK_MSG( outputs_res.get()[0].isTensor(), "Non Tensor Output returned from executing LLM"); diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 720000185c9..8b855e2924f 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -20,7 +20,10 @@ namespace llm { class ET_EXPERIMENTAL TextDecoderRunner { public: - explicit TextDecoderRunner(Module* module, IOManager* io_manager); + explicit TextDecoderRunner( + Module* module, + IOManager* io_manager, + std::string method_name = "forward"); virtual ~TextDecoderRunner() = default; @@ -40,7 +43,14 @@ class ET_EXPERIMENTAL TextDecoderRunner { * @return The error code. */ virtual ::executorch::runtime::Error load() { - return module_->load_method("forward"); + auto err = module_->load_method(method_name_); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG( + Error, + "Failed to load method '%s'. Check available methods in the model.", + method_name_.c_str()); + } + return err; } /** @@ -48,7 +58,15 @@ class ET_EXPERIMENTAL TextDecoderRunner { * @return True if the Module is loaded, false otherwise. */ virtual bool is_method_loaded() { - return module_->is_method_loaded("forward"); + return module_->is_method_loaded(method_name_); + } + + /** + * Get the method name used by this runner. + * @return The method name. + */ + const std::string& method_name() const { + return method_name_; } inline void stop() { @@ -79,6 +97,7 @@ class ET_EXPERIMENTAL TextDecoderRunner { */ Module* module_; IOManager* io_manager_; + std::string method_name_; bool should_stop_{false}; };