From 5e2dc186fb69208074e48ace72897c1e1bb2d915 Mon Sep 17 00:00:00 2001 From: "rasul.osmanbayli" Date: Tue, 2 Jun 2026 21:26:05 +0400 Subject: [PATCH 1/4] feat: update provider model selections --- docs/concepts/architecture.mdx | 2 +- docs/features/structured-output.mdx | 4 +- docs/guides/cli.mdx | 6 +- docs/quickstart.mdx | 2 +- docs/sdk/configuration.mdx | 24 +++---- mobilerun/agent/providers/registry.py | 14 ++-- mobilerun/agent/utils/llm_picker.py | 70 +++++++++++++++++-- mobilerun/cli/tui/settings/data.py | 2 +- mobilerun/config_example.yaml | 10 +-- mobilerun/config_manager/config_manager.py | 12 ++-- pyproject.toml | 3 + tests/test_anthropic_oauth_llm.py | 8 +++ tests/test_gemini_oauth_llm.py | 36 ++++++++++ tests/test_llm_picker.py | 78 ++++++++++++++++++++++ tests/test_provider_registry.py | 77 +++++++++++++++++++++ 15 files changed, 308 insertions(+), 40 deletions(-) create mode 100644 tests/test_gemini_oauth_llm.py create mode 100644 tests/test_llm_picker.py create mode 100644 tests/test_provider_registry.py diff --git a/docs/concepts/architecture.mdx b/docs/concepts/architecture.mdx index 18be667c..d6e62272 100644 --- a/docs/concepts/architecture.mdx +++ b/docs/concepts/architecture.mdx @@ -79,7 +79,7 @@ llm_profiles: model: gpt-4o fast_agent: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite agent: reasoning: true # Enable Manager/Executor workflow max_steps: 15 # Maximum execution steps (global) diff --git a/docs/features/structured-output.mdx b/docs/features/structured-output.mdx index 0b970564..a274c0fd 100644 --- a/docs/features/structured-output.mdx +++ b/docs/features/structured-output.mdx @@ -124,7 +124,7 @@ By default, extraction uses the `structured_output` LLM profile. If not configur llm_profiles: fast_agent: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.3 structured_output: @@ -140,7 +140,7 @@ from mobilerun import load_llm config = MobileConfig() llms = { - "fast_agent": load_llm("GoogleGenAI", "gemini-3.1-flash-lite-preview"), + "fast_agent": load_llm("GoogleGenAI", "gemini-3.1-flash-lite"), "structured_output": load_llm("OpenAI", "gpt-4o-mini"), } diff --git a/docs/guides/cli.mdx b/docs/guides/cli.mdx index 75a98570..2d3734e9 100644 --- a/docs/guides/cli.mdx +++ b/docs/guides/cli.mdx @@ -94,7 +94,7 @@ In screenshot-only mode, Mobilerun does not read an accessibility tree. The agen export GOOGLE_API_KEY=your-key mobilerun run "Archive old emails" \ --provider GoogleGenAI \ - --model gemini-3.1-flash-lite-preview + --model gemini-3.1-flash-lite # OpenAI export OPENAI_API_KEY=your-key @@ -384,7 +384,7 @@ trajectories/2025-10-16_14-30-45/ ```bash Quick Test mobilerun run "Turn on dark mode" \ --provider GoogleGenAI \ - --model gemini-3.1-flash-lite-preview + --model gemini-3.1-flash-lite ``` ```bash Debug Task @@ -405,7 +405,7 @@ mobilerun run "Check Wi-Fi" \ ```bash Cost Optimization mobilerun run "Set alarm" \ --provider GoogleGenAI \ - --model gemini-3.1-flash-lite-preview \ + --model gemini-3.1-flash-lite \ --no-vision ``` diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index 2f3eeb27..9cfcbbe6 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -122,7 +122,7 @@ mobilerun run "Find a contact named John and send him an email" --reasoning **Common CLI flags:** - `--provider` - LLM provider (GoogleGenAI, OpenAI, Anthropic, etc.) -- `--model` - Model name (gemini-3.1-flash-lite-preview, gpt-4o, etc.) +- `--model` - Model name (gemini-3.1-flash-lite, gpt-4o, etc.) - `--vision` - Enable screenshot processing - `--reasoning` - Enable multi-agent planning mode - `--steps N` - Maximum execution steps (default: 15) diff --git a/docs/sdk/configuration.mdx b/docs/sdk/configuration.mdx index 184addb7..1d7d8fc1 100644 --- a/docs/sdk/configuration.mdx +++ b/docs/sdk/configuration.mdx @@ -228,7 +228,7 @@ CredentialsConfig( ```python from llama_index.llms.google_genai import GoogleGenAI -llm = GoogleGenAI(model="gemini-3.1-flash-lite-preview", temperature=0.2) +llm = GoogleGenAI(model="gemini-3.1-flash-lite", temperature=0.2) agent = MobileAgent(goal="...", llms=llm) ``` @@ -242,10 +242,10 @@ agent = MobileAgent( goal="...", llms={ "manager": OpenAI(model="gpt-4o"), # Planning - "executor": GoogleGenAI(model="gemini-3.1-flash-lite-preview"), # Action selection - "fast_agent": GoogleGenAI(model="gemini-3.1-flash-lite-preview"), # Fast Agent: Direct execution (XML tool-calling) + "executor": GoogleGenAI(model="gemini-3.1-flash-lite"), # Action selection + "fast_agent": GoogleGenAI(model="gemini-3.1-flash-lite"), # Fast Agent: Direct execution (XML tool-calling) "app_opener": OpenAI(model="gpt-4o-mini"), # App launching - "structured_output": GoogleGenAI(model="gemini-3.1-flash-lite-preview"), # Output extraction + "structured_output": GoogleGenAI(model="gemini-3.1-flash-lite"), # Output extraction } ) ``` @@ -429,10 +429,10 @@ agent = MobileAgent( # LLMs llms={ "manager": OpenAI(model="gpt-4o"), # Planning - "executor": GoogleGenAI(model="gemini-3.1-flash-lite-preview"), # Action selection - "fast_agent": GoogleGenAI(model="gemini-3.1-flash-lite-preview"), # Fast Agent: Direct execution (XML tool-calling) + "executor": GoogleGenAI(model="gemini-3.1-flash-lite"), # Action selection + "fast_agent": GoogleGenAI(model="gemini-3.1-flash-lite"), # Fast Agent: Direct execution (XML tool-calling) "app_opener": OpenAI(model="gpt-4o-mini"), # App launching - "structured_output": GoogleGenAI(model="gemini-3.1-flash-lite-preview"), # Output extraction + "structured_output": GoogleGenAI(model="gemini-3.1-flash-lite"), # Output extraction }, # Custom tools @@ -505,21 +505,21 @@ agent: llm_profiles: manager: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.2 kwargs: max_tokens: 8192 executor: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.1 kwargs: max_tokens: 4096 fast_agent: # Fast Agent: Direct execution (XML tool-calling) provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.2 kwargs: max_tokens: 8192 @@ -531,7 +531,7 @@ llm_profiles: structured_output: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.0 device: @@ -585,7 +585,7 @@ mobilerun run "Task" --steps 30 --reasoning --vision mobilerun run "Task" --device emulator-5554 --tcp # Override LLM (applies to ALL agents) -mobilerun run "Task" --provider GoogleGenAI --model gemini-3.1-flash-lite-preview +mobilerun run "Task" --provider GoogleGenAI --model gemini-3.1-flash-lite # Override logging mobilerun run "Task" --debug --save-trajectory action --tracing diff --git a/mobilerun/agent/providers/registry.py b/mobilerun/agent/providers/registry.py index ccbdf130..10ed5608 100644 --- a/mobilerun/agent/providers/registry.py +++ b/mobilerun/agent/providers/registry.py @@ -33,9 +33,10 @@ auth_mode="api_key", default_model="gemini-3.1-pro-preview", models=( + "gemini-3.5-flash", "gemini-3-flash-preview", "gemini-3.1-pro-preview", - "gemini-3.1-flash-lite-preview", + "gemini-3.1-flash-lite", ), requires_api_key=True, ), @@ -45,9 +46,10 @@ auth_mode="oauth", default_model="gemini-3.1-pro-preview", models=( + "gemini-3.5-flash", "gemini-3-flash-preview", "gemini-3.1-pro-preview", - "gemini-3.1-flash-lite-preview", + "gemini-3.1-flash-lite", ), credential_path=str(GEMINI_OAUTH_CREDENTIAL_PATH), ), @@ -61,8 +63,9 @@ id="OpenAIResponses", runtime_provider_name="OpenAIResponses", auth_mode="api_key", - default_model="gpt-5.4", + default_model="gpt-5.5", models=( + "gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.4-nano", @@ -73,8 +76,9 @@ id="openai_oauth", runtime_provider_name="openai_oauth", auth_mode="oauth", - default_model="gpt-5.4", + default_model="gpt-5.5", models=( + "gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex", @@ -95,6 +99,7 @@ default_model="claude-sonnet-4-6", models=( "claude-sonnet-4-6", + "claude-opus-4-8", "claude-opus-4-6", "claude-haiku-4-5", ), @@ -107,6 +112,7 @@ default_model="claude-opus-4-7", models=( "claude-opus-4-7", + "claude-opus-4-8", "claude-sonnet-4-6", "claude-opus-4-6", "claude-haiku-4-5", diff --git a/mobilerun/agent/utils/llm_picker.py b/mobilerun/agent/utils/llm_picker.py index 980c60f2..e930857e 100644 --- a/mobilerun/agent/utils/llm_picker.py +++ b/mobilerun/agent/utils/llm_picker.py @@ -23,13 +23,66 @@ "MiniMax", ] +PROVIDER_ALIASES = { + "openai": "OpenAIResponses", + "gpt": "OpenAIResponses", + "gemini": "GoogleGenAI", + "google": "GoogleGenAI", + "claude": "Anthropic", + "openai compatible": "OpenAILike", + "openai-like": "OpenAILike", + "openai like": "OpenAILike", + "openai_compatible": "OpenAILike", + "openai_like": "OpenAILike", + "zai": "ZAI", + "z.ai": "ZAI", +} + +ZAI_GLOBAL_API_BASE = "https://api.z.ai/api/paas/v4" +OPENAI_RESPONSES_MODELS_WITHOUT_SAMPLING_PARAMS = {"gpt-5.5"} +OPENAI_RESPONSES_UNSUPPORTED_SAMPLING_PARAMS = {"temperature", "top_p"} + + +def normalize_provider_name(provider_name: str) -> str: + """Map user-facing provider names to Mobilerun runtime providers.""" + stripped = provider_name.strip() + key = stripped.lower() + return PROVIDER_ALIASES.get(key, stripped) + + +def _openai_responses_model_omits_sampling_params(model: object) -> bool: + return ( + str(model or "").strip() in OPENAI_RESPONSES_MODELS_WITHOUT_SAMPLING_PARAMS + ) + + +def _load_openai_responses(**kwargs: Any) -> LLM: + from llama_index.llms.openai.responses import OpenAIResponses + + class MobilerunOpenAIResponses(OpenAIResponses): + def _get_model_kwargs(self, **kwargs: Any) -> dict[str, Any]: + model_kwargs = super()._get_model_kwargs(**kwargs) + if _openai_responses_model_omits_sampling_params( + model_kwargs.get("model", self.model) + ): + for param in OPENAI_RESPONSES_UNSUPPORTED_SAMPLING_PARAMS: + model_kwargs.pop(param, None) + return model_kwargs + + filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + logger.debug( + "Initializing MobilerunOpenAIResponses with kwargs: " + f"{list(filtered_kwargs.keys())}" + ) + return MobilerunOpenAIResponses(**filtered_kwargs) + def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM: """Load and initialize a configured LLM backend. Args: provider_name: Case-sensitive provider name (e.g. "OpenAIResponses", "Ollama"). - model: Model name (e.g. "gpt-4", "gemini-3.1-flash-lite-preview"). + model: Model name (e.g. "gpt-4", "gemini-3.1-flash-lite"). **kwargs: Keyword arguments for the LLM class constructor. Returns: @@ -38,6 +91,8 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM if not provider_name: raise ValueError("provider_name cannot be empty.") + provider_name = normalize_provider_name(provider_name) + if model is not None: kwargs["model"] = model @@ -67,6 +122,13 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM if "base_url" in kwargs and "api_base" not in kwargs: kwargs["api_base"] = kwargs.pop("base_url") + if provider_name == "ZAI": + provider_name = "OpenAILike" + kwargs.setdefault("is_chat_model", True) + if "base_url" in kwargs and "api_base" not in kwargs: + kwargs["api_base"] = kwargs.pop("base_url") + kwargs.setdefault("api_base", ZAI_GLOBAL_API_BASE) + if provider_name == "DeepSeek": import os @@ -83,9 +145,7 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM # --- Standard providers (inline dispatch) --- if provider_name == "OpenAIResponses": - from llama_index.llms.openai.responses import OpenAIResponses - - llm_class = OpenAIResponses + return _load_openai_responses(**kwargs) elif provider_name == "OpenAILike": from llama_index.llms.openai_like import OpenAILike @@ -206,7 +266,7 @@ def load_llms_from_profiles( }, { "name": "GoogleGenAI", - "model": "gemini-3.1-flash-lite-preview", + "model": "gemini-3.1-flash-lite", }, { "name": "OpenAIResponses", diff --git a/mobilerun/cli/tui/settings/data.py b/mobilerun/cli/tui/settings/data.py index 2ee3e670..9ba45a7b 100644 --- a/mobilerun/cli/tui/settings/data.py +++ b/mobilerun/cli/tui/settings/data.py @@ -45,7 +45,7 @@ class ProfileSettings: """Full LLM profile for one agent role.""" provider: str = "GoogleGenAI" - model: str = "gemini-3.1-flash-lite-preview" + model: str = "gemini-3.1-flash-lite" temperature: float = 0.2 api_key: str = "" api_key_source: str = "auto" diff --git a/mobilerun/config_example.yaml b/mobilerun/config_example.yaml index 465c2bed..c06088b5 100644 --- a/mobilerun/config_example.yaml +++ b/mobilerun/config_example.yaml @@ -71,7 +71,7 @@ llm_profiles: # Manager: Plans and reasons about task progress manager: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.2 # api_key_source: auto # auto = saved env file first, then shell env; env = shell only; file = saved env file # kwargs: # optional kwargs, add api_key in kwargs if not already in .env @@ -80,7 +80,7 @@ llm_profiles: # Executor: Selects and executes atomic actions executor: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.1 # api_key_source: auto # kwargs: @@ -89,7 +89,7 @@ llm_profiles: # Fast Agent: Direct execution agent (XML tool-calling or code generation) fast_agent: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.2 # api_key_source: auto # kwargs: @@ -98,7 +98,7 @@ llm_profiles: # App Opener: Opens apps by name/description app_opener: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.0 # api_key_source: auto # kwargs: @@ -107,7 +107,7 @@ llm_profiles: # Structured Output: Extracts structured data from final answers structured_output: provider: GoogleGenAI - model: gemini-3.1-flash-lite-preview + model: gemini-3.1-flash-lite temperature: 0.0 # api_key_source: auto # kwargs: diff --git a/mobilerun/config_manager/config_manager.py b/mobilerun/config_manager/config_manager.py index 0bf7740c..c2b476bc 100644 --- a/mobilerun/config_manager/config_manager.py +++ b/mobilerun/config_manager/config_manager.py @@ -17,7 +17,7 @@ class LLMProfile: """LLM profile configuration.""" provider: str = "GoogleGenAI" - model: str = "gemini-3.1-flash-lite-preview" + model: str = "gemini-3.1-flash-lite" temperature: float = 0.2 api_key_source: Literal["auto", "env", "file"] = "auto" base_url: Optional[str] = None @@ -232,31 +232,31 @@ def _default_profiles() -> Dict[str, LLMProfile]: return { "manager": LLMProfile( provider="GoogleGenAI", - model="gemini-3.1-flash-lite-preview", + model="gemini-3.1-flash-lite", temperature=0.2, kwargs={}, ), "executor": LLMProfile( provider="GoogleGenAI", - model="gemini-3.1-flash-lite-preview", + model="gemini-3.1-flash-lite", temperature=0.1, kwargs={}, ), "fast_agent": LLMProfile( provider="GoogleGenAI", - model="gemini-3.1-flash-lite-preview", + model="gemini-3.1-flash-lite", temperature=0.2, kwargs={}, ), "app_opener": LLMProfile( provider="GoogleGenAI", - model="gemini-3.1-flash-lite-preview", + model="gemini-3.1-flash-lite", temperature=0.0, kwargs={}, ), "structured_output": LLMProfile( provider="GoogleGenAI", - model="gemini-3.1-flash-lite-preview", + model="gemini-3.1-flash-lite", temperature=0.0, kwargs={}, ), diff --git a/pyproject.toml b/pyproject.toml index 188d90f9..3f09fa23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,9 @@ mobilerun = "mobilerun.cli.main:cli" line-length = 100 target-version = "py313" +[tool.pytest.ini_options] +pythonpath = ["."] + [tool.ruff.lint] select = [ "E", # pycodestyle errors diff --git a/tests/test_anthropic_oauth_llm.py b/tests/test_anthropic_oauth_llm.py index 4f199433..ade21283 100644 --- a/tests/test_anthropic_oauth_llm.py +++ b/tests/test_anthropic_oauth_llm.py @@ -51,3 +51,11 @@ def test_default_opus_payload_sends_max_tokens_without_temperature(): assert payload["model"] == "claude-opus-4-7" assert payload["max_tokens"] == 8192 assert "temperature" not in payload + + +def test_opus_4_8_payload_sends_max_tokens_without_temperature(): + payload = _payload_for(model="claude-opus-4-8") + + assert payload["model"] == "claude-opus-4-8" + assert payload["max_tokens"] == 8192 + assert "temperature" not in payload diff --git a/tests/test_gemini_oauth_llm.py b/tests/test_gemini_oauth_llm.py new file mode 100644 index 00000000..4f116cb7 --- /dev/null +++ b/tests/test_gemini_oauth_llm.py @@ -0,0 +1,36 @@ +from llama_index.core.base.llms.types import ChatMessage, MessageRole + +from mobilerun.agent.providers.registry import resolve_provider_variant +from mobilerun.agent.providers.setup_service import ( + SetupSelection, + create_profile_for_variant, +) +from mobilerun.agent.utils.llm_picker import load_llm + + +def test_gemini_oauth_profile_sends_gemini_3_5_flash_verbatim(tmp_path) -> None: + variant = resolve_provider_variant("gemini", "oauth") + profile = create_profile_for_variant( + variant, + SetupSelection( + family_id="gemini", + variant_id=variant.id, + auth_mode="oauth", + model="gemini-3.5-flash", + credential_path=str(tmp_path / "missing-auth-profiles.json"), + ), + ) + + llm = load_llm( + profile.provider, + model=profile.model, + credential_path=profile.credential_path, + **profile.kwargs, + ) + payload = llm._to_code_assist_request( + [ChatMessage(role=MessageRole.USER, content="hello")] + ) + + assert profile.provider == "gemini_oauth_code_assist" + assert profile.model == "gemini-3.5-flash" + assert payload["model"] == "gemini-3.5-flash" diff --git a/tests/test_llm_picker.py b/tests/test_llm_picker.py new file mode 100644 index 00000000..fe5ebe8a --- /dev/null +++ b/tests/test_llm_picker.py @@ -0,0 +1,78 @@ +import pytest + +from mobilerun.agent.utils.llm_picker import load_llm, normalize_provider_name + + +@pytest.mark.parametrize( + ("alias", "expected"), + [ + ("OpenAI", "OpenAIResponses"), + ("openai", "OpenAIResponses"), + ("GPT", "OpenAIResponses"), + ("Gemini", "GoogleGenAI"), + ("Google", "GoogleGenAI"), + ("Claude", "Anthropic"), + ("OpenAI Compatible", "OpenAILike"), + ("OpenAI-like", "OpenAILike"), + ("ZAI", "ZAI"), + ("Z.AI", "ZAI"), + ], +) +def test_normalize_provider_name_accepts_user_facing_aliases( + alias: str, expected: str +) -> None: + assert normalize_provider_name(alias) == expected + + +def test_openai_responses_omits_temperature_for_gpt_5_5() -> None: + llm = load_llm( + "OpenAIResponses", + model="gpt-5.5", + api_key="stub", + temperature=0.4, + ) + + kwargs = llm._get_model_kwargs() + + assert kwargs["model"] == "gpt-5.5" + assert "temperature" not in kwargs + assert "top_p" not in kwargs + + +def test_openai_responses_keeps_temperature_for_gpt_5_4() -> None: + llm = load_llm( + "OpenAIResponses", + model="gpt-5.4", + api_key="stub", + temperature=0.4, + ) + + kwargs = llm._get_model_kwargs() + + assert kwargs["model"] == "gpt-5.4" + assert kwargs["temperature"] == 0.4 + assert kwargs["top_p"] == 1.0 + + +def test_openai_alias_loads_openai_responses_without_temperature_for_gpt_5_5() -> None: + llm = load_llm( + "OpenAI", + model="gpt-5.5", + api_key="stub", + temperature=0.4, + ) + + assert type(llm).__name__ == "MobilerunOpenAIResponses" + assert "temperature" not in llm._get_model_kwargs() + assert "top_p" not in llm._get_model_kwargs() + + +def test_zai_alias_uses_openai_like_transport_defaults() -> None: + llm = load_llm( + "ZAI", + model="glm-5", + api_key="stub", + ) + + assert type(llm).__name__ == "OpenAILike" + assert llm.api_base == "https://api.z.ai/api/paas/v4" diff --git a/tests/test_provider_registry.py b/tests/test_provider_registry.py new file mode 100644 index 00000000..2f192f7d --- /dev/null +++ b/tests/test_provider_registry.py @@ -0,0 +1,77 @@ +from mobilerun.agent.providers.registry import ( + list_models_for_variant, + normalize_model_id_for_variant, + resolve_provider_variant, +) +from mobilerun.config_manager.config_manager import LLMProfile, MobileConfig + + +def test_gemini_catalogs_use_current_flash_models() -> None: + expected_models = ( + "gemini-3.5-flash", + "gemini-3-flash-preview", + "gemini-3.1-pro-preview", + "gemini-3.1-flash-lite", + ) + + for auth_mode in ("api_key", "oauth"): + variant = resolve_provider_variant("gemini", auth_mode) + models = list_models_for_variant("gemini", auth_mode) + + assert variant.default_model == "gemini-3.1-pro-preview" + assert models == expected_models + assert "gemini-3.1-flash-lite-preview" not in models + + +def test_anthropic_catalogs_include_opus_4_8_without_changing_defaults() -> None: + api_key_variant = resolve_provider_variant("anthropic", "api_key") + api_key_models = list_models_for_variant("anthropic", "api_key") + oauth_variant = resolve_provider_variant("anthropic", "oauth") + oauth_models = list_models_for_variant("anthropic", "oauth") + + assert api_key_variant.default_model == "claude-sonnet-4-6" + assert api_key_models == ( + "claude-sonnet-4-6", + "claude-opus-4-8", + "claude-opus-4-6", + "claude-haiku-4-5", + ) + assert oauth_variant.default_model == "claude-opus-4-7" + assert oauth_models == ( + "claude-opus-4-7", + "claude-opus-4-8", + "claude-sonnet-4-6", + "claude-opus-4-6", + "claude-haiku-4-5", + ) + + +def test_openai_oauth_catalog_uses_current_codex_model() -> None: + variant = resolve_provider_variant("openai", "oauth") + models = list_models_for_variant("openai", "oauth") + + assert variant.default_model == "gpt-5.5" + assert models == ("gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex") + assert ( + normalize_model_id_for_variant( + "openai", "oauth", "openai-codex/gpt-5.3-codex" + ) + == "gpt-5.3-codex" + ) + + +def test_openai_api_key_catalog_uses_current_default_model() -> None: + variant = resolve_provider_variant("openai", "api_key") + models = list_models_for_variant("openai", "api_key") + + assert variant.default_model == "gpt-5.5" + assert models == ("gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.4-nano") + + +def test_default_profiles_use_stable_gemini_flash_lite() -> None: + config = MobileConfig() + + assert LLMProfile().model == "gemini-3.1-flash-lite" + assert { + profile.model for profile in config.llm_profiles.values() + } == {"gemini-3.1-flash-lite"} From 2ba8b6dd97fc4097aa623789f833c81e145dacac Mon Sep 17 00:00:00 2001 From: "rasul.osmanbayli" Date: Wed, 3 Jun 2026 13:06:09 +0400 Subject: [PATCH 2/4] fix: restore OpenAI Responses usage tracking --- mobilerun/agent/usage.py | 14 +++++++-- tests/test_usage.py | 64 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 tests/test_usage.py diff --git a/mobilerun/agent/usage.py b/mobilerun/agent/usage.py index 9ba4a046..263547e0 100644 --- a/mobilerun/agent/usage.py +++ b/mobilerun/agent/usage.py @@ -24,6 +24,11 @@ "Ollama", ] +PROVIDER_ALIASES = { + "MobilerunOpenAIResponses": "OpenAIResponses", + "openai_responses_llm": "OpenAIResponses", +} + class UsageResult(BaseModel): request_tokens: int @@ -50,7 +55,12 @@ def _usage_field(usage: Any, *names: str) -> int: return 0 +def _normalize_provider_name(provider: str) -> str: + return PROVIDER_ALIASES.get(provider, provider) + + def get_usage_from_response(provider: str, chat_rsp: ChatResponse) -> UsageResult: + provider = _normalize_provider_name(provider) rsp = chat_rsp.raw if not rsp: raise ValueError("No raw response in chat response") @@ -210,7 +220,7 @@ def llm_callback(llm: LLM, *args: List[BaseCallbackHandler]): def create_tracker(llm: LLM) -> TokenCountingHandler: - provider = llm.__class__.__name__ + provider = _normalize_provider_name(llm.__class__.__name__) if provider not in SUPPORTED_PROVIDERS: raise ValueError(f"Tracking not yet supported for provider: {provider}") @@ -245,7 +255,7 @@ def track_usage(llm: LLM) -> TokenCountingHandler: >>> # ... make LLM calls ... >>> print(f"Total tokens used: {tracker.usage.total_tokens}") """ - provider = llm.__class__.__name__ + provider = _normalize_provider_name(llm.__class__.__name__) if provider not in SUPPORTED_PROVIDERS: raise ValueError(f"Tracking not yet supported for provider: {provider}") diff --git a/tests/test_usage.py b/tests/test_usage.py new file mode 100644 index 00000000..363e3527 --- /dev/null +++ b/tests/test_usage.py @@ -0,0 +1,64 @@ +from types import SimpleNamespace + +from llama_index.core.base.llms.types import ChatMessage, ChatResponse, MessageRole + +from mobilerun.agent.usage import ( + TokenCountingHandler, + create_tracker, + get_usage_from_response, + track_usage, +) +from mobilerun.agent.utils.llm_picker import load_llm + + +def _openai_responses_chat_response() -> ChatResponse: + return ChatResponse( + message=ChatMessage(role=MessageRole.ASSISTANT, content="ok"), + raw=SimpleNamespace( + usage=SimpleNamespace( + input_tokens=3, + output_tokens=2, + total_tokens=5, + ) + ), + ) + + +def test_track_usage_supports_mobilerun_openai_responses_wrapper() -> None: + llm = load_llm("OpenAIResponses", model="gpt-5.5", api_key="stub") + + tracker = track_usage(llm) + + assert isinstance(tracker, TokenCountingHandler) + assert tracker.provider == "OpenAIResponses" + + +def test_create_tracker_supports_mobilerun_openai_responses_wrapper() -> None: + llm = load_llm("OpenAIResponses", model="gpt-5.5", api_key="stub") + + tracker = create_tracker(llm) + + assert isinstance(tracker, TokenCountingHandler) + assert tracker.provider == "OpenAIResponses" + + +def test_openai_responses_wrapper_name_extracts_usage_from_response() -> None: + usage = get_usage_from_response( + "MobilerunOpenAIResponses", _openai_responses_chat_response() + ) + + assert usage.request_tokens == 3 + assert usage.response_tokens == 2 + assert usage.total_tokens == 5 + assert usage.requests == 1 + + +def test_openai_responses_class_name_extracts_usage_from_response() -> None: + usage = get_usage_from_response( + "openai_responses_llm", _openai_responses_chat_response() + ) + + assert usage.request_tokens == 3 + assert usage.response_tokens == 2 + assert usage.total_tokens == 5 + assert usage.requests == 1 From 9bf3aee9844281d902f555c986335ba901a29756 Mon Sep 17 00:00:00 2001 From: "rasul.osmanbayli" Date: Wed, 3 Jun 2026 21:15:36 +0400 Subject: [PATCH 3/4] fix: align OAuth model validation --- mobilerun/agent/providers/registry.py | 1 - mobilerun/agent/usage.py | 1 + mobilerun/agent/utils/llm_picker.py | 68 ++++++++++++++++++++++- tests/test_llm_picker.py | 80 +++++++++++++++++++++++++++ tests/test_provider_registry.py | 12 +--- tests/test_usage.py | 9 +++ 6 files changed, 158 insertions(+), 13 deletions(-) diff --git a/mobilerun/agent/providers/registry.py b/mobilerun/agent/providers/registry.py index 10ed5608..f0ab2309 100644 --- a/mobilerun/agent/providers/registry.py +++ b/mobilerun/agent/providers/registry.py @@ -81,7 +81,6 @@ "gpt-5.5", "gpt-5.4", "gpt-5.4-mini", - "gpt-5.3-codex", ), credential_path=str(OPENAI_OAUTH_CREDENTIAL_PATH), ), diff --git a/mobilerun/agent/usage.py b/mobilerun/agent/usage.py index 263547e0..ecf46fa7 100644 --- a/mobilerun/agent/usage.py +++ b/mobilerun/agent/usage.py @@ -25,6 +25,7 @@ ] PROVIDER_ALIASES = { + "MobilerunAnthropic": "Anthropic", "MobilerunOpenAIResponses": "OpenAIResponses", "openai_responses_llm": "OpenAIResponses", } diff --git a/mobilerun/agent/utils/llm_picker.py b/mobilerun/agent/utils/llm_picker.py index e930857e..1b495a3e 100644 --- a/mobilerun/agent/utils/llm_picker.py +++ b/mobilerun/agent/utils/llm_picker.py @@ -39,8 +39,15 @@ } ZAI_GLOBAL_API_BASE = "https://api.z.ai/api/paas/v4" +OPENAI_OAUTH_UNSUPPORTED_MODELS = {"gpt-5.3-codex"} OPENAI_RESPONSES_MODELS_WITHOUT_SAMPLING_PARAMS = {"gpt-5.5"} OPENAI_RESPONSES_UNSUPPORTED_SAMPLING_PARAMS = {"temperature", "top_p"} +ANTHROPIC_CURRENT_MODEL_CONTEXT_WINDOWS = { + "claude-opus-4-8": 200_000, + "claude-sonnet-4-6": 200_000, + "claude-opus-4-6": 200_000, + "claude-haiku-4-5": 200_000, +} def normalize_provider_name(provider_name: str) -> str: @@ -56,6 +63,20 @@ def _openai_responses_model_omits_sampling_params(model: object) -> bool: ) +def _anthropic_model_omits_temperature(model: object) -> bool: + return str(model or "").strip().startswith("claude-opus-4") + + +def _validate_openai_oauth_model(model: object) -> None: + model_id = str(model or "").strip() + if model_id in OPENAI_OAUTH_UNSUPPORTED_MODELS: + supported = "gpt-5.5, gpt-5.4, or gpt-5.4-mini" + raise ValueError( + f"Model '{model_id}' is not supported with OpenAI OAuth " + f"ChatGPT-account credentials. Use {supported}." + ) + + def _load_openai_responses(**kwargs: Any) -> LLM: from llama_index.llms.openai.responses import OpenAIResponses @@ -77,6 +98,48 @@ def _get_model_kwargs(self, **kwargs: Any) -> dict[str, Any]: return MobilerunOpenAIResponses(**filtered_kwargs) +def _load_anthropic(**kwargs: Any) -> LLM: + from llama_index.core.base.llms.types import LLMMetadata + from llama_index.llms.anthropic import Anthropic + + class MobilerunAnthropic(Anthropic): + @property + def _model_kwargs(self) -> dict[str, Any]: + model_kwargs = super()._model_kwargs + if ( + _anthropic_model_omits_temperature( + model_kwargs.get("model", self.model) + ) + and "temperature" not in (self.additional_kwargs or {}) + ): + model_kwargs.pop("temperature", None) + return model_kwargs + + @property + def metadata(self) -> LLMMetadata: + try: + return super().metadata + except ValueError: + context_window = ANTHROPIC_CURRENT_MODEL_CONTEXT_WINDOWS.get( + self.model + ) + if context_window is None: + raise + return LLMMetadata( + context_window=context_window, + num_output=self.max_tokens, + is_chat_model=True, + model_name=self.model, + is_function_calling_model=True, + ) + + filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + logger.debug( + f"Initializing MobilerunAnthropic with kwargs: {list(filtered_kwargs.keys())}" + ) + return MobilerunAnthropic(**filtered_kwargs) + + def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM: """Load and initialize a configured LLM backend. @@ -100,6 +163,7 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM if provider_name == "openai_oauth": from mobilerun.agent.utils.oauth.openai_oauth_llm import OpenAIOAuth + _validate_openai_oauth_model(kwargs.get("model")) return OpenAIOAuth(**{k: v for k, v in kwargs.items() if v is not None}) if provider_name == "anthropic_oauth": from mobilerun.agent.utils.oauth.anthropic_oauth_llm import AnthropicOAuthLLM @@ -162,9 +226,7 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM llm_class = Ollama elif provider_name == "Anthropic": - from llama_index.llms.anthropic import Anthropic - - llm_class = Anthropic + return _load_anthropic(**kwargs) elif provider_name == "OpenRouter": from llama_index.llms.openrouter import OpenRouter diff --git a/tests/test_llm_picker.py b/tests/test_llm_picker.py index fe5ebe8a..887721b5 100644 --- a/tests/test_llm_picker.py +++ b/tests/test_llm_picker.py @@ -76,3 +76,83 @@ def test_zai_alias_uses_openai_like_transport_defaults() -> None: assert type(llm).__name__ == "OpenAILike" assert llm.api_base == "https://api.z.ai/api/paas/v4" + + +def test_openai_oauth_rejects_unsupported_codex_model() -> None: + with pytest.raises(ValueError, match="not supported with OpenAI OAuth"): + load_llm("openai_oauth", model="gpt-5.3-codex") + + +@pytest.mark.parametrize("model", ["claude-opus-4-8", "claude-opus-4-6"]) +def test_anthropic_opus_4_omits_default_temperature(model: str) -> None: + llm = load_llm( + "Anthropic", + model=model, + api_key="stub", + temperature=0.2, + ) + + kwargs = llm._get_all_kwargs() + + assert type(llm).__name__ == "MobilerunAnthropic" + assert kwargs["model"] == model + assert "temperature" not in kwargs + + +def test_anthropic_opus_4_keeps_explicit_additional_temperature() -> None: + llm = load_llm( + "Anthropic", + model="claude-opus-4-8", + api_key="stub", + temperature=0.2, + additional_kwargs={"temperature": 0.0}, + ) + + assert llm._get_all_kwargs()["temperature"] == 0.0 + + +def test_anthropic_opus_4_keeps_per_call_temperature() -> None: + llm = load_llm( + "Anthropic", + model="claude-opus-4-8", + api_key="stub", + temperature=0.2, + ) + + assert llm._get_all_kwargs(temperature=0.0)["temperature"] == 0.0 + + +def test_anthropic_sonnet_keeps_temperature() -> None: + llm = load_llm( + "Anthropic", + model="claude-sonnet-4-6", + api_key="stub", + temperature=0.2, + ) + + kwargs = llm._get_all_kwargs() + + assert kwargs["model"] == "claude-sonnet-4-6" + assert kwargs["temperature"] == 0.2 + + +@pytest.mark.parametrize( + "model", + [ + "claude-opus-4-8", + "claude-sonnet-4-6", + "claude-opus-4-6", + "claude-haiku-4-5", + ], +) +def test_anthropic_current_catalog_models_have_metadata(model: str) -> None: + llm = load_llm( + "Anthropic", + model=model, + api_key="stub", + ) + + metadata = llm.metadata + + assert metadata.model_name == model + assert metadata.context_window > 0 diff --git a/tests/test_provider_registry.py b/tests/test_provider_registry.py index 2f192f7d..71f15d61 100644 --- a/tests/test_provider_registry.py +++ b/tests/test_provider_registry.py @@ -1,6 +1,5 @@ from mobilerun.agent.providers.registry import ( list_models_for_variant, - normalize_model_id_for_variant, resolve_provider_variant, ) from mobilerun.config_manager.config_manager import LLMProfile, MobileConfig @@ -46,18 +45,13 @@ def test_anthropic_catalogs_include_opus_4_8_without_changing_defaults() -> None ) -def test_openai_oauth_catalog_uses_current_codex_model() -> None: +def test_openai_oauth_catalog_hides_unsupported_codex_model() -> None: variant = resolve_provider_variant("openai", "oauth") models = list_models_for_variant("openai", "oauth") assert variant.default_model == "gpt-5.5" - assert models == ("gpt-5.5", "gpt-5.4", "gpt-5.4-mini", "gpt-5.3-codex") - assert ( - normalize_model_id_for_variant( - "openai", "oauth", "openai-codex/gpt-5.3-codex" - ) - == "gpt-5.3-codex" - ) + assert models == ("gpt-5.5", "gpt-5.4", "gpt-5.4-mini") + assert "gpt-5.3-codex" not in models def test_openai_api_key_catalog_uses_current_default_model() -> None: diff --git a/tests/test_usage.py b/tests/test_usage.py index 363e3527..6e7e6bcd 100644 --- a/tests/test_usage.py +++ b/tests/test_usage.py @@ -62,3 +62,12 @@ def test_openai_responses_class_name_extracts_usage_from_response() -> None: assert usage.response_tokens == 2 assert usage.total_tokens == 5 assert usage.requests == 1 + + +def test_track_usage_supports_mobilerun_anthropic_wrapper() -> None: + llm = load_llm("Anthropic", model="claude-opus-4-8", api_key="stub") + + tracker = track_usage(llm) + + assert isinstance(tracker, TokenCountingHandler) + assert tracker.provider == "Anthropic" From c2837d5e4f07ccd7e256aab26b238cd7d97bd207 Mon Sep 17 00:00:00 2001 From: "rasul.osmanbayli" Date: Wed, 3 Jun 2026 21:32:45 +0400 Subject: [PATCH 4/4] fix: hide unsupported Gemini OAuth flash model --- mobilerun/agent/providers/registry.py | 1 - mobilerun/agent/utils/llm_picker.py | 12 ++++++++++++ tests/test_gemini_oauth_llm.py | 8 ++++---- tests/test_llm_picker.py | 5 +++++ tests/test_provider_registry.py | 27 +++++++++++++++++++-------- 5 files changed, 40 insertions(+), 13 deletions(-) diff --git a/mobilerun/agent/providers/registry.py b/mobilerun/agent/providers/registry.py index f0ab2309..98da85a4 100644 --- a/mobilerun/agent/providers/registry.py +++ b/mobilerun/agent/providers/registry.py @@ -46,7 +46,6 @@ auth_mode="oauth", default_model="gemini-3.1-pro-preview", models=( - "gemini-3.5-flash", "gemini-3-flash-preview", "gemini-3.1-pro-preview", "gemini-3.1-flash-lite", diff --git a/mobilerun/agent/utils/llm_picker.py b/mobilerun/agent/utils/llm_picker.py index 1b495a3e..f2596c74 100644 --- a/mobilerun/agent/utils/llm_picker.py +++ b/mobilerun/agent/utils/llm_picker.py @@ -39,6 +39,7 @@ } ZAI_GLOBAL_API_BASE = "https://api.z.ai/api/paas/v4" +GEMINI_OAUTH_UNSUPPORTED_MODELS = {"gemini-3.5-flash"} OPENAI_OAUTH_UNSUPPORTED_MODELS = {"gpt-5.3-codex"} OPENAI_RESPONSES_MODELS_WITHOUT_SAMPLING_PARAMS = {"gpt-5.5"} OPENAI_RESPONSES_UNSUPPORTED_SAMPLING_PARAMS = {"temperature", "top_p"} @@ -77,6 +78,16 @@ def _validate_openai_oauth_model(model: object) -> None: ) +def _validate_gemini_oauth_model(model: object) -> None: + model_id = str(model or "").strip() + if model_id in GEMINI_OAUTH_UNSUPPORTED_MODELS: + supported = "gemini-3-flash-preview, gemini-3.1-pro-preview, or gemini-3.1-flash-lite" + raise ValueError( + f"Model '{model_id}' is not supported with Gemini OAuth Code Assist " + f"credentials. Use {supported}." + ) + + def _load_openai_responses(**kwargs: Any) -> LLM: from llama_index.llms.openai.responses import OpenAIResponses @@ -174,6 +185,7 @@ def load_llm(provider_name: str, model: str | None = None, **kwargs: Any) -> LLM GeminiOAuthCodeAssistLLM, ) + _validate_gemini_oauth_model(kwargs.get("model")) return GeminiOAuthCodeAssistLLM( **{k: v for k, v in kwargs.items() if v is not None} ) diff --git a/tests/test_gemini_oauth_llm.py b/tests/test_gemini_oauth_llm.py index 4f116cb7..d5cdd553 100644 --- a/tests/test_gemini_oauth_llm.py +++ b/tests/test_gemini_oauth_llm.py @@ -8,7 +8,7 @@ from mobilerun.agent.utils.llm_picker import load_llm -def test_gemini_oauth_profile_sends_gemini_3_5_flash_verbatim(tmp_path) -> None: +def test_gemini_oauth_profile_sends_gemini_flash_lite_verbatim(tmp_path) -> None: variant = resolve_provider_variant("gemini", "oauth") profile = create_profile_for_variant( variant, @@ -16,7 +16,7 @@ def test_gemini_oauth_profile_sends_gemini_3_5_flash_verbatim(tmp_path) -> None: family_id="gemini", variant_id=variant.id, auth_mode="oauth", - model="gemini-3.5-flash", + model="gemini-3.1-flash-lite", credential_path=str(tmp_path / "missing-auth-profiles.json"), ), ) @@ -32,5 +32,5 @@ def test_gemini_oauth_profile_sends_gemini_3_5_flash_verbatim(tmp_path) -> None: ) assert profile.provider == "gemini_oauth_code_assist" - assert profile.model == "gemini-3.5-flash" - assert payload["model"] == "gemini-3.5-flash" + assert profile.model == "gemini-3.1-flash-lite" + assert payload["model"] == "gemini-3.1-flash-lite" diff --git a/tests/test_llm_picker.py b/tests/test_llm_picker.py index 887721b5..e87f1be3 100644 --- a/tests/test_llm_picker.py +++ b/tests/test_llm_picker.py @@ -83,6 +83,11 @@ def test_openai_oauth_rejects_unsupported_codex_model() -> None: load_llm("openai_oauth", model="gpt-5.3-codex") +def test_gemini_oauth_rejects_unsupported_flash_3_5_model() -> None: + with pytest.raises(ValueError, match="not supported with Gemini OAuth"): + load_llm("gemini_oauth_code_assist", model="gemini-3.5-flash") + + @pytest.mark.parametrize("model", ["claude-opus-4-8", "claude-opus-4-6"]) def test_anthropic_opus_4_omits_default_temperature(model: str) -> None: llm = load_llm( diff --git a/tests/test_provider_registry.py b/tests/test_provider_registry.py index 71f15d61..6c0c85fd 100644 --- a/tests/test_provider_registry.py +++ b/tests/test_provider_registry.py @@ -5,21 +5,32 @@ from mobilerun.config_manager.config_manager import LLMProfile, MobileConfig -def test_gemini_catalogs_use_current_flash_models() -> None: - expected_models = ( +def test_gemini_api_key_catalog_uses_current_flash_models() -> None: + variant = resolve_provider_variant("gemini", "api_key") + models = list_models_for_variant("gemini", "api_key") + + assert variant.default_model == "gemini-3.1-pro-preview" + assert models == ( "gemini-3.5-flash", "gemini-3-flash-preview", "gemini-3.1-pro-preview", "gemini-3.1-flash-lite", ) + assert "gemini-3.1-flash-lite-preview" not in models + - for auth_mode in ("api_key", "oauth"): - variant = resolve_provider_variant("gemini", auth_mode) - models = list_models_for_variant("gemini", auth_mode) +def test_gemini_oauth_catalog_hides_unsupported_flash_3_5_model() -> None: + variant = resolve_provider_variant("gemini", "oauth") + models = list_models_for_variant("gemini", "oauth") - assert variant.default_model == "gemini-3.1-pro-preview" - assert models == expected_models - assert "gemini-3.1-flash-lite-preview" not in models + assert variant.default_model == "gemini-3.1-pro-preview" + assert models == ( + "gemini-3-flash-preview", + "gemini-3.1-pro-preview", + "gemini-3.1-flash-lite", + ) + assert "gemini-3.5-flash" not in models + assert "gemini-3.1-flash-lite-preview" not in models def test_anthropic_catalogs_include_opus_4_8_without_changing_defaults() -> None: