Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 31 additions & 4 deletions benchmark/llm_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@

logger = logging.getLogger(__name__)

def get_text_content(content) -> str:
    """Extract plain text from LLM response content.

    Handles both plain strings and lists returned by models with
    thinking/reasoning blocks (e.g. Claude extended thinking on Bedrock
    returns a list of content blocks).

    Args:
        content: The ``content`` of an LLM response: a string, ``None``, or a
            list whose items are plain strings and/or content-block dicts.

    Returns:
        For a list, the concatenation (in order) of every plain-string item
        and every ``{"type": "text"}`` block's ``text`` value; non-text blocks
        such as thinking/reasoning blocks are skipped, and an empty string is
        returned when no text is present. For any other input, ``content``
        itself, or ``""`` when it is empty or ``None``.
    """
    if not isinstance(content, list):
        return content or ""

    parts = []
    for block in content:
        if isinstance(block, str):
            # List content may mix bare strings with block dicts.
            parts.append(block)
        elif isinstance(block, dict) and block.get("type") == "text":
            text = block.get("text")
            # Skip a missing/null/non-string "text" so the result stays a str.
            if isinstance(text, str):
                parts.append(text)
    # Join every text part rather than returning only the first one, so
    # responses split across several text blocks are not truncated.
    return "".join(parts)

class LLMClient:
"""
Expand Down Expand Up @@ -35,6 +48,8 @@ def __init__(
self.top_p = top_p
self.effort = effort
self.reasoning = reasoning
# Responses API only when an effort is set for OpenAI/AzureOpenAI endpoints.
self.use_responses_api = bool(effort) and self.provider in ("openai", "azureopenai")
self.llm = None

self._initialize_llm()
Expand Down Expand Up @@ -65,11 +80,18 @@ def _initialize_llm(self):
elif self.provider == "openai":
from langchain_openai import ChatOpenAI

model_kwargs = {}
if self.top_p is not None:
model_kwargs["top_p"] = self.top_p

self.llm = ChatOpenAI(
model=self.model,
openai_api_key=self.api_key,
temperature=self.temperature,
max_tokens=self.max_tokens,
model_kwargs=model_kwargs,
use_responses_api=self.use_responses_api,
reasoning_effort=self.effort,
)
elif self.provider == "google":
from langchain_google_genai import ChatGoogleGenerativeAI
Expand Down Expand Up @@ -97,8 +119,6 @@ def _initialize_llm(self):
model_kwargs = {}
if self.top_p is not None:
model_kwargs["top_p"] = self.top_p
if self.effort is not None:
model_kwargs["reasoning_effort"] = self.effort

self.llm = AzureChatOpenAI(
azure_endpoint=self.custom_api_endpoint,
Expand All @@ -107,7 +127,9 @@ def _initialize_llm(self):
azure_deployment=self.model,
temperature=self.temperature,
max_completion_tokens=self.max_tokens,
model_kwargs=model_kwargs # In GPT-5.X this is a first class parameter, but passing this way is also allowed.
model_kwargs=model_kwargs,
use_responses_api=self.use_responses_api, # Required for some reasoning model configurations
reasoning_effort=self.effort,
)
elif self.provider == "vllm" or self.provider == "openrouter":
from langchain_openai import ChatOpenAI
Expand Down Expand Up @@ -200,6 +222,10 @@ def _convert_mcp_tools_to_langchain(
"parameters": cleaned_schema,
},
}
# Responses API defaults to strict mode, which forces the model to fill every
# optional param with hallucinated values. Explicit strict=False avoids this.
if self.provider in ("openai", "azureopenai") and self.use_responses_api:
tool_def["function"]["strict"] = False
langchain_tools.append(tool_def)

return langchain_tools
Expand Down Expand Up @@ -314,7 +340,8 @@ async def invoke_with_tools(
# Convert MCP tools to LangChain format
langchain_tools = self._convert_mcp_tools_to_langchain(tools)

# Bind tools to LLM
# Bind tools to LLM (the strict=False flag for OpenAI providers is
# set on each tool dict in _convert_mcp_tools_to_langchain).
llm_with_tools = self.llm.bind_tools(langchain_tools)
llm_with_retry = llm_with_tools.with_retry(
retry_if_exception_type=(
Expand Down
4 changes: 2 additions & 2 deletions orchestrators/decomposing_planner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage

from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from .base import AgentOrchestrator

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -323,7 +323,7 @@ async def generate_plan_and_subtasks(
f"{usage['input_tokens'] + usage['output_tokens']} tokens"
)

content = extract_json_from_llm_response(response.content)
content = extract_json_from_llm_response(get_text_content(response.content))
logger.debug(f"Extracted JSON content (attempt {attempt + 1}):\n{content[:500]}")

parsed = json.loads(content)
Expand Down
4 changes: 2 additions & 2 deletions orchestrators/planner_react.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage

from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from .base import AgentOrchestrator

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -101,7 +101,7 @@ async def generate_plan(

logger.info("🧠 Meta Agent: Generating execution plan...")
response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
plan = response.content
plan = get_text_content(response.content)

logger.info(f"✅ Plan generated ({len(plan)} characters)")
logger.debug(f"Generated plan:\n{plan}")
Expand Down
6 changes: 3 additions & 3 deletions orchestrators/react.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Any, Dict, List, TYPE_CHECKING

from benchmark.mcp_client import MCPClient
from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from benchmark.models import BenchmarkConfig
from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage

Expand Down Expand Up @@ -58,7 +58,7 @@ async def execute(self) -> Dict[str, Any]:
conversation_flow.append(
{
"type": "ai_message",
"content": response.content,
"content": get_text_content(response.content),
"usage_metadata": usage_metadata,
"response_metadata": response_metadata,
"tool_calls": [
Expand All @@ -68,7 +68,7 @@ async def execute(self) -> Dict[str, Any]:
}
)

logger.info(f"LLM Response: {response.content}")
logger.info(f"LLM Response: {get_text_content(response.content)}")

# Terminate if the LLM decided no further tool calls are needed
if not response.tool_calls or len(response.tool_calls) == 0:
Expand Down