From 519d816a03cb2df725b5797842ba56f56346eece Mon Sep 17 00:00:00 2001 From: "varun.pandey" Date: Fri, 8 May 2026 17:37:52 -0400 Subject: [PATCH 1/4] Support responses API for OpenAI models --- benchmark/llm_client.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py index 46d64e5..5be3cbf 100644 --- a/benchmark/llm_client.py +++ b/benchmark/llm_client.py @@ -35,6 +35,8 @@ def __init__( self.top_p = top_p self.effort = effort self.reasoning = reasoning + # Responses API only when an effort is set for OpenAI/AzureOpenAI endpoints. + self.use_responses_api = bool(effort) and self.provider in ("openai", "azureopenai") self.llm = None self._initialize_llm() @@ -65,11 +67,18 @@ def _initialize_llm(self): elif self.provider == "openai": from langchain_openai import ChatOpenAI + model_kwargs = {} + if self.top_p is not None: + model_kwargs["top_p"] = self.top_p + self.llm = ChatOpenAI( model=self.model, openai_api_key=self.api_key, temperature=self.temperature, max_tokens=self.max_tokens, + model_kwargs=model_kwargs, + use_responses_api=self.use_responses_api, + reasoning_effort=self.effort, ) elif self.provider == "google": from langchain_google_genai import ChatGoogleGenerativeAI @@ -97,8 +106,6 @@ def _initialize_llm(self): model_kwargs = {} if self.top_p is not None: model_kwargs["top_p"] = self.top_p - if self.effort is not None: - model_kwargs["reasoning_effort"] = self.effort self.llm = AzureChatOpenAI( azure_endpoint=self.custom_api_endpoint, @@ -107,7 +114,9 @@ def _initialize_llm(self): azure_deployment=self.model, temperature=self.temperature, max_completion_tokens=self.max_tokens, - model_kwargs=model_kwargs # In GPT-5.X this is a first class parameter, but passing this way is also allowed. 
+ model_kwargs=model_kwargs, + use_responses_api=self.use_responses_api, # Required for some reasoning model configurations + reasoning_effort=self.effort, ) elif self.provider == "vllm" or self.provider == "openrouter": from langchain_openai import ChatOpenAI @@ -132,6 +141,7 @@ def _initialize_llm(self): temperature=self.temperature, max_tokens=self.max_tokens, model_kwargs=model_kwargs, + request_timeout=600, ) elif self.provider == "qwq": from langchain_qwq import ChatQwQ @@ -200,6 +210,10 @@ def _convert_mcp_tools_to_langchain( "parameters": cleaned_schema, }, } + # Responses API defaults to strict mode, which forces the model to fill every + # optional param with hallucinated values. Explicit strict=False avoids this. + if self.provider in ("openai", "azureopenai") and self.use_responses_api: + tool_def["function"]["strict"] = False langchain_tools.append(tool_def) return langchain_tools @@ -314,7 +328,8 @@ async def invoke_with_tools( # Convert MCP tools to LangChain format langchain_tools = self._convert_mcp_tools_to_langchain(tools) - # Bind tools to LLM + # Bind tools to LLM (the strict=False flag for OpenAI providers is + # set on each tool dict in _convert_mcp_tools_to_langchain). 
llm_with_tools = self.llm.bind_tools(langchain_tools) llm_with_retry = llm_with_tools.with_retry( retry_if_exception_type=( From 43c13b29bfe642062cd125d9be61e3f4db486be3 Mon Sep 17 00:00:00 2001 From: "varun.pandey" Date: Fri, 8 May 2026 17:42:59 -0400 Subject: [PATCH 2/4] Remove timeout parameter --- benchmark/llm_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py index 5be3cbf..b393c00 100644 --- a/benchmark/llm_client.py +++ b/benchmark/llm_client.py @@ -141,7 +141,6 @@ def _initialize_llm(self): temperature=self.temperature, max_tokens=self.max_tokens, model_kwargs=model_kwargs, - request_timeout=600, ) elif self.provider == "qwq": from langchain_qwq import ChatQwQ From 3286340fa5f891d3c3d154b3b7e303dec5616cde Mon Sep 17 00:00:00 2001 From: "varun.pandey" Date: Sat, 30 May 2026 11:23:14 -0400 Subject: [PATCH 3/4] Get text content from response --- benchmark/llm_client.py | 19 +++++++++++++++++++ orchestrators/decomposing_planner.py | 4 ++-- orchestrators/planner_react.py | 4 ++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py index b393c00..001cad6 100644 --- a/benchmark/llm_client.py +++ b/benchmark/llm_client.py @@ -3,6 +3,25 @@ logger = logging.getLogger(__name__) +def get_text_content(content) -> str: + """Extract plain text from LLM response content. + + Handles both plain strings and lists returned by models with thinking/reasoning + blocks (e.g. Claude extended thinking on Bedrock returns a list of content blocks). 
+ Every text block in a list (and any bare string item) is concatenated in order, + so a multi-block answer is not truncated to its first text block. + """ + + if isinstance(content, list): + parts = [] + for block in content: + if isinstance(block, str): + parts.append(block) + elif isinstance(block, dict) and block.get("type") == "text": + text = block.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return content or "" class LLMClient: """ diff --git a/orchestrators/decomposing_planner.py b/orchestrators/decomposing_planner.py index b9dce7f..f5a0355 100644 --- a/orchestrators/decomposing_planner.py +++ b/orchestrators/decomposing_planner.py @@ -17,7 +17,7 @@ from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage -from benchmark.llm_client import LLMClient +from benchmark.llm_client import LLMClient, get_text_content from .base import AgentOrchestrator logger = logging.getLogger(__name__) @@ -323,7 +323,7 @@ async def generate_plan_and_subtasks( f"{usage['input_tokens'] + usage['output_tokens']} tokens" ) - content = extract_json_from_llm_response(response.content) + content = extract_json_from_llm_response(get_text_content(response.content)) logger.debug(f"Extracted JSON content (attempt {attempt + 1}):\n{content[:500]}") parsed = json.loads(content) diff --git a/orchestrators/planner_react.py b/orchestrators/planner_react.py index bb369a8..48fffeb 100644 --- a/orchestrators/planner_react.py +++ b/orchestrators/planner_react.py @@ -11,7 +11,7 @@ from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage -from benchmark.llm_client import LLMClient +from benchmark.llm_client import LLMClient, get_text_content from .base import AgentOrchestrator logger = logging.getLogger(__name__) @@ -101,7 +101,7 @@ async def generate_plan( logger.info("🧠 Meta Agent: Generating execution plan...") response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)]) - plan = response.content + plan = get_text_content(response.content) logger.info(f"✅ Plan generated ({len(plan)} characters)") logger.debug(f"Generated plan:\n{plan}") From 97a6ed78af71206e1d59eecbfec44317897632fa Mon Sep 17 00:00:00 2001 From: "varun.pandey" Date: Sat, 30 May 2026 11:35:56 -0400 
Subject: [PATCH 4/4] Get text content from response for React --- orchestrators/react.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orchestrators/react.py b/orchestrators/react.py index c81e053..fee5e3e 100644 --- a/orchestrators/react.py +++ b/orchestrators/react.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, TYPE_CHECKING from benchmark.mcp_client import MCPClient -from benchmark.llm_client import LLMClient +from benchmark.llm_client import LLMClient, get_text_content from benchmark.models import BenchmarkConfig from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage @@ -58,7 +58,7 @@ async def execute(self) -> Dict[str, Any]: conversation_flow.append( { "type": "ai_message", - "content": response.content, + "content": get_text_content(response.content), "usage_metadata": usage_metadata, "response_metadata": response_metadata, "tool_calls": [ @@ -68,7 +68,7 @@ async def execute(self) -> Dict[str, Any]: } ) - logger.info(f"LLM Response: {response.content}") + logger.info(f"LLM Response: {get_text_content(response.content)}") # Terminate if the LLM decided no further tool calls are needed if not response.tool_calls or len(response.tool_calls) == 0: