From 519d816a03cb2df725b5797842ba56f56346eece Mon Sep 17 00:00:00 2001
From: "varun.pandey" <varun.pandey@servicenow.com>
Date: Fri, 8 May 2026 17:37:52 -0400
Subject: [PATCH 1/4] Support responses API for OpenAI models

---
 benchmark/llm_client.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py
index 46d64e5..5be3cbf 100644
--- a/benchmark/llm_client.py
+++ b/benchmark/llm_client.py
@@ -35,6 +35,8 @@ def __init__(
         self.top_p = top_p
         self.effort = effort
         self.reasoning = reasoning
+        # Responses API only when an effort is set for OpenAI/AzureOpenAI endpoints.
+        self.use_responses_api = bool(effort) and self.provider in ("openai", "azureopenai")
         self.llm = None
 
         self._initialize_llm()
@@ -65,11 +67,18 @@ def _initialize_llm(self):
             elif self.provider == "openai":
                 from langchain_openai import ChatOpenAI
 
+                model_kwargs = {}
+                if self.top_p is not None:
+                    model_kwargs["top_p"] = self.top_p
+
                 self.llm = ChatOpenAI(
                     model=self.model,
                     openai_api_key=self.api_key,
                     temperature=self.temperature,
                     max_tokens=self.max_tokens,
+                    model_kwargs=model_kwargs,
+                    use_responses_api=self.use_responses_api,
+                    reasoning_effort=self.effort,
                 )
             elif self.provider == "google":
                 from langchain_google_genai import ChatGoogleGenerativeAI
@@ -97,8 +106,6 @@ def _initialize_llm(self):
                 model_kwargs = {}
                 if self.top_p is not None:
                     model_kwargs["top_p"] = self.top_p
-                if self.effort is not None:
-                    model_kwargs["reasoning_effort"] = self.effort
 
                 self.llm = AzureChatOpenAI(
                     azure_endpoint=self.custom_api_endpoint,
@@ -107,7 +114,9 @@ def _initialize_llm(self):
                     azure_deployment=self.model,
                     temperature=self.temperature,
                     max_completion_tokens=self.max_tokens,
-                    model_kwargs=model_kwargs # In GPT-5.X this is a first class parameter, but passing this way is also allowed.
+                    model_kwargs=model_kwargs,
+                    use_responses_api=self.use_responses_api, # Required for some reasoning model configurations
+                    reasoning_effort=self.effort,
                 )
             elif self.provider == "vllm" or self.provider == "openrouter":
                 from langchain_openai import ChatOpenAI
@@ -132,6 +141,7 @@ def _initialize_llm(self):
                     temperature=self.temperature,
                     max_tokens=self.max_tokens,
                     model_kwargs=model_kwargs,
+                    request_timeout=600,
                 )
             elif self.provider == "qwq":
                 from langchain_qwq import ChatQwQ
@@ -200,6 +210,10 @@ def _convert_mcp_tools_to_langchain(
                     "parameters": cleaned_schema,
                 },
             }
+            # Responses API defaults to strict mode, which forces the model to fill every
+            # optional param with hallucinated values. Explicit strict=False avoids this.
+            if self.provider in ("openai", "azureopenai") and self.use_responses_api:
+                tool_def["function"]["strict"] = False
             langchain_tools.append(tool_def)
 
         return langchain_tools
@@ -314,7 +328,8 @@ async def invoke_with_tools(
         # Convert MCP tools to LangChain format
         langchain_tools = self._convert_mcp_tools_to_langchain(tools)
 
-        # Bind tools to LLM
+        # Bind tools to LLM (the strict=False flag for OpenAI providers is
+        # set on each tool dict in _convert_mcp_tools_to_langchain).
         llm_with_tools = self.llm.bind_tools(langchain_tools)
         llm_with_retry = llm_with_tools.with_retry(
             retry_if_exception_type=(

From 43c13b29bfe642062cd125d9be61e3f4db486be3 Mon Sep 17 00:00:00 2001
From: "varun.pandey" <varun.pandey@servicenow.com>
Date: Fri, 8 May 2026 17:42:59 -0400
Subject: [PATCH 2/4] Remove timeout parameter

---
 benchmark/llm_client.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py
index 5be3cbf..b393c00 100644
--- a/benchmark/llm_client.py
+++ b/benchmark/llm_client.py
@@ -141,7 +141,6 @@ def _initialize_llm(self):
                     temperature=self.temperature,
                     max_tokens=self.max_tokens,
                     model_kwargs=model_kwargs,
-                    request_timeout=600,
                 )
             elif self.provider == "qwq":
                 from langchain_qwq import ChatQwQ

From 3286340fa5f891d3c3d154b3b7e303dec5616cde Mon Sep 17 00:00:00 2001
From: "varun.pandey" <varun.pandey@servicenow.com>
Date: Sat, 30 May 2026 11:23:14 -0400
Subject: [PATCH 3/4] Get text content from response

---
 benchmark/llm_client.py              | 13 +++++++++++++
 orchestrators/decomposing_planner.py |  4 ++--
 orchestrators/planner_react.py       |  4 ++--
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py
index b393c00..001cad6 100644
--- a/benchmark/llm_client.py
+++ b/benchmark/llm_client.py
@@ -3,6 +3,19 @@
 
 logger = logging.getLogger(__name__)
 
+def get_text_content(content) -> str:
+    """Return the plain text carried by an LLM response's ``content``.
+
+    ``content`` is either a plain string or a list of str/dict content blocks, as
+    returned by models with thinking/reasoning output. Every text part is joined
+    in order (not just the first); non-text blocks are skipped; None yields "".
+    """
+    if not isinstance(content, list):
+        return content or ""
+    return "".join(
+        block if isinstance(block, str) else block.get("text") or ""
+        for block in content
+        if isinstance(block, str) or (isinstance(block, dict) and block.get("type") == "text")
+    )
 
 class LLMClient:
     """
diff --git a/orchestrators/decomposing_planner.py b/orchestrators/decomposing_planner.py
index b9dce7f..f5a0355 100644
--- a/orchestrators/decomposing_planner.py
+++ b/orchestrators/decomposing_planner.py
@@ -17,7 +17,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from .base import AgentOrchestrator
 
 logger = logging.getLogger(__name__)
@@ -323,7 +323,7 @@ async def generate_plan_and_subtasks(
                     f"{usage['input_tokens'] + usage['output_tokens']} tokens"
                 )
 
-                content = extract_json_from_llm_response(response.content)
+                content = extract_json_from_llm_response(get_text_content(response.content))
                 logger.debug(f"Extracted JSON content (attempt {attempt + 1}):\n{content[:500]}")
 
                 parsed = json.loads(content)
diff --git a/orchestrators/planner_react.py b/orchestrators/planner_react.py
index bb369a8..48fffeb 100644
--- a/orchestrators/planner_react.py
+++ b/orchestrators/planner_react.py
@@ -11,7 +11,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from .base import AgentOrchestrator
 
 logger = logging.getLogger(__name__)
@@ -101,7 +101,7 @@ async def generate_plan(
 
         logger.info("🧠 Meta Agent: Generating execution plan...")
         response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
-        plan = response.content
+        plan = get_text_content(response.content)
 
         logger.info(f"✅ Plan generated ({len(plan)} characters)")
         logger.debug(f"Generated plan:\n{plan}")

From 97a6ed78af71206e1d59eecbfec44317897632fa Mon Sep 17 00:00:00 2001
From: "varun.pandey" <varun.pandey@servicenow.com>
Date: Sat, 30 May 2026 11:35:56 -0400
Subject: [PATCH 4/4] Get text content from response for React

---
 orchestrators/react.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/orchestrators/react.py b/orchestrators/react.py
index c81e053..fee5e3e 100644
--- a/orchestrators/react.py
+++ b/orchestrators/react.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, List, TYPE_CHECKING
 
 from benchmark.mcp_client import MCPClient
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from benchmark.models import BenchmarkConfig
 from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage
 
@@ -58,7 +58,7 @@ async def execute(self) -> Dict[str, Any]:
             conversation_flow.append(
                 {
                     "type": "ai_message",
-                    "content": response.content,
+                    "content": get_text_content(response.content),
                     "usage_metadata": usage_metadata,
                     "response_metadata": response_metadata,
                     "tool_calls": [
@@ -68,7 +68,7 @@ async def execute(self) -> Dict[str, Any]:
                 }
             )
 
-            logger.info(f"LLM Response: {response.content}")
+            logger.info(f"LLM Response: {get_text_content(response.content)}")
 
             # Terminate if the LLM decided no further tool calls are needed
             if not response.tool_calls or len(response.tool_calls) == 0: