ServiceNow · varunpandey23 · May 8, 2026 · May 8, 2026 · May 30, 2026 · May 30, 2026
diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py
@@ -3,6 +3,19 @@
 
 logger = logging.getLogger(__name__)
 
+def get_text_content(content) -> str:
+    """Return the plain text of LLM response content.
+
+    Accepts a string, None, or a list of content blocks (e.g. Claude extended
+    thinking on Bedrock); all text blocks are joined, other block types are skipped.
+    """
+    if not isinstance(content, list):
+        return content or ""
+    return "".join(
+        block if isinstance(block, str) else (block.get("text") or "")
+        for block in content
+        if isinstance(block, str) or (isinstance(block, dict) and block.get("type") == "text")
+    )
 
 class LLMClient:
     """
@@ -35,6 +48,8 @@ def __init__(
         self.top_p = top_p
         self.effort = effort
         self.reasoning = reasoning
+        # Use the Responses API only when a reasoning effort is set and the provider is OpenAI or Azure OpenAI.
+        self.use_responses_api = bool(effort) and self.provider in ("openai", "azureopenai")
         self.llm = None
 
         self._initialize_llm()
@@ -65,11 +80,18 @@ def _initialize_llm(self):
             elif self.provider == "openai":
                 from langchain_openai import ChatOpenAI
 
+                model_kwargs = {}
+                if self.top_p is not None:
+                    model_kwargs["top_p"] = self.top_p
+
                 self.llm = ChatOpenAI(
                     model=self.model,
                     openai_api_key=self.api_key,
                     temperature=self.temperature,
                     max_tokens=self.max_tokens,
+                    model_kwargs=model_kwargs,
+                    use_responses_api=self.use_responses_api,
+                    reasoning_effort=self.effort,
                 )
             elif self.provider == "google":
                 from langchain_google_genai import ChatGoogleGenerativeAI
@@ -97,8 +119,6 @@ def _initialize_llm(self):
                 model_kwargs = {}
                 if self.top_p is not None:
                     model_kwargs["top_p"] = self.top_p
-                if self.effort is not None:
-                    model_kwargs["reasoning_effort"] = self.effort
 
                 self.llm = AzureChatOpenAI(
                     azure_endpoint=self.custom_api_endpoint,
@@ -107,7 +127,9 @@ def _initialize_llm(self):
                     azure_deployment=self.model,
                     temperature=self.temperature,
                     max_completion_tokens=self.max_tokens,
-                    model_kwargs=model_kwargs # In GPT-5.X this is a first class parameter, but passing this way is also allowed.
+                    model_kwargs=model_kwargs,
+                    use_responses_api=self.use_responses_api, # Required for some reasoning model configurations
+                    reasoning_effort=self.effort,
                 )
             elif self.provider == "vllm" or self.provider == "openrouter":
                 from langchain_openai import ChatOpenAI
@@ -200,6 +222,10 @@ def _convert_mcp_tools_to_langchain(
                     "parameters": cleaned_schema,
                 },
             }
+            # Responses API defaults to strict mode, which forces the model to fill every
+            # optional param with hallucinated values. Explicit strict=False avoids this.
+            if self.provider in ("openai", "azureopenai") and self.use_responses_api:
+                tool_def["function"]["strict"] = False
             langchain_tools.append(tool_def)
 
         return langchain_tools
@@ -314,7 +340,8 @@ async def invoke_with_tools(
         # Convert MCP tools to LangChain format
         langchain_tools = self._convert_mcp_tools_to_langchain(tools)
 
-        # Bind tools to LLM
+        # Bind tools to LLM (the strict=False flag for OpenAI providers is
+        # set on each tool dict in _convert_mcp_tools_to_langchain).
         llm_with_tools = self.llm.bind_tools(langchain_tools)
         llm_with_retry = llm_with_tools.with_retry(
             retry_if_exception_type=(

diff --git a/orchestrators/decomposing_planner.py b/orchestrators/decomposing_planner.py
@@ -17,7 +17,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from .base import AgentOrchestrator
 
 logger = logging.getLogger(__name__)
@@ -323,7 +323,7 @@ async def generate_plan_and_subtasks(
                     f"{usage['input_tokens'] + usage['output_tokens']} tokens"
                 )
 
-                content = extract_json_from_llm_response(response.content)
+                content = extract_json_from_llm_response(get_text_content(response.content))
                 logger.debug(f"Extracted JSON content (attempt {attempt + 1}):\n{content[:500]}")
 
                 parsed = json.loads(content)

diff --git a/orchestrators/planner_react.py b/orchestrators/planner_react.py
@@ -11,7 +11,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from .base import AgentOrchestrator
 
 logger = logging.getLogger(__name__)
@@ -101,7 +101,7 @@ async def generate_plan(
 
         logger.info("🧠 Meta Agent: Generating execution plan...")
         response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
-        plan = response.content
+        plan = get_text_content(response.content)
 
         logger.info(f"✅ Plan generated ({len(plan)} characters)")
         logger.debug(f"Generated plan:\n{plan}")

diff --git a/orchestrators/react.py b/orchestrators/react.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, List, TYPE_CHECKING
 
 from benchmark.mcp_client import MCPClient
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from benchmark.models import BenchmarkConfig
 from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage
 
@@ -58,7 +58,7 @@ async def execute(self) -> Dict[str, Any]:
             conversation_flow.append(
                 {
                     "type": "ai_message",
-                    "content": response.content,
+                    "content": get_text_content(response.content),
                     "usage_metadata": usage_metadata,
                     "response_metadata": response_metadata,
                     "tool_calls": [
@@ -68,7 +68,7 @@ async def execute(self) -> Dict[str, Any]:
                 }
             )
 
-            logger.info(f"LLM Response: {response.content}")
+            logger.info(f"LLM Response: {get_text_content(response.content)}")
 
             # Terminate if the LLM decided no further tool calls are needed
             if not response.tool_calls or len(response.tool_calls) == 0: