From 086d203bcd02fa5a8427ddf3087b67d18435fcae Mon Sep 17 00:00:00 2001 From: Torkian Date: Mon, 18 May 2026 17:45:58 -0700 Subject: [PATCH 1/6] Force clarifier to search before asking clarification questions Closes #234. The clarifier agent had tool-calling bound to a search loop, but weak or non-frontier models routinely skipped the search and asked the user a clarification question immediately. The expected behavior is to gather context with the available tools first and only ask once a genuine ambiguity remains. This is fixed at the graph level so it works model-agnostically rather than relying on prompt engineering for any one model: - New force_search node in the LangGraph workflow. When the LLM tries to ask for clarification on the first turn, tools are configured, and no tool call has happened yet, decide_route sends the run through force_search before ask_for_clarification. - Guidance is injected ephemerally in agent_node (same pattern as the existing JSON_REMINDER_AFTER_TOOLS) rather than written into state.messages, so it cannot leak into helpers like get_latest_user_query that surface text back to the user. - A force_search_used boolean on ClarifierAgentState makes the nudge fire at most once per run, so a model that ignores the nudge falls through to ask_for_clarification instead of looping. - prompts/research_clarification.j2 is updated so the Tool Usage section instructs the model to "Search first, ask second" for any unfamiliar entity, acronym, or term. Frontier models follow this voluntarily and skip the structural nudge entirely. End-to-end reproducer added at scripts/e2e_clarifier_search_test.py. Against nvidia/nemotron-3-nano-30b-a3b (the project's default clarifier model) with a real Tavily web search tool: develop: 0 tool calls, user prompted immediately (bug) fix: 1 tool call, user never prompted (fixed) The mock-based unit suite covers the new helper, the four routing branches, and the regression flagged during review where the guidance message could otherwise leak into the invalid-format fallback text. All 148 clarifier tests pass. Signed-off-by: Torkian --- scripts/e2e_clarifier_search_test.py | 133 ++++++++++ src/aiq_agent/agents/clarifier/agent.py | 82 +++++- .../agents/clarifier/models/state.py | 4 + .../prompts/research_clarification.j2 | 8 +- .../agents/clarifier/models/test_state.py | 10 + .../aiq_agent/agents/clarifier/test_agent.py | 249 ++++++++++++++++++ 6 files changed, 480 insertions(+), 6 deletions(-) create mode 100644 scripts/e2e_clarifier_search_test.py diff --git a/scripts/e2e_clarifier_search_test.py b/scripts/e2e_clarifier_search_test.py new file mode 100644 index 00000000..93c9e808 --- /dev/null +++ b/scripts/e2e_clarifier_search_test.py @@ -0,0 +1,133 @@ +"""End-to-end test for issue #234 fix. + +Runs the ClarifierAgent against a real NVIDIA NIM-hosted LLM with a real Tavily +web search tool, using an obscure query the LLM cannot possibly know from +training data. Verifies that the agent issues a search call before falling back +to asking the user for clarification. + +Run with: + PYTHONPATH=src .venv/bin/python scripts/e2e_clarifier_search_test.py +""" + +from __future__ import annotations + +import asyncio +import os +import sys +from pathlib import Path + +# Load deploy/.env if present. +ENV_FILE = Path(__file__).resolve().parents[1] / "deploy" / ".env" +if ENV_FILE.exists(): + for line in ENV_FILE.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip()) + +if not os.environ.get("NVIDIA_API_KEY"): + sys.exit("NVIDIA_API_KEY not set (paste it into deploy/.env)") +if not os.environ.get("TAVILY_API_KEY"): + sys.exit("TAVILY_API_KEY not set (paste it into deploy/.env)") + +from langchain_community.tools.tavily_search import TavilySearchResults # noqa: E402 +from langchain_core.messages import HumanMessage, ToolMessage # noqa: E402 +from langchain_nvidia_ai_endpoints import ChatNVIDIA # noqa: E402 + +from aiq_agent.agents.clarifier.agent import ClarifierAgent # noqa: E402 +from aiq_agent.agents.clarifier.models import ClarifierAgentState # noqa: E402 +from aiq_agent.common import LLMProvider # noqa: E402 + +try: + from aiq_agent.agents.clarifier.agent import FORCE_SEARCH_GUIDANCE # noqa: E402 +except ImportError: + FORCE_SEARCH_GUIDANCE = None # running against the pre-fix branch + +# Test queries: +# - "obscure": something no LLM can know without searching. +# - "knowable": something the LLM thinks it knows; in the failure mode it +# skips the search and just asks "which aspect?". +TEST_QUERIES = { + "obscure": "Tell me about Project Zphyr-77Q at QXR Industries — what does it do and who works on it?", + "knowable": "Research the latest NVIDIA AI announcements from 2026", +} +OBSCURE_QUERY = TEST_QUERIES[os.environ.get("AIQ_TEST_QUERY", "knowable")] + +# Model defaults to a small open model so the test is cheap to run. +MODEL_NAME = os.environ.get("AIQ_CLARIFIER_TEST_MODEL", "meta/llama-3.3-70b-instruct") + + +async def main() -> int: + llm = ChatNVIDIA(model=MODEL_NAME, temperature=0) + search = TavilySearchResults(max_results=3, name="web_search_tool") + + provider = LLMProvider() + provider.set_default(llm) + + # The user_prompt_callback should never be invoked if the search-before-ask + # behavior is working. If it is invoked, we record it so the test can fail + # loudly. + user_prompts: list[str] = [] + + async def user_prompt_callback(question: str) -> str: + user_prompts.append(question) + # If reached, pretend the user said "skip" so the run still terminates. + return "skip" + + agent = ClarifierAgent( + llm_provider=provider, + tools=[search], + user_prompt_callback=user_prompt_callback, + max_turns=2, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content=OBSCURE_QUERY)]) + print(f"Model: {MODEL_NAME}") + print(f"Query: {OBSCURE_QUERY}\n") + + final_state = await agent.graph.ainvoke(state, config={"callbacks": []}) + + # Pull the final messages out for inspection. + if isinstance(final_state, dict): + messages = final_state["messages"] + force_search_used = final_state.get("force_search_used", False) + else: + messages = final_state.messages + force_search_used = getattr(final_state, "force_search_used", False) + + tool_calls_made = sum( + len(getattr(m, "tool_calls", None) or []) for m in messages + ) + tool_results = sum(1 for m in messages if isinstance(m, ToolMessage)) + forced = bool(force_search_used) + + print("=" * 72) + print("Conversation trace") + print("=" * 72) + for i, m in enumerate(messages): + kind = type(m).__name__ + content = str(getattr(m, "content", "")).replace("\n", " ")[:160] + extra = "" + if getattr(m, "tool_calls", None): + extra = f" tool_calls={[(tc.get('name'), tc.get('args')) for tc in m.tool_calls]}" + print(f"{i:>2}. {kind}: {content}{extra}") + + print() + print("=" * 72) + print("Verdict") + print("=" * 72) + print(f" user_prompt_callback invocations : {len(user_prompts)}") + print(f" total tool_calls emitted by LLM : {tool_calls_made}") + print(f" total ToolMessages (results) : {tool_results}") + print(f" force_search guidance injected : {forced}") + + if tool_calls_made >= 1: + print("\nPASS: the LLM issued at least one search before any user prompt.") + return 0 + print("\nFAIL: the LLM never issued a search.") + return 1 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/src/aiq_agent/agents/clarifier/agent.py b/src/aiq_agent/agents/clarifier/agent.py index 5779e5f0..76c98a37 100644 --- a/src/aiq_agent/agents/clarifier/agent.py +++ b/src/aiq_agent/agents/clarifier/agent.py @@ -104,6 +104,15 @@ ) """Reminder prompt added after tool results to reinforce JSON-only output.""" +FORCE_SEARCH_GUIDANCE = ( + "You attempted to ask the user for clarification before gathering any context. " + "Before asking the user a question, you MUST first use the available search tools " + "to look up unfamiliar entities, acronyms, products, or terms in their request. " + "Issue one focused tool call now with a query derived from the user's request. " + "Only after reviewing the tool results should you decide whether clarification is still needed." +) +"""Guidance prompt injected when the LLM tries to clarify without having searched first.""" + class ClarifierAgent: """ @@ -484,6 +493,24 @@ def _get_fallback_clarification(self, query: str | None = None) -> str: SKIP_COMMANDS = {"skip", "done", "exit", "quit", "proceed", "continue", "no", "n", ""} """Set of commands that indicate the user wants to skip clarification.""" + @staticmethod + def _has_tool_invocations(messages: Sequence[Any]) -> bool: + """ + Check whether any prior assistant message in the conversation issued tool calls. + + Args: + messages: The conversation message history. + + Returns: + True if any AIMessage in the history carries non-empty tool_calls, + False otherwise. + """ + for msg in messages: + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + return True + return False + def _is_skip_command(self, user_reply: str) -> bool: """ Check if the user's reply indicates they want to skip clarification. @@ -503,16 +530,22 @@ def _build_graph(self) -> CompiledStateGraph: """ Build the LangGraph StateGraph for the clarification workflow. - Creates a graph with three nodes: + Creates a graph with the following nodes: - agent: Generates clarification questions using the LLM - tools: Executes tool calls (e.g., web search) for context - ask_for_clarification: Prompts user and processes response + - force_search: Nudges the LLM to attempt a search before its first + clarification question when tools are configured but unused (fires + at most once per run) + - plan_preview: Optional plan approval flow The graph flow: 1. agent generates a response (question, tool call, or completion) 2. If tool call → tools node → back to agent - 3. If question → ask_for_clarification → back to agent - 4. If complete → end + 3. If complete → end (or plan_preview if enabled) + 4. If question, tools exist, and no search has happened yet on the + first turn → force_search injects guidance and routes back to agent + 5. Otherwise → ask_for_clarification → back to agent Returns: Compiled LangGraph StateGraph ready for execution. @@ -547,6 +580,13 @@ async def agent_node(state: ClarifierAgentState): logger.info("Adding JSON reminder after tool results") messages.append(HumanMessage(content=JSON_REMINDER_AFTER_TOOLS)) + # If the force_search nudge was raised but no tool call has happened + # yet, inject the guidance ephemerally — never stored in state.messages, + # so it cannot leak into get_latest_user_query and similar lookups. + if state.force_search_used and not self._has_tool_invocations(state.messages): + logger.info("Injecting force-search guidance for this turn") + messages.append(HumanMessage(content=FORCE_SEARCH_GUIDANCE)) + response = await bound_llm.ainvoke(messages) return {"messages": [response]} @@ -592,8 +632,12 @@ async def ask_clarification(state: ClarifierAgentState): def decide_route(state: ClarifierAgentState | dict): if isinstance(state, dict): messages = state.get("messages", []) + iteration = state.get("iteration", 0) + force_search_used = state.get("force_search_used", False) elif hasattr(state, "messages"): messages = state.messages + iteration = state.iteration + force_search_used = state.force_search_used else: msg = f"No messages found in input state to tool_edge: {state}" raise ValueError(msg) @@ -610,8 +654,37 @@ def decide_route(state: ClarifierAgentState | dict): if self.enable_plan_approval: return "plan_preview" return "__end__" + + # Force a search before the first clarification question when tools are + # configured but the agent has not yet issued any tool call. This keeps + # the behavior model-agnostic: even models that would otherwise skip + # tool use must attempt at least one search before falling back to + # asking the user for clarification. We only nudge once per run (guarded + # by ``force_search_used``) so a stubborn model cannot get stuck in a + # loop — after the nudge, normal clarification routing resumes. + if ( + self.tools + and iteration == 0 + and not force_search_used + and not self._has_tool_invocations(messages[:-1]) + ): + logger.info("Clarifier: forcing search before first clarification question") + return "force_search" + return "ask_for_clarification" + async def force_search_node(state: ClarifierAgentState): + """ + Flip the force_search_used flag so ``agent_node`` will inject the + search-first guidance message ephemerally on the next LLM call. + + The guidance is intentionally NOT appended to ``state.messages`` so + it cannot be picked up by helpers like ``get_latest_user_query`` + (which would otherwise surface internal scaffolding back to the + user in fallback text). + """ + return {"force_search_used": True} + async def plan_preview_node(state: ClarifierAgentState): """Generate plan preview and handle approval/feedback loop.""" clarifier_log = state.clarifier_log @@ -681,6 +754,7 @@ async def plan_preview_node(state: ClarifierAgentState): graph.add_node("agent", agent_node) graph.add_node("tools", ToolNode(self.tools)) graph.add_node("ask_for_clarification", ask_clarification) + graph.add_node("force_search", force_search_node) graph.add_node("plan_preview", plan_preview_node) graph.set_entry_point("agent") @@ -691,6 +765,7 @@ async def plan_preview_node(state: ClarifierAgentState): { "tools": "tools", "ask_for_clarification": "ask_for_clarification", + "force_search": "force_search", "plan_preview": "plan_preview", "__end__": "__end__", }, @@ -698,6 +773,7 @@ async def plan_preview_node(state: ClarifierAgentState): graph.add_edge("tools", "agent") graph.add_edge("ask_for_clarification", "agent") + graph.add_edge("force_search", "agent") graph.add_edge("plan_preview", "__end__") return graph.compile() diff --git a/src/aiq_agent/agents/clarifier/models/state.py b/src/aiq_agent/agents/clarifier/models/state.py index 89d1514d..be7a6e1d 100644 --- a/src/aiq_agent/agents/clarifier/models/state.py +++ b/src/aiq_agent/agents/clarifier/models/state.py @@ -58,6 +58,9 @@ class ClarifierAgentState(BaseModel): max_turns: Maximum number of turns for the clarification dialog. clarifier_log: Log of the clarification dialog. iteration: Current iteration of the clarification dialog. + force_search_used: Whether the agent has already been nudged once to + attempt a search before its first clarification question. Used to + ensure the nudge fires at most once per run. plan_title: Title of the generated research plan (if plan approval enabled). plan_sections: List of section titles for the research plan. plan_approved: Whether the user approved the plan. @@ -74,6 +77,7 @@ class ClarifierAgentState(BaseModel): max_turns: int = Field(default=3) clarifier_log: str = Field(default="") iteration: int = Field(default=0) + force_search_used: bool = Field(default=False) plan_title: str | None = Field(default=None) plan_sections: list[str] = Field(default_factory=list) plan_approved: bool = Field(default=False) diff --git a/src/aiq_agent/agents/clarifier/prompts/research_clarification.j2 b/src/aiq_agent/agents/clarifier/prompts/research_clarification.j2 index a2745577..ad62e064 100644 --- a/src/aiq_agent/agents/clarifier/prompts/research_clarification.j2 +++ b/src/aiq_agent/agents/clarifier/prompts/research_clarification.j2 @@ -29,9 +29,11 @@ Your ONLY responsibility is to determine whether a research request requires cla ## Tool Usage -- You may use search tools ONLY to understand unfamiliar domains -- Use at most 1-2 high-value searches -- Searches are for your internal understanding only +- **Search first, ask second.** If the user's request contains any unfamiliar entity, acronym, project, person, product, or technical term that you cannot fully define from your training data, you MUST issue a search tool call before deciding clarification is needed. Do not ask the user to define terms that a quick search would resolve. +- On the first turn, prefer at least one search to ground the topic in current context whenever search tools are available. +- Use at most 1-2 high-value searches per turn — keep queries focused on the specific unknown. +- Searches are for your internal understanding only. Do not summarize or report search results to the user. +- After reviewing search results, re-evaluate: if the request is now sufficiently specified, return `needs_clarification: false`. Only ask a clarification question if a genuine ambiguity remains. --- diff --git a/tests/aiq_agent/agents/clarifier/models/test_state.py b/tests/aiq_agent/agents/clarifier/models/test_state.py index bbc86cd1..851b9477 100644 --- a/tests/aiq_agent/agents/clarifier/models/test_state.py +++ b/tests/aiq_agent/agents/clarifier/models/test_state.py @@ -73,6 +73,16 @@ def test_custom_iteration(self): state = ClarifierAgentState(messages=[], iteration=2) assert state.iteration == 2 + def test_default_force_search_used(self): + """Test default force_search_used is False.""" + state = ClarifierAgentState(messages=[]) + assert state.force_search_used is False + + def test_custom_force_search_used(self): + """Test custom force_search_used.""" + state = ClarifierAgentState(messages=[], force_search_used=True) + assert state.force_search_used is True + def test_remaining_questions_full(self): """Test remaining_questions when iteration is 0.""" state = ClarifierAgentState(messages=[], max_turns=3, iteration=0) diff --git a/tests/aiq_agent/agents/clarifier/test_agent.py b/tests/aiq_agent/agents/clarifier/test_agent.py index 38f53b01..30bbe314 100644 --- a/tests/aiq_agent/agents/clarifier/test_agent.py +++ b/tests/aiq_agent/agents/clarifier/test_agent.py @@ -23,9 +23,11 @@ import pytest from langchain_core.messages import AIMessage from langchain_core.messages import HumanMessage +from langchain_core.messages import ToolMessage from langchain_core.tools import tool from aiq_agent.agents.clarifier.agent import DEFAULT_CLARIFICATION_PROMPT +from aiq_agent.agents.clarifier.agent import FORCE_SEARCH_GUIDANCE from aiq_agent.agents.clarifier.agent import ClarifierAgent from aiq_agent.agents.clarifier.models import ClarificationResponse from aiq_agent.agents.clarifier.models import ClarifierAgentState @@ -872,3 +874,250 @@ def test_init_with_planner_llm(self, mock_llm_provider): ) assert agent.planner_llm == planner_llm + + +class TestHasToolInvocations: + """Tests for the _has_tool_invocations helper.""" + + def test_empty_messages(self): + """No messages -> no invocations.""" + assert ClarifierAgent._has_tool_invocations([]) is False + + def test_only_human_messages(self): + """Human-only history has no invocations.""" + messages = [HumanMessage(content="hi"), HumanMessage(content="more")] + assert ClarifierAgent._has_tool_invocations(messages) is False + + def test_ai_message_without_tool_calls(self): + """An AIMessage without tool_calls counts as no invocation.""" + messages = [HumanMessage(content="hi"), AIMessage(content="hello")] + assert ClarifierAgent._has_tool_invocations(messages) is False + + def test_ai_message_with_tool_calls(self): + """An AIMessage with tool_calls counts as an invocation.""" + ai = AIMessage( + content="", + tool_calls=[{"name": "web_search_tool", "args": {"query": "x"}, "id": "call_1"}], + ) + assert ClarifierAgent._has_tool_invocations([HumanMessage(content="hi"), ai]) is True + + def test_with_tool_message(self): + """A ToolMessage by itself does not count - we look at the assistant turn.""" + msg = ToolMessage(content="result", tool_call_id="call_1") + assert ClarifierAgent._has_tool_invocations([msg]) is False + + +class TestClarifierForceSearch: + """Tests for the search-before-clarify behavior (issue #234).""" + + @pytest.fixture + def mock_llm(self): + """Create a mock LLM.""" + llm = MagicMock() + llm.bind_tools = MagicMock(return_value=llm) + return llm + + @pytest.fixture + def mock_llm_provider(self, mock_llm): + """Create a mock LLM provider.""" + provider = MagicMock(spec=LLMProvider) + provider.get = MagicMock(return_value=mock_llm) + return provider + + @pytest.mark.asyncio + async def test_force_search_fires_when_llm_skips_tools(self, mock_llm_provider, mock_llm): + """When tools are configured and the LLM tries to clarify without searching, + the agent must first nudge the LLM with a force-search guidance message, + then route to tools when the LLM complies.""" + + # 1st LLM call: skip tools, ask for clarification. + # 2nd LLM call (after force_search): produce a tool call. + # 3rd LLM call (after tool result): return complete. + clarif_msg = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + tool_call_msg = AIMessage( + content="", + tool_calls=[{"name": "web_search_tool", "args": {"query": "AI"}, "id": "call_1"}], + ) + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg, tool_call_msg, complete_msg]) + + user_callback = AsyncMock() + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[web_search_tool], + user_prompt_callback=user_callback, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research Foo Project XYZ")]) + result = await agent.run(state) + + assert result is not None + assert isinstance(result, ClarifierResult) + # The LLM was invoked three times: clarify-attempt, tool-call, finalize. + assert mock_llm.ainvoke.call_count == 3 + # The user was never prompted because the search-then-complete path was taken. + user_callback.assert_not_called() + # The 2nd LLM call (after force_search guidance) should have received the + # guidance string as the latest HumanMessage. + second_call_messages = mock_llm.ainvoke.call_args_list[1].args[0] + latest_human = next(m for m in reversed(second_call_messages) if isinstance(m, HumanMessage)) + assert FORCE_SEARCH_GUIDANCE in latest_human.content + + @pytest.mark.asyncio + async def test_force_search_skipped_when_no_tools(self, mock_llm_provider, mock_llm): + """When no tools are configured, force_search must NOT fire; the agent + should fall back to asking the user immediately.""" + clarif_msg = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg, complete_msg]) + + user_callback = AsyncMock(return_value="technical deep dive") + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[], # no tools available + user_prompt_callback=user_callback, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research AI")]) + result = await agent.run(state) + + assert result is not None + # User callback is called once for clarification (no force_search detour). + user_callback.assert_called_once() + + @pytest.mark.asyncio + async def test_force_search_fires_at_most_once(self, mock_llm_provider, mock_llm): + """Even if the LLM stubbornly refuses to call a tool after the force_search + nudge, the agent must not loop forever - it should proceed to asking the + user after the single nudge attempt.""" + clarif_msg_1 = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + # After the force_search nudge, the model still refuses to call a tool + # and returns another clarification request. + clarif_msg_2 = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + # After the user replies, the model completes. + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg_1, clarif_msg_2, complete_msg]) + + user_callback = AsyncMock(return_value="technical") + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[web_search_tool], + user_prompt_callback=user_callback, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research AI")]) + result = await agent.run(state) + + assert result is not None + # The LLM was invoked three times max - the nudge fired once, then we + # fell through to ask_for_clarification, and the user reply produced + # the final completion. + assert mock_llm.ainvoke.call_count == 3 + # The user was prompted exactly once (no infinite loop of nudges). + user_callback.assert_called_once() + + @pytest.mark.asyncio + async def test_force_search_not_triggered_when_llm_searches_first(self, mock_llm_provider, mock_llm): + """When the LLM voluntarily issues a tool call on the first turn, the + force_search path is never entered - normal flow is preserved.""" + tool_call_msg = AIMessage( + content="", + tool_calls=[{"name": "web_search_tool", "args": {"query": "AI"}, "id": "call_1"}], + ) + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[tool_call_msg, complete_msg]) + + user_callback = AsyncMock() + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[web_search_tool], + user_prompt_callback=user_callback, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research AI")]) + result = await agent.run(state) + + assert result is not None + assert mock_llm.ainvoke.call_count == 2 + user_callback.assert_not_called() + # No HumanMessage with the force_search guidance should have been added. + second_call_messages = mock_llm.ainvoke.call_args_list[1].args[0] + for m in second_call_messages: + if isinstance(m, HumanMessage): + assert FORCE_SEARCH_GUIDANCE not in m.content + + @pytest.mark.asyncio + async def test_force_search_guidance_not_in_state_messages(self, mock_llm_provider, mock_llm): + """The force_search guidance must be injected ephemerally only; it must + never end up in state.messages, otherwise helpers like + get_latest_user_query would surface internal scaffolding back to the + user in fallback text. Regression test for Codex review feedback.""" + # The LLM ignores the nudge and returns invalid JSON, triggering the + # invalid-format fallback path inside ask_clarification. + clarif_invalid = AIMessage(content="not valid JSON at all") + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + # 1st call: clarify-attempt. 2nd call (after nudge): invalid JSON. + # 3rd call: after the user reply, model completes. + clarif_msg_1 = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg_1, clarif_invalid, complete_msg]) + + # Capture what gets sent to the user. + prompts_received: list[str] = [] + + async def user_callback(question: str) -> str: + prompts_received.append(question) + return "skip" + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[web_search_tool], + user_prompt_callback=user_callback, + ) + + original_query = "Research Project Foo at Acme" + state = ClarifierAgentState(messages=[HumanMessage(content=original_query)]) + await agent.run(state) + + # The user was prompted exactly once - with a fallback derived from + # their actual query, never from the force-search guidance. + assert len(prompts_received) == 1 + prompt_text = prompts_received[0] + assert "Project Foo" in prompt_text or "Acme" in prompt_text + assert FORCE_SEARCH_GUIDANCE not in prompt_text + # The internal force-search guidance must never be visible in any + # message the user-facing fallback would draw from. + assert "You attempted to ask the user" not in prompt_text From 388149c342388081032638d423685872709266fe Mon Sep 17 00:00:00 2001 From: Torkian Date: Wed, 27 May 2026 20:28:01 -0700 Subject: [PATCH 2/6] Lint: enforce force-single-line imports in e2e reproducer ruff (project config: force-single-line=true) flagged the combined `from langchain_core.messages import HumanMessage, ToolMessage` line in scripts/e2e_clarifier_search_test.py. Split it into two single-line imports to match the rest of the codebase, and re-run ruff format. Signed-off-by: Torkian --- scripts/e2e_clarifier_search_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/e2e_clarifier_search_test.py b/scripts/e2e_clarifier_search_test.py index 93c9e808..10119e39 100644 --- a/scripts/e2e_clarifier_search_test.py +++ b/scripts/e2e_clarifier_search_test.py @@ -32,7 +32,8 @@ sys.exit("TAVILY_API_KEY not set (paste it into deploy/.env)") from langchain_community.tools.tavily_search import TavilySearchResults # noqa: E402 -from langchain_core.messages import HumanMessage, ToolMessage # noqa: E402 +from langchain_core.messages import HumanMessage # noqa: E402 +from langchain_core.messages import ToolMessage # noqa: E402 from langchain_nvidia_ai_endpoints import ChatNVIDIA # noqa: E402 from aiq_agent.agents.clarifier.agent import ClarifierAgent # noqa: E402 @@ -96,9 +97,7 @@ async def user_prompt_callback(question: str) -> str: messages = final_state.messages force_search_used = getattr(final_state, "force_search_used", False) - tool_calls_made = sum( - len(getattr(m, "tool_calls", None) or []) for m in messages - ) + tool_calls_made = sum(len(getattr(m, "tool_calls", None) or []) for m in messages) tool_results = sum(1 for m in messages if isinstance(m, ToolMessage)) forced = bool(force_search_used) From 8f18f80e6242d7caf2105d4c814404fdbed90910 Mon Sep 17 00:00:00 2001 From: Torkian Date: Thu, 28 May 2026 09:59:54 -0700 Subject: [PATCH 3/6] Address Greptile review: iteration guard on guidance, stricter e2e gate Greptile review on PR #245 flagged two real issues: P1 (agent.py): the ephemeral force_search guidance injection in agent_node was missing the iteration==0 guard that decide_route already applies. After force_search fired once and a stubborn model still refused to call a tool, ask_for_clarification ran, the user replied, and iteration advanced to 1 - but the next agent_node call still saw force_search_used=True with no tool invocations in state.messages and re-injected the nudge. The model then received "issue one focused tool call now" immediately after the user's clarifying answer, causing a gratuitous search instead of synthesizing the reply. Fixed by adding state.iteration == 0 to the injection condition. P2 (e2e script): the PASS verdict gated on tool_calls >= 1 only, without verifying len(user_prompts) == 0. A run where the LLM both searched AND prompted the user would have falsely printed "before any user prompt" and exited 0. Now we require both conditions and print distinct FAIL messages for the two failure modes. Also cleaned up an unused mock side_effect in test_force_search_guidance_not_in_state_messages (the user replies "skip" which forces completion, so only two LLM calls actually happen), and added test_force_search_guidance_not_injected_after_user_reply as a regression test for the P1 fix. 149 clarifier tests pass. Signed-off-by: Torkian --- scripts/e2e_clarifier_search_test.py | 7 ++- src/aiq_agent/agents/clarifier/agent.py | 6 +- .../aiq_agent/agents/clarifier/test_agent.py | 59 ++++++++++++++++--- 3 files changed, 61 insertions(+), 11 deletions(-) diff --git a/scripts/e2e_clarifier_search_test.py b/scripts/e2e_clarifier_search_test.py index 10119e39..398e6546 100644 --- a/scripts/e2e_clarifier_search_test.py +++ b/scripts/e2e_clarifier_search_test.py @@ -121,10 +121,13 @@ async def user_prompt_callback(question: str) -> str: print(f" total ToolMessages (results) : {tool_results}") print(f" force_search guidance injected : {forced}") - if tool_calls_made >= 1: + if tool_calls_made >= 1 and len(user_prompts) == 0: print("\nPASS: the LLM issued at least one search before any user prompt.") return 0 - print("\nFAIL: the LLM never issued a search.") + if tool_calls_made == 0: + print("\nFAIL: the LLM never issued a search.") + else: + print("\nFAIL: the LLM issued a search, but the user was also prompted.") return 1 diff --git a/src/aiq_agent/agents/clarifier/agent.py b/src/aiq_agent/agents/clarifier/agent.py index 76c98a37..de6dfbb6 100644 --- a/src/aiq_agent/agents/clarifier/agent.py +++ b/src/aiq_agent/agents/clarifier/agent.py @@ -583,7 +583,11 @@ async def agent_node(state: ClarifierAgentState): # If the force_search nudge was raised but no tool call has happened # yet, inject the guidance ephemerally — never stored in state.messages, # so it cannot leak into get_latest_user_query and similar lookups. - if state.force_search_used and not self._has_tool_invocations(state.messages): + # The iteration==0 guard mirrors the one in decide_route: once the + # user has actually replied (iteration > 0), the nudge no longer + # applies and would otherwise force a gratuitous search on the + # user's clarifying answer. + if state.force_search_used and state.iteration == 0 and not self._has_tool_invocations(state.messages): logger.info("Injecting force-search guidance for this turn") messages.append(HumanMessage(content=FORCE_SEARCH_GUIDANCE)) diff --git a/tests/aiq_agent/agents/clarifier/test_agent.py b/tests/aiq_agent/agents/clarifier/test_agent.py index 30bbe314..a0b6b0db 100644 --- a/tests/aiq_agent/agents/clarifier/test_agent.py +++ b/tests/aiq_agent/agents/clarifier/test_agent.py @@ -1081,19 +1081,16 @@ async def test_force_search_guidance_not_in_state_messages(self, mock_llm_provid get_latest_user_query would surface internal scaffolding back to the user in fallback text. Regression test for Codex review feedback.""" # The LLM ignores the nudge and returns invalid JSON, triggering the - # invalid-format fallback path inside ask_clarification. - clarif_invalid = AIMessage(content="not valid JSON at all") - complete_msg = AIMessage( - content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() - ) - # 1st call: clarify-attempt. 2nd call (after nudge): invalid JSON. - # 3rd call: after the user reply, model completes. + # invalid-format fallback path inside ask_clarification. The user then + # replies "skip", which forces completion - so only two LLM calls + # actually happen in this run. clarif_msg_1 = AIMessage( content=ClarificationResponse( needs_clarification=True, clarification_question="What aspect?" ).model_dump_json() ) - mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg_1, clarif_invalid, complete_msg]) + clarif_invalid = AIMessage(content="not valid JSON at all") + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg_1, clarif_invalid]) # Capture what gets sent to the user. prompts_received: list[str] = [] @@ -1121,3 +1118,49 @@ async def user_callback(question: str) -> str: # The internal force-search guidance must never be visible in any # message the user-facing fallback would draw from. assert "You attempted to ask the user" not in prompt_text + + @pytest.mark.asyncio + async def test_force_search_guidance_not_injected_after_user_reply(self, mock_llm_provider, mock_llm): + """After the user has actually replied (iteration > 0), the agent must + NOT re-inject the search-first nudge on the next LLM call. Otherwise + the model would receive 'issue a tool call now' immediately after the + user provided clarifying answer, causing a gratuitous search instead + of synthesizing the answer. Regression test for Greptile P1 finding.""" + clarif_msg_1 = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + # After the nudge, the model still refuses to call a tool. + clarif_msg_2 = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="Which area?" + ).model_dump_json() + ) + # After the user replies, the model completes. + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg_1, clarif_msg_2, complete_msg]) + + user_callback = AsyncMock(return_value="technical deep dive") + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[web_search_tool], + user_prompt_callback=user_callback, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research AI")]) + await agent.run(state) + + # The 3rd LLM call happens AFTER the user reply (iteration moves from + # 0 to 1). The nudge must NOT appear in that call's message list, even + # though force_search_used is still True. + assert mock_llm.ainvoke.call_count == 3 + third_call_messages = mock_llm.ainvoke.call_args_list[2].args[0] + for m in third_call_messages: + if isinstance(m, HumanMessage): + assert FORCE_SEARCH_GUIDANCE not in m.content, ( + "force_search guidance must not be re-injected after the user replies" + ) From 13772096de97f7899cc3bb3334de28087f08be56 Mon Sep 17 00:00:00 2001 From: Torkian Date: Mon, 1 Jun 2026 21:58:45 -0700 Subject: [PATCH 4/6] Simplify clarifier search-before-clarify to an inline retry Address review feedback on #245: replace the dedicated force_search graph node and the ClarifierAgentState.force_search_used flag with a single inline retry inside agent_node. When, on the first turn, the model asks for clarification without first using its bound search tools, agent_node now nudges it once with FORCE_SEARCH_GUIDANCE and re-invokes inline, returning both the original response and the retry. This keeps the behavior model-agnostic without adding graph state or an extra node hop. The one-shot guarantee holds from two conditions alone, so the force_search_used flag is no longer needed: - iteration == 0: only fires on the first turn; once the user replies, iteration advances and it never fires again. - not _has_tool_invocations(state.messages): once any tool call is in history (e.g. after a successful forced search, even while iteration is still 0), it never re-nudges. FORCE_SEARCH_GUIDANCE is sent only in the local retry_messages and is never returned to state, so it cannot leak into get_latest_user_query or other history lookups. Also remove scripts/e2e_clarifier_search_test.py from the PR per review; the end-to-end test approach and results are documented in the PR description instead. Behavior is unchanged (same LLM call sequence and final message history); all 147 clarifier tests pass, including the force-search behavioral and ephemeral-guidance regression tests. Signed-off-by: Torkian --- scripts/e2e_clarifier_search_test.py | 135 ------------------ src/aiq_agent/agents/clarifier/agent.py | 93 +++++------- .../agents/clarifier/models/state.py | 4 - .../agents/clarifier/models/test_state.py | 10 -- .../aiq_agent/agents/clarifier/test_agent.py | 5 +- 5 files changed, 42 insertions(+), 205 deletions(-) delete mode 100644 scripts/e2e_clarifier_search_test.py diff --git a/scripts/e2e_clarifier_search_test.py b/scripts/e2e_clarifier_search_test.py deleted file mode 100644 index 398e6546..00000000 --- a/scripts/e2e_clarifier_search_test.py +++ /dev/null @@ -1,135 +0,0 @@ -"""End-to-end test for issue #234 fix. - -Runs the ClarifierAgent against a real NVIDIA NIM-hosted LLM with a real Tavily -web search tool, using an obscure query the LLM cannot possibly know from -training data. Verifies that the agent issues a search call before falling back -to asking the user for clarification. - -Run with: - PYTHONPATH=src .venv/bin/python scripts/e2e_clarifier_search_test.py -""" - -from __future__ import annotations - -import asyncio -import os -import sys -from pathlib import Path - -# Load deploy/.env if present. -ENV_FILE = Path(__file__).resolve().parents[1] / "deploy" / ".env" -if ENV_FILE.exists(): - for line in ENV_FILE.read_text().splitlines(): - line = line.strip() - if not line or line.startswith("#") or "=" not in line: - continue - key, _, value = line.partition("=") - os.environ.setdefault(key.strip(), value.strip()) - -if not os.environ.get("NVIDIA_API_KEY"): - sys.exit("NVIDIA_API_KEY not set (paste it into deploy/.env)") -if not os.environ.get("TAVILY_API_KEY"): - sys.exit("TAVILY_API_KEY not set (paste it into deploy/.env)") - -from langchain_community.tools.tavily_search import TavilySearchResults # noqa: E402 -from langchain_core.messages import HumanMessage # noqa: E402 -from langchain_core.messages import ToolMessage # noqa: E402 -from langchain_nvidia_ai_endpoints import ChatNVIDIA # noqa: E402 - -from aiq_agent.agents.clarifier.agent import ClarifierAgent # noqa: E402 -from aiq_agent.agents.clarifier.models import ClarifierAgentState # noqa: E402 -from aiq_agent.common import LLMProvider # noqa: E402 - -try: - from aiq_agent.agents.clarifier.agent import FORCE_SEARCH_GUIDANCE # noqa: E402 -except ImportError: - FORCE_SEARCH_GUIDANCE = None # running against the pre-fix branch - -# Test queries: -# - "obscure": something no LLM can know without searching. -# - "knowable": something the LLM thinks it knows; in the failure mode it -# skips the search and just asks "which aspect?". -TEST_QUERIES = { - "obscure": "Tell me about Project Zphyr-77Q at QXR Industries — what does it do and who works on it?", - "knowable": "Research the latest NVIDIA AI announcements from 2026", -} -OBSCURE_QUERY = TEST_QUERIES[os.environ.get("AIQ_TEST_QUERY", "knowable")] - -# Model defaults to a small open model so the test is cheap to run. -MODEL_NAME = os.environ.get("AIQ_CLARIFIER_TEST_MODEL", "meta/llama-3.3-70b-instruct") - - -async def main() -> int: - llm = ChatNVIDIA(model=MODEL_NAME, temperature=0) - search = TavilySearchResults(max_results=3, name="web_search_tool") - - provider = LLMProvider() - provider.set_default(llm) - - # The user_prompt_callback should never be invoked if the search-before-ask - # behavior is working. If it is invoked, we record it so the test can fail - # loudly. - user_prompts: list[str] = [] - - async def user_prompt_callback(question: str) -> str: - user_prompts.append(question) - # If reached, pretend the user said "skip" so the run still terminates. - return "skip" - - agent = ClarifierAgent( - llm_provider=provider, - tools=[search], - user_prompt_callback=user_prompt_callback, - max_turns=2, - ) - - state = ClarifierAgentState(messages=[HumanMessage(content=OBSCURE_QUERY)]) - print(f"Model: {MODEL_NAME}") - print(f"Query: {OBSCURE_QUERY}\n") - - final_state = await agent.graph.ainvoke(state, config={"callbacks": []}) - - # Pull the final messages out for inspection. - if isinstance(final_state, dict): - messages = final_state["messages"] - force_search_used = final_state.get("force_search_used", False) - else: - messages = final_state.messages - force_search_used = getattr(final_state, "force_search_used", False) - - tool_calls_made = sum(len(getattr(m, "tool_calls", None) or []) for m in messages) - tool_results = sum(1 for m in messages if isinstance(m, ToolMessage)) - forced = bool(force_search_used) - - print("=" * 72) - print("Conversation trace") - print("=" * 72) - for i, m in enumerate(messages): - kind = type(m).__name__ - content = str(getattr(m, "content", "")).replace("\n", " ")[:160] - extra = "" - if getattr(m, "tool_calls", None): - extra = f" tool_calls={[(tc.get('name'), tc.get('args')) for tc in m.tool_calls]}" - print(f"{i:>2}. {kind}: {content}{extra}") - - print() - print("=" * 72) - print("Verdict") - print("=" * 72) - print(f" user_prompt_callback invocations : {len(user_prompts)}") - print(f" total tool_calls emitted by LLM : {tool_calls_made}") - print(f" total ToolMessages (results) : {tool_results}") - print(f" force_search guidance injected : {forced}") - - if tool_calls_made >= 1 and len(user_prompts) == 0: - print("\nPASS: the LLM issued at least one search before any user prompt.") - return 0 - if tool_calls_made == 0: - print("\nFAIL: the LLM never issued a search.") - else: - print("\nFAIL: the LLM issued a search, but the user was also prompted.") - return 1 - - -if __name__ == "__main__": - sys.exit(asyncio.run(main())) diff --git a/src/aiq_agent/agents/clarifier/agent.py b/src/aiq_agent/agents/clarifier/agent.py index de6dfbb6..3338243c 100644 --- a/src/aiq_agent/agents/clarifier/agent.py +++ b/src/aiq_agent/agents/clarifier/agent.py @@ -531,21 +531,20 @@ def _build_graph(self) -> CompiledStateGraph: Build the LangGraph StateGraph for the clarification workflow. Creates a graph with the following nodes: - - agent: Generates clarification questions using the LLM + - agent: Generates clarification questions using the LLM. On the first + turn it also enforces search-before-clarify (issue #234): if the model + asks for clarification without using its bound search tools, it nudges + the model once and retries inline. - tools: Executes tool calls (e.g., web search) for context - ask_for_clarification: Prompts user and processes response - - force_search: Nudges the LLM to attempt a search before its first - clarification question when tools are configured but unused (fires - at most once per run) - plan_preview: Optional plan approval flow The graph flow: - 1. agent generates a response (question, tool call, or completion) + 1. agent generates a response (question, tool call, or completion); + on turn 0 it may force one search-and-retry before yielding 2. If tool call → tools node → back to agent 3. If complete → end (or plan_preview if enabled) - 4. If question, tools exist, and no search has happened yet on the - first turn → force_search injects guidance and routes back to agent - 5. Otherwise → ask_for_clarification → back to agent + 4. Otherwise → ask_for_clarification → back to agent Returns: Compiled LangGraph StateGraph ready for execution. @@ -580,18 +579,35 @@ async def agent_node(state: ClarifierAgentState): logger.info("Adding JSON reminder after tool results") messages.append(HumanMessage(content=JSON_REMINDER_AFTER_TOOLS)) - # If the force_search nudge was raised but no tool call has happened - # yet, inject the guidance ephemerally — never stored in state.messages, - # so it cannot leak into get_latest_user_query and similar lookups. - # The iteration==0 guard mirrors the one in decide_route: once the - # user has actually replied (iteration > 0), the nudge no longer - # applies and would otherwise force a gratuitous search on the - # user's clarifying answer. - if state.force_search_used and state.iteration == 0 and not self._has_tool_invocations(state.messages): - logger.info("Injecting force-search guidance for this turn") - messages.append(HumanMessage(content=FORCE_SEARCH_GUIDANCE)) - response = await bound_llm.ainvoke(messages) + + # Search-before-clarify (issue #234): if, on the first turn, the model + # asks for clarification without first using its bound search tools, + # nudge it once to search and retry inline. This keeps the behavior + # model-agnostic without adding graph nodes or extra state — even + # models that would otherwise skip tool use must attempt a search + # before falling back to asking the user. + # + # The guard is one-shot by construction: + # * iteration == 0 — only on the first turn; once the user replies, + # iteration advances and this never fires again. + # * not _has_tool_invocations(state.messages) — once any tool call + # is in history (e.g. after a successful forced search, even while + # iteration is still 0), we never re-nudge. + # FORCE_SEARCH_GUIDANCE is sent only in the local retry_messages and is + # never returned to state, so it cannot leak into get_latest_user_query. + if ( + self.tools + and state.iteration == 0 + and not self._has_tool_invocations(state.messages) + and not getattr(response, "tool_calls", None) + and self._is_needed(response.content) + ): + logger.info("Clarifier: model skipped search before clarifying; injecting guidance and retrying once") + retry_messages = messages + [response, HumanMessage(content=FORCE_SEARCH_GUIDANCE)] + retry_response = await bound_llm.ainvoke(retry_messages) + return {"messages": [response, retry_response]} + return {"messages": [response]} async def ask_clarification(state: ClarifierAgentState): @@ -636,12 +652,8 @@ async def ask_clarification(state: ClarifierAgentState): def decide_route(state: ClarifierAgentState | dict): if isinstance(state, dict): messages = state.get("messages", []) - iteration = state.get("iteration", 0) - force_search_used = state.get("force_search_used", False) elif hasattr(state, "messages"): messages = state.messages - iteration = state.iteration - force_search_used = state.force_search_used else: msg = f"No messages found in input state to tool_edge: {state}" raise ValueError(msg) @@ -659,36 +671,12 @@ def decide_route(state: ClarifierAgentState | dict): return "plan_preview" return "__end__" - # Force a search before the first clarification question when tools are - # configured but the agent has not yet issued any tool call. This keeps - # the behavior model-agnostic: even models that would otherwise skip - # tool use must attempt at least one search before falling back to - # asking the user for clarification. We only nudge once per run (guarded - # by ``force_search_used``) so a stubborn model cannot get stuck in a - # loop — after the nudge, normal clarification routing resumes. - if ( - self.tools - and iteration == 0 - and not force_search_used - and not self._has_tool_invocations(messages[:-1]) - ): - logger.info("Clarifier: forcing search before first clarification question") - return "force_search" - + # The search-before-clarify nudge (issue #234) is handled inline in + # agent_node, not here — see the retry block there. By the time a + # clarification response reaches this router, any forced search has + # already happened, so we route straight to the user. return "ask_for_clarification" - async def force_search_node(state: ClarifierAgentState): - """ - Flip the force_search_used flag so ``agent_node`` will inject the - search-first guidance message ephemerally on the next LLM call. - - The guidance is intentionally NOT appended to ``state.messages`` so - it cannot be picked up by helpers like ``get_latest_user_query`` - (which would otherwise surface internal scaffolding back to the - user in fallback text). - """ - return {"force_search_used": True} - async def plan_preview_node(state: ClarifierAgentState): """Generate plan preview and handle approval/feedback loop.""" clarifier_log = state.clarifier_log @@ -758,7 +746,6 @@ async def plan_preview_node(state: ClarifierAgentState): graph.add_node("agent", agent_node) graph.add_node("tools", ToolNode(self.tools)) graph.add_node("ask_for_clarification", ask_clarification) - graph.add_node("force_search", force_search_node) graph.add_node("plan_preview", plan_preview_node) graph.set_entry_point("agent") @@ -769,7 +756,6 @@ async def plan_preview_node(state: ClarifierAgentState): { "tools": "tools", "ask_for_clarification": "ask_for_clarification", - "force_search": "force_search", "plan_preview": "plan_preview", "__end__": "__end__", }, @@ -777,7 +763,6 @@ async def plan_preview_node(state: ClarifierAgentState): graph.add_edge("tools", "agent") graph.add_edge("ask_for_clarification", "agent") - graph.add_edge("force_search", "agent") graph.add_edge("plan_preview", "__end__") return graph.compile() diff --git a/src/aiq_agent/agents/clarifier/models/state.py b/src/aiq_agent/agents/clarifier/models/state.py index be7a6e1d..89d1514d 100644 --- a/src/aiq_agent/agents/clarifier/models/state.py +++ b/src/aiq_agent/agents/clarifier/models/state.py @@ -58,9 +58,6 @@ class ClarifierAgentState(BaseModel): max_turns: Maximum number of turns for the clarification dialog. clarifier_log: Log of the clarification dialog. iteration: Current iteration of the clarification dialog. - force_search_used: Whether the agent has already been nudged once to - attempt a search before its first clarification question. Used to - ensure the nudge fires at most once per run. plan_title: Title of the generated research plan (if plan approval enabled). plan_sections: List of section titles for the research plan. plan_approved: Whether the user approved the plan. @@ -77,7 +74,6 @@ class ClarifierAgentState(BaseModel): max_turns: int = Field(default=3) clarifier_log: str = Field(default="") iteration: int = Field(default=0) - force_search_used: bool = Field(default=False) plan_title: str | None = Field(default=None) plan_sections: list[str] = Field(default_factory=list) plan_approved: bool = Field(default=False) diff --git a/tests/aiq_agent/agents/clarifier/models/test_state.py b/tests/aiq_agent/agents/clarifier/models/test_state.py index 851b9477..bbc86cd1 100644 --- a/tests/aiq_agent/agents/clarifier/models/test_state.py +++ b/tests/aiq_agent/agents/clarifier/models/test_state.py @@ -73,16 +73,6 @@ def test_custom_iteration(self): state = ClarifierAgentState(messages=[], iteration=2) assert state.iteration == 2 - def test_default_force_search_used(self): - """Test default force_search_used is False.""" - state = ClarifierAgentState(messages=[]) - assert state.force_search_used is False - - def test_custom_force_search_used(self): - """Test custom force_search_used.""" - state = ClarifierAgentState(messages=[], force_search_used=True) - assert state.force_search_used is True - def test_remaining_questions_full(self): """Test remaining_questions when iteration is 0.""" state = ClarifierAgentState(messages=[], max_turns=3, iteration=0) diff --git a/tests/aiq_agent/agents/clarifier/test_agent.py b/tests/aiq_agent/agents/clarifier/test_agent.py index a0b6b0db..241068c7 100644 --- a/tests/aiq_agent/agents/clarifier/test_agent.py +++ b/tests/aiq_agent/agents/clarifier/test_agent.py @@ -1155,8 +1155,9 @@ async def test_force_search_guidance_not_injected_after_user_reply(self, mock_ll await agent.run(state) # The 3rd LLM call happens AFTER the user reply (iteration moves from - # 0 to 1). The nudge must NOT appear in that call's message list, even - # though force_search_used is still True. + # 0 to 1), so the inline search-before-clarify guard (gated on + # iteration == 0) must not fire again and the nudge must not appear in + # that call's message list. assert mock_llm.ainvoke.call_count == 3 third_call_messages = mock_llm.ainvoke.call_args_list[2].args[0] for m in third_call_messages: From 2eccd5b486eaa51a49a1ab52312639e2af752039 Mon Sep 17 00:00:00 2001 From: Torkian Date: Mon, 1 Jun 2026 22:38:50 -0700 Subject: [PATCH 5/6] Fix consecutive assistant messages in forced-search retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile review on #245: the inline search-before-clarify retry returned both the skipped clarification response and the retry response, {"messages": [response, retry_response]}. When the retry carries a tool call (the happy path), the persisted history became [..., AIMessage(clarification), AIMessage(tool_call), ToolMessage, ...] — two consecutive assistant-role messages, which the OpenAI Chat Completions and Anthropic Messages APIs reject with a 400. NVIDIA NIM tolerates it, so it did not surface in the NIM-backed e2e run, and the mocked unit tests did not enforce message ordering. Return only the retry response. The first attempt was already shown to the model inside the local retry_messages (ephemeral, never persisted), so dropping it from state loses nothing and keeps a valid sequence regardless of whether the retry is a tool call or another clarification. Add a regression test asserting no two consecutive AIMessages are ever sent to the LLM across a forced-search run; it fails against the old [response, retry_response] return and passes with the fix. Signed-off-by: Torkian --- src/aiq_agent/agents/clarifier/agent.py | 11 +++- .../aiq_agent/agents/clarifier/test_agent.py | 62 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/aiq_agent/agents/clarifier/agent.py b/src/aiq_agent/agents/clarifier/agent.py index 3338243c..e66ea07a 100644 --- a/src/aiq_agent/agents/clarifier/agent.py +++ b/src/aiq_agent/agents/clarifier/agent.py @@ -596,6 +596,15 @@ async def agent_node(state: ClarifierAgentState): # iteration is still 0), we never re-nudge. # FORCE_SEARCH_GUIDANCE is sent only in the local retry_messages and is # never returned to state, so it cannot leak into get_latest_user_query. + # + # We return ONLY retry_response, not the first (search-skipping) + # response. The first attempt was already shown to the model inside + # retry_messages; persisting it would put two consecutive + # assistant-role messages in history once retry_response carries a + # tool call (… AIMessage(clarif), AIMessage(tool_call), ToolMessage …), + # which the OpenAI Chat Completions and Anthropic Messages APIs reject + # with a 400. Keeping only retry_response preserves a valid sequence + # regardless of whether it is a tool call or another clarification. if ( self.tools and state.iteration == 0 @@ -606,7 +615,7 @@ async def agent_node(state: ClarifierAgentState): logger.info("Clarifier: model skipped search before clarifying; injecting guidance and retrying once") retry_messages = messages + [response, HumanMessage(content=FORCE_SEARCH_GUIDANCE)] retry_response = await bound_llm.ainvoke(retry_messages) - return {"messages": [response, retry_response]} + return {"messages": [retry_response]} return {"messages": [response]} diff --git a/tests/aiq_agent/agents/clarifier/test_agent.py b/tests/aiq_agent/agents/clarifier/test_agent.py index 241068c7..1ec1b3f8 100644 --- a/tests/aiq_agent/agents/clarifier/test_agent.py +++ b/tests/aiq_agent/agents/clarifier/test_agent.py @@ -22,6 +22,7 @@ import pytest from langchain_core.messages import AIMessage +from langchain_core.messages import BaseMessage from langchain_core.messages import HumanMessage from langchain_core.messages import ToolMessage from langchain_core.tools import tool @@ -1165,3 +1166,64 @@ async def test_force_search_guidance_not_injected_after_user_reply(self, mock_ll assert FORCE_SEARCH_GUIDANCE not in m.content, ( "force_search guidance must not be re-injected after the user replies" ) + + @pytest.mark.asyncio + async def test_forced_retry_does_not_emit_consecutive_assistant_messages(self, mock_llm_provider, mock_llm): + """After a forced search-retry whose retry produces a tool call, the + message list fed to the LLM on the next turn must NOT contain two + consecutive assistant (AIMessage) turns. Two adjacent assistant + messages with no interleaved user/tool message are rejected with a 400 + by the OpenAI Chat Completions and Anthropic Messages APIs; mocked LLMs + don't enforce this, so we assert the invariant explicitly. Regression + test for the Greptile P1 finding on PR #245.""" + clarif_msg = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + tool_call_msg = AIMessage( + content="", + tool_calls=[{"name": "web_search_tool", "args": {"query": "AI"}, "id": "call_1"}], + ) + complete_msg = AIMessage( + content=ClarificationResponse(needs_clarification=False, clarification_question=None).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif_msg, tool_call_msg, complete_msg]) + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[web_search_tool], + user_prompt_callback=AsyncMock(), + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research Foo Project XYZ")]) + await agent.run(state) + + # Inspect every message list that was actually sent to the LLM and assert + # no two consecutive AIMessages appear (the API-invalid shape). + def _adjacent_assistant_pairs(messages: list[BaseMessage]) -> list[tuple[int, int]]: + pairs = [] + for i in range(len(messages) - 1): + if isinstance(messages[i], AIMessage) and isinstance(messages[i + 1], AIMessage): + pairs.append((i, i + 1)) + return pairs + + for call_idx, call in enumerate(mock_llm.ainvoke.call_args_list): + sent_messages = call.args[0] + offenders = _adjacent_assistant_pairs(sent_messages) + assert not offenders, ( + f"LLM call #{call_idx} contained consecutive assistant messages at {offenders}; " + "this is rejected by OpenAI/Anthropic APIs" + ) + + # Specifically: the forced retry must not persist the skipped + # clarification, so the post-tool history is [..., AIMessage(tool_call), + # ToolMessage, ...] with no stale AIMessage before the tool call. + third_call_messages = mock_llm.ainvoke.call_args_list[2].args[0] + ai_then_tool = any( + isinstance(third_call_messages[i], AIMessage) + and getattr(third_call_messages[i], "tool_calls", None) + and isinstance(third_call_messages[i + 1], ToolMessage) + for i in range(len(third_call_messages) - 1) + ) + assert ai_then_tool, "expected a tool-call AIMessage immediately followed by its ToolMessage in history" From 832a1fad12c11b08b2d9e03624256637641ee212 Mon Sep 17 00:00:00 2001 From: Torkian Date: Mon, 1 Jun 2026 22:41:06 -0700 Subject: [PATCH 6/6] Fix consecutive assistant messages on the skip-clarification path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While auditing the message ordering for the forced-search fix, a separate pre-existing bug surfaced in the skip-command branch. When a user types a skip command, ask_clarification returned an AIMessage(complete) without persisting the user's reply, leaving it directly after the prior AIMessage(clarification) — two consecutive assistant messages. The unconditional ask_for_clarification -> agent edge then re-entered agent_node with the clarification budget exhausted, which appended a third AIMessage(complete). With enable_plan_approval enabled this corrupted history flowed into the planner's ainvoke, producing a 400 from the OpenAI/Anthropic APIs (NVIDIA NIM tolerates it). Two coordinated fixes: - The skip branch now persists the user's reply as a HumanMessage before the completion AIMessage, so the prior and new assistant turns are separated by a human message. - agent_node's budget-exhausted branch no longer emits a completion when the last message is already a completion AIMessage (the skip branch's), avoiding the duplicate on graph re-entry. Add regression tests covering both the persisted history ordering and the planner ainvoke sequence under enable_plan_approval; both fail against the prior behavior and pass with the fix. Signed-off-by: Torkian --- src/aiq_agent/agents/clarifier/agent.py | 22 +++- .../aiq_agent/agents/clarifier/test_agent.py | 117 ++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/src/aiq_agent/agents/clarifier/agent.py b/src/aiq_agent/agents/clarifier/agent.py index e66ea07a..d94b6b0c 100644 --- a/src/aiq_agent/agents/clarifier/agent.py +++ b/src/aiq_agent/agents/clarifier/agent.py @@ -558,6 +558,17 @@ def _build_graph(self) -> CompiledStateGraph: async def agent_node(state: ClarifierAgentState): if state.remaining_questions <= 0: + # Clarification budget is exhausted — emit a completion signal, + # unless a prior node already did (the skip-command branch in + # ask_clarification returns its own AIMessage(complete) and then + # this node is re-entered via the unconditional edge). Emitting + # another here would place two consecutive assistant messages in + # history, which the OpenAI/Anthropic APIs reject. If the last + # message is already a completion, leave the state untouched and + # let decide_route end the run. + last_message = state.messages[-1] if state.messages else None + if isinstance(last_message, AIMessage) and self._is_complete(getattr(last_message, "content", "")): + return {} complete_response = ClarificationResponse(needs_clarification=False, clarification_question=None) return {"messages": [AIMessage(content=complete_response.model_dump_json())]} tools_info = [ @@ -645,8 +656,17 @@ async def ask_clarification(state: ClarifierAgentState): logger.info("Clarifier: User requested to skip clarification") complete_response = ClarificationResponse(needs_clarification=False, clarification_question=None) clarifier_log = f"{clarifier_log}\n**Turn {iteration + 1} - User:** [Skipped clarification]" + # Persist the user's reply as a HumanMessage before the + # completion AIMessage. The prior turn already left an + # AIMessage(clarification) in history; without an interleaving + # human message the two assistant turns would be adjacent, which + # the OpenAI/Anthropic APIs reject. (The duplicate completion on + # graph re-entry is suppressed by the guard in agent_node.) return { - "messages": [AIMessage(content=complete_response.model_dump_json())], + "messages": [ + HumanMessage(content=user_reply), + AIMessage(content=complete_response.model_dump_json()), + ], "iteration": max_turns, # Force end of clarification "clarifier_log": clarifier_log, } diff --git a/tests/aiq_agent/agents/clarifier/test_agent.py b/tests/aiq_agent/agents/clarifier/test_agent.py index 1ec1b3f8..3adfd083 100644 --- a/tests/aiq_agent/agents/clarifier/test_agent.py +++ b/tests/aiq_agent/agents/clarifier/test_agent.py @@ -1227,3 +1227,120 @@ def _adjacent_assistant_pairs(messages: list[BaseMessage]) -> list[tuple[int, in for i in range(len(third_call_messages) - 1) ) assert ai_then_tool, "expected a tool-call AIMessage immediately followed by its ToolMessage in history" + + +class TestClarifierSkipMessageOrdering: + """Skip-command branch must not produce consecutive assistant messages. + + Regression tests for the ordering bug surfaced during the PR #245 audit: + the skip branch returned an AIMessage(complete) without persisting the + user's reply, leaving it adjacent to the prior clarification AIMessage; the + graph then re-entered agent_node and (with the budget exhausted) appended a + third AIMessage. Two/three consecutive assistant turns are rejected by the + OpenAI/Anthropic APIs and corrupt the planner call when plan approval is on. + """ + + @pytest.fixture + def mock_llm(self): + llm = MagicMock() + llm.bind_tools = MagicMock(return_value=llm) + return llm + + @pytest.fixture + def mock_llm_provider(self, mock_llm): + provider = MagicMock(spec=LLMProvider) + provider.get = MagicMock(return_value=mock_llm) + return provider + + @staticmethod + def _adjacent_assistant_pairs(messages: list[BaseMessage]) -> list[tuple[int, int]]: + return [ + (i, i + 1) + for i in range(len(messages) - 1) + if isinstance(messages[i], AIMessage) and isinstance(messages[i + 1], AIMessage) + ] + + @pytest.mark.asyncio + async def test_skip_persists_reply_and_no_consecutive_assistants(self, mock_llm_provider, mock_llm): + """User skips after one clarification: final history must interleave the + skip reply (HumanMessage) and contain no consecutive AIMessages.""" + clarif = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + # Only one real LLM response is needed; after skip the graph completes + # without another model call (the early-complete guard returns {}). + mock_llm.ainvoke = AsyncMock(side_effect=[clarif]) + + captured: dict = {} + + async def user_callback(question: str) -> str: + return "skip" + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[], # no tools → no forced search; isolate the skip path + user_prompt_callback=user_callback, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research AI")]) + + # Capture the final graph state to inspect persisted message ordering. + final = await agent.graph.ainvoke(state, config={"callbacks": []}) + captured["messages"] = final["messages"] if isinstance(final, dict) else final.messages + + msgs = captured["messages"] + offenders = self._adjacent_assistant_pairs(msgs) + assert not offenders, f"persisted history has consecutive assistant messages at {offenders}: {msgs}" + # The skip reply must be persisted as a HumanMessage. + assert any(isinstance(m, HumanMessage) and m.content == "skip" for m in msgs), ( + "skip reply was not persisted as a HumanMessage" + ) + # Exactly one completion AIMessage should terminate the dialog (no duplicate). + completion_ais = [ + m for m in msgs if isinstance(m, AIMessage) and ClarifierAgent._has_tool_invocations([m]) is False + ] + assert len(completion_ais) >= 1 + + @pytest.mark.asyncio + async def test_skip_with_plan_approval_planner_gets_valid_sequence(self, mock_llm_provider, mock_llm): + """With plan approval on, the corrupted skip history previously flowed + into the planner's ainvoke. Assert the planner never receives two + consecutive assistant messages.""" + clarif = AIMessage( + content=ClarificationResponse( + needs_clarification=True, clarification_question="What aspect?" + ).model_dump_json() + ) + mock_llm.ainvoke = AsyncMock(side_effect=[clarif]) + + planner_llm = MagicMock() + plan_json = '{"title": "Plan", "sections": ["Intro", "Analysis"]}' + planner_llm.ainvoke = AsyncMock(return_value=AIMessage(content=plan_json)) + + replies = iter(["skip", "approve"]) + + async def user_callback(question: str) -> str: + return next(replies) + + agent = ClarifierAgent( + llm_provider=mock_llm_provider, + tools=[], + user_prompt_callback=user_callback, + enable_plan_approval=True, + planner_llm=planner_llm, + ) + + state = ClarifierAgentState(messages=[HumanMessage(content="Research AI")]) + result = await agent.run(state) + + assert result is not None + assert result.plan_approved is True + # The planner was called; none of its input message lists may contain + # consecutive assistant messages. + assert planner_llm.ainvoke.call_count >= 1 + for call_idx, call in enumerate(planner_llm.ainvoke.call_args_list): + sent = call.args[0] + offenders = self._adjacent_assistant_pairs(sent) + assert not offenders, f"planner ainvoke #{call_idx} had consecutive assistant messages at {offenders}"