Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/agent/claude_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ async def _run(args: argparse.Namespace) -> None:

runner = ClaudeAgentRunner(model=args.model_id, max_turns=args.max_turns)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
Expand Down
34 changes: 27 additions & 7 deletions src/agent/claude_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
import time
from pathlib import Path

from claude_agent_sdk import AssistantMessage, ClaudeAgentOptions, HookMatcher, ResultMessage, query
from claude_agent_sdk import (
AssistantMessage,
ClaudeAgentOptions,
HookMatcher,
ResultMessage,
query,
)
from claude_agent_sdk import TextBlock, ToolUseBlock

from observability import agent_run_span, persist_trajectory
Expand Down Expand Up @@ -132,8 +138,14 @@ async def run(self, question: str) -> AgentResult:
last_turn_start = run_started
tool_outputs: dict[str, object] = {}

async def _capture_tool_output(input_data, tool_use_id: str, context) -> dict:
resp = input_data.get("tool_response") if isinstance(input_data, dict) else input_data
async def _capture_tool_output(
input_data, tool_use_id: str, context
) -> dict:
resp = (
input_data.get("tool_response")
if isinstance(input_data, dict)
else input_data
)
if isinstance(resp, dict):
tool_outputs[tool_use_id] = resp.get("content", resp)
else:
Expand All @@ -145,7 +157,9 @@ async def _capture_tool_output(input_data, tool_use_id: str, context) -> dict:
# per-tool duration for claude-agent is therefore not captured
# (matches openai-agent / deep-agent).
options.hooks = {
"PostToolUse": [HookMatcher(matcher=".*", hooks=[_capture_tool_output])],
"PostToolUse": [
HookMatcher(matcher=".*", hooks=[_capture_tool_output])
],
}

def _flush_tool_outputs() -> None:
Expand All @@ -169,7 +183,9 @@ def _flush_tool_outputs() -> None:
text += block.text
elif isinstance(block, ToolUseBlock):
tool_calls.append(
ToolCall(name=block.name, input=block.input, id=block.id)
ToolCall(
name=block.name, input=block.input, id=block.id
)
)
usage = message.usage or {}
trajectory.turns.append(
Expand Down Expand Up @@ -197,8 +213,12 @@ def _flush_tool_outputs() -> None:

duration_ms = (time.perf_counter() - run_started) * 1000
span.set_attribute("agent.answer.length", len(answer))
span.set_attribute("gen_ai.usage.input_tokens", trajectory.total_input_tokens)
span.set_attribute("gen_ai.usage.output_tokens", trajectory.total_output_tokens)
span.set_attribute(
"gen_ai.usage.input_tokens", trajectory.total_input_tokens
)
span.set_attribute(
"gen_ai.usage.output_tokens", trajectory.total_output_tokens
)
span.set_attribute("agent.turns", len(trajectory.turns))
span.set_attribute("agent.tool_calls", len(trajectory.all_tool_calls))
span.set_attribute("agent.duration_ms", duration_ms)
Expand Down
21 changes: 18 additions & 3 deletions src/agent/claude_agent/tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,12 @@ async def fake_query(prompt, options):

@pytest.mark.anyio
async def test_run_collects_trajectory():
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock, ToolUseBlock
from claude_agent_sdk import (
AssistantMessage,
ResultMessage,
TextBlock,
ToolUseBlock,
)

mock_tool = MagicMock(spec=ToolUseBlock)
mock_tool.name = "sensors"
Expand Down Expand Up @@ -157,7 +162,12 @@ async def fake_query(prompt, options):
@pytest.mark.anyio
async def test_run_tool_output_captured():
"""PostToolUse hook output is attached to the matching ToolCall."""
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock, ToolUseBlock
from claude_agent_sdk import (
AssistantMessage,
ResultMessage,
TextBlock,
ToolUseBlock,
)

mock_tool = MagicMock(spec=ToolUseBlock)
mock_tool.name = "sensors"
Expand Down Expand Up @@ -206,7 +216,12 @@ async def fake_query(prompt, options):
@pytest.mark.anyio
async def test_run_tool_output_string_response():
"""PostToolUse hook handles string tool_response (no .get)."""
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock, ToolUseBlock
from claude_agent_sdk import (
AssistantMessage,
ResultMessage,
TextBlock,
ToolUseBlock,
)

mock_tool = MagicMock(spec=ToolUseBlock)
mock_tool.name = "sites"
Expand Down
4 changes: 3 additions & 1 deletion src/agent/deep_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ async def _run(args: argparse.Namespace) -> None:
recursion_limit=args.recursion_limit,
)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
Expand Down
8 changes: 6 additions & 2 deletions src/agent/deep_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,12 @@ async def run(self, question: str) -> AgentResult:
)

span.set_attribute("agent.answer.length", len(answer))
span.set_attribute("gen_ai.usage.input_tokens", trajectory.total_input_tokens)
span.set_attribute("gen_ai.usage.output_tokens", trajectory.total_output_tokens)
span.set_attribute(
"gen_ai.usage.input_tokens", trajectory.total_input_tokens
)
span.set_attribute(
"gen_ai.usage.output_tokens", trajectory.total_output_tokens
)
span.set_attribute("agent.turns", len(trajectory.turns))
span.set_attribute("agent.tool_calls", len(trajectory.all_tool_calls))
span.set_attribute(
Expand Down
47 changes: 39 additions & 8 deletions src/agent/deep_agent/tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,20 @@ def test_build_trajectory_tool_calls_and_outputs():
AIMessage(
content="",
tool_calls=[{"name": "sensors", "args": {"asset_id": "CH-6"}, "id": "c1"}],
usage_metadata={"input_tokens": 100, "output_tokens": 20, "total_tokens": 120},
usage_metadata={
"input_tokens": 100,
"output_tokens": 20,
"total_tokens": 120,
},
),
ToolMessage(content="5 sensors found", tool_call_id="c1"),
AIMessage(
content="Chiller 6 has 5 sensors.",
usage_metadata={"input_tokens": 150, "output_tokens": 30, "total_tokens": 180},
usage_metadata={
"input_tokens": 150,
"output_tokens": 30,
"total_tokens": 180,
},
),
]
traj = _build_trajectory(messages)
Expand All @@ -149,7 +157,12 @@ def test_build_trajectory_tool_calls_and_outputs():

def test_build_trajectory_list_content():
messages = [
AIMessage(content=[{"type": "text", "text": "part one "}, {"type": "text", "text": "part two"}])
AIMessage(
content=[
{"type": "text", "text": "part one "},
{"type": "text", "text": "part two"},
]
)
]
traj = _build_trajectory(messages)
assert traj.turns[0].text == "part one part two"
Expand All @@ -172,13 +185,21 @@ def test_build_trajectory_multiple_tool_calls_one_turn():
{"name": "sites", "args": {}, "id": "c1"},
{"name": "assets", "args": {"site_id": "MAIN"}, "id": "c2"},
],
usage_metadata={"input_tokens": 50, "output_tokens": 10, "total_tokens": 60},
usage_metadata={
"input_tokens": 50,
"output_tokens": 10,
"total_tokens": 60,
},
),
ToolMessage(content=["MAIN"], tool_call_id="c1"),
ToolMessage(content=["Chiller 6"], tool_call_id="c2"),
AIMessage(
content="Found Chiller 6 at site MAIN.",
usage_metadata={"input_tokens": 80, "output_tokens": 15, "total_tokens": 95},
usage_metadata={
"input_tokens": 80,
"output_tokens": 15,
"total_tokens": 95,
},
),
]
traj = _build_trajectory(messages)
Expand Down Expand Up @@ -242,13 +263,23 @@ async def test_run_collects_trajectory():
HumanMessage(content="What sensors are on Chiller 6?"),
AIMessage(
content="",
tool_calls=[{"name": "sensors", "args": {"asset_id": "CH-6"}, "id": "c1"}],
usage_metadata={"input_tokens": 100, "output_tokens": 20, "total_tokens": 120},
tool_calls=[
{"name": "sensors", "args": {"asset_id": "CH-6"}, "id": "c1"}
],
usage_metadata={
"input_tokens": 100,
"output_tokens": 20,
"total_tokens": 120,
},
),
ToolMessage(content="sensor data", tool_call_id="c1"),
AIMessage(
content="Chiller 6 has 5 sensors.",
usage_metadata={"input_tokens": 150, "output_tokens": 30, "total_tokens": 180},
usage_metadata={
"input_tokens": 150,
"output_tokens": 30,
"total_tokens": 180,
},
),
]
}
Expand Down
2 changes: 1 addition & 1 deletion src/agent/direct_llm_agent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

from .runner import DirectLLMAgentRunner

__all__ = ["DirectLLMAgentRunner"]
__all__ = ["DirectLLMAgentRunner"]
2 changes: 1 addition & 1 deletion src/agent/direct_llm_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@ def main() -> None:


if __name__ == "__main__":
main()
main()
2 changes: 1 addition & 1 deletion src/agent/direct_llm_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,4 @@ async def run(self, question: str) -> AgentResult:
question=question,
answer=answer,
trajectory=trajectory,
)
)
4 changes: 3 additions & 1 deletion src/agent/openai_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ async def _run(args: argparse.Namespace) -> None:

runner = OpenAIAgentRunner(model=args.model_id, max_turns=args.max_turns)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
Expand Down
23 changes: 17 additions & 6 deletions src/agent/openai_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,14 @@

from openai import AsyncOpenAI

from agents import Agent, ModelProvider, OpenAIChatCompletionsModel, RunConfig, Runner, set_tracing_disabled
from agents import (
Agent,
ModelProvider,
OpenAIChatCompletionsModel,
RunConfig,
Runner,
set_tracing_disabled,
)
from agents.mcp import MCPServerStdio

from observability import agent_run_span, persist_trajectory
Expand Down Expand Up @@ -136,7 +143,9 @@ def _flush() -> None:
tc_id = getattr(raw, "call_id", "") or getattr(raw, "id", "") or ""
tc_args = getattr(raw, "arguments", "{}") or "{}"
try:
tc_input = json.loads(tc_args) if isinstance(tc_args, str) else tc_args
tc_input = (
json.loads(tc_args) if isinstance(tc_args, str) else tc_args
)
except (json.JSONDecodeError, TypeError):
tc_input = {"raw": tc_args}
tool_calls.append(ToolCall(name=tc_name, input=tc_input, id=tc_id))
Expand Down Expand Up @@ -251,8 +260,12 @@ async def run(self, question: str) -> AgentResult:
)

span.set_attribute("agent.answer.length", len(answer))
span.set_attribute("gen_ai.usage.input_tokens", trajectory.total_input_tokens)
span.set_attribute("gen_ai.usage.output_tokens", trajectory.total_output_tokens)
span.set_attribute(
"gen_ai.usage.input_tokens", trajectory.total_input_tokens
)
span.set_attribute(
"gen_ai.usage.output_tokens", trajectory.total_output_tokens
)
span.set_attribute("agent.turns", len(trajectory.turns))
span.set_attribute("agent.tool_calls", len(trajectory.all_tool_calls))
span.set_attribute(
Expand All @@ -270,5 +283,3 @@ async def run(self, question: str) -> AgentResult:
answer=answer,
trajectory=trajectory,
)


10 changes: 6 additions & 4 deletions src/agent/plan_execute/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ async def execute_plan(self, plan: Plan, question: str) -> list[StepResult]:
)
schema = tool_schemas.get(step.server, {}).get(step.tool, "")
step_started = time.perf_counter()
result = await self.execute_step(step, context, question, tool_schema=schema)
result = await self.execute_step(
step, context, question, tool_schema=schema
)
result.duration_ms = (time.perf_counter() - step_started) * 1000
if result.success:
_log.info("Step %d OK.", step.step_number)
Expand Down Expand Up @@ -202,8 +204,7 @@ async def _resolve_args_with_llm(
f"Step {n}: {r.response}" for n, r in sorted(context.items())
)
prompt = (
_ARG_RESOLUTION_PROMPT
.replace("{question}", question)
_ARG_RESOLUTION_PROMPT.replace("{question}", question)
.replace("{task}", task)
.replace("{tool}", tool)
.replace("{tool_schema}", tool_schema or "(unknown)")
Expand All @@ -214,7 +215,8 @@ async def _resolve_args_with_llm(
if resolved is None:
_log.warning(
"Tool '%s': arg resolution returned no parseable JSON (response: %r…)",
tool, raw[:120],
tool,
raw[:120],
)
return {}
return resolved
Expand Down
5 changes: 2 additions & 3 deletions src/agent/plan_execute/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,7 @@ def generate(self, prompt: str, temperature: float = 0.0) -> str:
self.output_tokens += result.output_tokens
return result.text

def generate_with_usage(
self, prompt: str, temperature: float = 0.0
) -> LLMResult:
def generate_with_usage(self, prompt: str, temperature: float = 0.0) -> LLMResult:
result = self._inner.generate_with_usage(prompt, temperature)
self.input_tokens += result.input_tokens
self.output_tokens += result.output_tokens
Expand All @@ -62,6 +60,7 @@ def generate_with_usage(
def model_id(self) -> str:
return self._inner.model_id


_log = logging.getLogger(__name__)

_SUMMARIZE_PROMPT = """\
Expand Down
2 changes: 1 addition & 1 deletion src/agent/stirrup_agent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@

from .runner import StirrupAgentRunner

__all__ = ["StirrupAgentRunner"]
__all__ = ["StirrupAgentRunner"]
8 changes: 5 additions & 3 deletions src/agent/stirrup_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _build_parser() -> argparse.ArgumentParser:
default=16_384,
metavar="N",
help="Max output tokens per model call; must stay under the provider "
"limit (watsonx caps new tokens at 100k). Default: 16384.",
"limit (watsonx caps new tokens at 100k). Default: 16384.",
)
return parser

Expand All @@ -97,12 +97,14 @@ async def _run(args: argparse.Namespace) -> None:
max_tokens=args.max_tokens,
)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
run_sdk_cli("stirrup-agent", _build_parser, _run)


if __name__ == "__main__":
main()
main()
Loading
Loading