Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/zero-code-examples/openai-agents/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
openai-agents>=0.3.3
openai>=2.30.0
openai-agents>=0.13.0
opentelemetry-instrumentation-openai-agents-v2>=0.1.0

opentelemetry-sdk>=1.36.0
Expand Down
1 change: 1 addition & 0 deletions examples/zero-code-examples/openai-agents/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def main():
print(f"OTLP endpoint: {endpoint}")

os.environ.setdefault("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "span_and_event")
os.environ.setdefault("OTEL_SEMCONV_STABILITY_OPT_IN", "gen_ai_latest_experimental")

os.environ.setdefault(
"OTEL_RESOURCE_ATTRIBUTES",
Expand Down
3 changes: 3 additions & 0 deletions src/agentevals/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ class DebugLoadData(CamelModel):

class TraceConversionMetadata(CamelModel):
agent_name: str | None = None
agent_id: str | None = None
model: str | None = None
response_model: str | None = None
provider: str | None = None
start_time: int | None = None
user_input_preview: str | None = None
final_output_preview: str | None = None
Expand Down
85 changes: 83 additions & 2 deletions src/agentevals/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import json
import logging
from typing import Any, Protocol
from typing import Any, Protocol, TypedDict, TypeVar

from .loader.base import Span, Trace
from .trace_attrs import (
Expand All @@ -22,14 +22,26 @@
ADK_SCOPE_VALUE,
ADK_TOOL_CALL_ARGS,
ADK_TOOL_RESPONSE,
OTEL_ERROR_TYPE,
OTEL_GENAI_INPUT_MESSAGES,
OTEL_GENAI_OP,
OTEL_GENAI_OUTPUT_MESSAGES,
OTEL_GENAI_PROVIDER_NAME,
OTEL_GENAI_REQUEST_MAX_TOKENS,
OTEL_GENAI_REQUEST_MODEL,
OTEL_GENAI_REQUEST_TEMPERATURE,
OTEL_GENAI_RESPONSE_FINISH_REASONS,
OTEL_GENAI_RESPONSE_ID,
OTEL_GENAI_RESPONSE_MODEL,
OTEL_GENAI_SYSTEM,
OTEL_GENAI_TOOL_CALL_ARGUMENTS,
OTEL_GENAI_TOOL_CALL_ID,
OTEL_GENAI_TOOL_CALL_RESULT,
OTEL_GENAI_TOOL_DESCRIPTION,
OTEL_GENAI_TOOL_NAME,
OTEL_GENAI_TOOL_TYPE,
OTEL_GENAI_USAGE_CACHE_CREATION_TOKENS,
OTEL_GENAI_USAGE_CACHE_READ_TOKENS,
OTEL_GENAI_USAGE_INPUT_TOKENS,
OTEL_GENAI_USAGE_OUTPUT_TOKENS,
OTEL_SCOPE,
Expand Down Expand Up @@ -139,6 +151,65 @@ def extract_token_usage_from_attrs(
return 0, 0, model


_T = TypeVar("_T", int, float)


def _safe_cast(value: Any, target_type: type[_T], default: _T | None = None) -> _T | None:
"""Try to cast *value* to *target_type*, returning *default* on failure."""
if value is None:
return default
try:
return target_type(value)
except (TypeError, ValueError):
return default


def _parse_finish_reasons(raw: Any) -> list[str]:
"""Parse finish reasons from a list, JSON string, or plain string."""
if isinstance(raw, list):
return [str(r) for r in raw]
if isinstance(raw, str):
parsed = parse_json(raw)
if isinstance(parsed, list):
return [str(r) for r in parsed]
if raw:
return [raw]
return []


class ExtendedModelInfo(TypedDict):
    """Extended model and provider metadata extracted from one span's attributes.

    Returned by ``extract_extended_model_info_from_attrs``; every key is
    always present, with ``None``, ``0``, or ``[]`` when the corresponding
    span attribute is absent.
    """

    # gen_ai.request.model — model name sent in the request.
    request_model: str | None
    # gen_ai.response.model — model name reported back by the provider.
    response_model: str | None
    # gen_ai.provider.name, falling back to the deprecated gen_ai.system.
    provider: str | None
    # Normalized gen_ai.response.finish_reasons; empty list when absent.
    finish_reasons: list[str]
    # gen_ai.response.id.
    response_id: str | None
    # gen_ai.request.temperature, cast to float when possible.
    temperature: float | None
    # gen_ai.request.max_tokens, cast to int when possible.
    max_tokens: int | None
    # Prompt-cache token counts; default to 0 (not None) so they can be summed.
    cache_creation_tokens: int
    cache_read_tokens: int
    # error.type attribute, when the span recorded a failure.
    error_type: str | None


def extract_extended_model_info_from_attrs(attrs: dict[str, Any]) -> ExtendedModelInfo:
    """Extract extended model and provider metadata from span attributes.

    Every key of the returned ``ExtendedModelInfo`` is populated; missing
    attributes yield ``None`` (or ``0`` for cache token counts and ``[]``
    for finish reasons).

    Uses gen_ai.system as fallback for provider when gen_ai.provider.name is
    absent (backward compat with pre-v1.37.0 instrumentors).
    """
    # NOTE(review): the merged diff had GitHub review-UI text pasted inside
    # this dict literal; this is the cleaned-up body.
    return {
        "request_model": attrs.get(OTEL_GENAI_REQUEST_MODEL),
        "response_model": attrs.get(OTEL_GENAI_RESPONSE_MODEL),
        # Prefer the v1.37.0+ attribute; fall back to the deprecated one.
        "provider": attrs.get(OTEL_GENAI_PROVIDER_NAME) or attrs.get(OTEL_GENAI_SYSTEM),
        "finish_reasons": _parse_finish_reasons(attrs.get(OTEL_GENAI_RESPONSE_FINISH_REASONS)),
        "response_id": attrs.get(OTEL_GENAI_RESPONSE_ID),
        "temperature": _safe_cast(attrs.get(OTEL_GENAI_REQUEST_TEMPERATURE), float),
        "max_tokens": _safe_cast(attrs.get(OTEL_GENAI_REQUEST_MAX_TOKENS), int),
        "cache_creation_tokens": _safe_cast(attrs.get(OTEL_GENAI_USAGE_CACHE_CREATION_TOKENS), int, 0),
        "cache_read_tokens": _safe_cast(attrs.get(OTEL_GENAI_USAGE_CACHE_READ_TOKENS), int, 0),
        "error_type": attrs.get(OTEL_ERROR_TYPE),
    }


def extract_tool_call_from_attrs(
attrs: dict[str, Any], operation_name: str = "", span_id: str = ""
) -> dict[str, Any] | None:
Expand Down Expand Up @@ -171,7 +242,17 @@ def extract_tool_call_from_attrs(
if fallback_id:
tool_call_id = fallback_id

return {"id": tool_call_id, "name": tool_name, "args": args}
result: dict[str, Any] = {"id": tool_call_id, "name": tool_name, "args": args}

tool_type = attrs.get(OTEL_GENAI_TOOL_TYPE)
if tool_type:
result["type"] = tool_type

tool_description = attrs.get(OTEL_GENAI_TOOL_DESCRIPTION)
if tool_description:
result["description"] = tool_description

return result


def parse_tool_response_content(content: Any) -> dict:
Expand Down
49 changes: 47 additions & 2 deletions src/agentevals/streaming/ws_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
WSSpanReceivedEvent,
)
from ..converter import convert_traces
from ..extraction import extract_token_usage_from_attrs, is_llm_span, parse_tool_response_content
from ..extraction import (
extract_extended_model_info_from_attrs,
extract_token_usage_from_attrs,
is_llm_span,
parse_tool_response_content,
)
from ..loader.base import Trace
from ..loader.otlp import OtlpJsonLoader
from ..trace_attrs import OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_REQUEST_MODEL
Expand Down Expand Up @@ -794,8 +799,17 @@ def _extract_model_info_from_trace(self, trace: Trace, invocation_idx: int) -> d
models_used: set[str] = set()
total_input_tokens = 0
total_output_tokens = 0
total_cache_creation_tokens = 0
total_cache_read_tokens = 0
first_provider: str | None = None
response_models: set[str] = set()
finish_reasons: set[str] = set()
error_types: set[str] = set()
first_temperature: float | None = None
first_max_tokens: int | None = None

llm_spans = [s for s in trace.all_spans if is_llm_span(s) or "call_llm" in s.operation_name]
llm_spans.sort(key=lambda s: s.start_time)

for span in llm_spans:
in_toks, out_toks, model = extract_token_usage_from_attrs(span.tags)
Expand All @@ -808,12 +822,43 @@ def _extract_model_info_from_trace(self, trace: Trace, invocation_idx: int) -> d
total_input_tokens += in_toks
total_output_tokens += out_toks

ext = extract_extended_model_info_from_attrs(span.tags)
if first_provider is None and ext["provider"]:
first_provider = ext["provider"]
if ext["response_model"]:
response_models.add(ext["response_model"])
finish_reasons.update(ext["finish_reasons"])
total_cache_creation_tokens += ext["cache_creation_tokens"]
total_cache_read_tokens += ext["cache_read_tokens"]
if ext["error_type"]:
error_types.add(ext["error_type"])
if first_temperature is None and ext["temperature"] is not None:
first_temperature = ext["temperature"]
if first_max_tokens is None and ext["max_tokens"] is not None:
first_max_tokens = ext["max_tokens"]

if models_used:
model_info["models"] = list(models_used)
model_info["models"] = sorted(models_used)
if total_input_tokens > 0:
model_info["inputTokens"] = total_input_tokens
if total_output_tokens > 0:
model_info["outputTokens"] = total_output_tokens
if first_provider:
model_info["provider"] = first_provider
if response_models:
model_info["responseModels"] = sorted(response_models)
if finish_reasons:
model_info["finishReasons"] = sorted(finish_reasons)
if total_cache_creation_tokens > 0:
model_info["cacheCreationTokens"] = total_cache_creation_tokens
if total_cache_read_tokens > 0:
model_info["cacheReadTokens"] = total_cache_read_tokens
if first_temperature is not None:
model_info["temperature"] = first_temperature
if first_max_tokens is not None:
model_info["maxTokens"] = first_max_tokens
if error_types:
model_info["errorTypes"] = sorted(error_types)

return model_info

Expand Down
39 changes: 39 additions & 0 deletions src/agentevals/trace_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

Single source of truth for all attribute names used across the converter,
extraction, streaming, and runner modules.

Covers OTel GenAI semantic conventions up to v1.40.0.
"""

# OTel scope
Expand All @@ -25,6 +27,43 @@
OTEL_GENAI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"
OTEL_GENAI_CONVERSATION_ID = "gen_ai.conversation.id"

# Provider and response metadata (v1.37.0+)
OTEL_GENAI_PROVIDER_NAME = "gen_ai.provider.name"
OTEL_GENAI_RESPONSE_MODEL = "gen_ai.response.model"
OTEL_GENAI_RESPONSE_ID = "gen_ai.response.id"
OTEL_GENAI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"

# Deprecated provider attribute (pre-v1.37.0, renamed to gen_ai.provider.name);
# kept as a read fallback for spans emitted by older instrumentors.
OTEL_GENAI_SYSTEM = "gen_ai.system"

# Agent identity (v1.31.0+)
OTEL_GENAI_AGENT_ID = "gen_ai.agent.id"
OTEL_GENAI_AGENT_DESCRIPTION = "gen_ai.agent.description"

# Tool metadata (v1.31.0+)
OTEL_GENAI_TOOL_DESCRIPTION = "gen_ai.tool.description"
OTEL_GENAI_TOOL_TYPE = "gen_ai.tool.type"

# Error classification (general OTel attribute — note it has no gen_ai prefix)
OTEL_ERROR_TYPE = "error.type"

# Request parameters
OTEL_GENAI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
OTEL_GENAI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
OTEL_GENAI_REQUEST_TOP_P = "gen_ai.request.top_p"
OTEL_GENAI_REQUEST_TOP_K = "gen_ai.request.top_k"

# Cache token usage (Anthropic/OpenAI prompt caching)
OTEL_GENAI_USAGE_CACHE_CREATION_TOKENS = "gen_ai.usage.cache_creation.input_tokens"
OTEL_GENAI_USAGE_CACHE_READ_TOKENS = "gen_ai.usage.cache_read.input_tokens"

# System/tool definitions (opt-in, v1.37.0+)
OTEL_GENAI_SYSTEM_INSTRUCTIONS = "gen_ai.system_instructions"
OTEL_GENAI_TOOL_DEFINITIONS = "gen_ai.tool.definitions"

# Output type
OTEL_GENAI_OUTPUT_TYPE = "gen_ai.output.type"

# ADK-specific custom attributes (gcp.vertex.agent.*)
ADK_LLM_REQUEST = "gcp.vertex.agent.llm_request"
ADK_LLM_RESPONSE = "gcp.vertex.agent.llm_response"
Expand Down
47 changes: 40 additions & 7 deletions src/agentevals/trace_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@

from .extraction import (
extract_agent_response_from_attrs,
extract_extended_model_info_from_attrs,
extract_token_usage_from_attrs,
extract_user_text_from_attrs,
get_extractor,
)
from .trace_attrs import OTEL_GENAI_AGENT_NAME, OTEL_GENAI_REQUEST_MODEL
from .trace_attrs import (
OTEL_GENAI_AGENT_ID,
OTEL_GENAI_AGENT_NAME,
OTEL_GENAI_REQUEST_MODEL,
)


def _truncate(text: str, max_length: int = 200) -> str:
Expand Down Expand Up @@ -41,6 +46,8 @@ def extract_performance_metrics(trace, extractor=None) -> dict[str, Any]:
prompt_tokens = []
output_tokens = []
total_tokens = []
cache_creation_tokens_total = 0
cache_read_tokens_total = 0

if extractor is None:
extractor = get_extractor(trace)
Expand All @@ -64,29 +71,41 @@ def extract_performance_metrics(trace, extractor=None) -> dict[str, Any]:
prompt_tokens.append(in_toks)
output_tokens.append(out_toks)
total_tokens.append(in_toks + out_toks)
ext = extract_extended_model_info_from_attrs(span.tags)
cache_creation_tokens_total += ext["cache_creation_tokens"]
cache_read_tokens_total += ext["cache_read_tokens"]
elif role == "tool":
tool_latencies.append(duration_ms)

tokens_info: dict[str, Any] = {
"total_prompt": sum(prompt_tokens) if prompt_tokens else 0,
"total_output": sum(output_tokens) if output_tokens else 0,
"total": sum(total_tokens) if total_tokens else 0,
"per_llm_call": _calc_percentiles(total_tokens) if total_tokens else {"p50": 0.0, "p95": 0.0, "p99": 0.0},
}
if cache_creation_tokens_total > 0:
tokens_info["cache_creation_tokens"] = cache_creation_tokens_total
if cache_read_tokens_total > 0:
tokens_info["cache_read_tokens"] = cache_read_tokens_total

return {
"latency": {
"overall": _calc_percentiles(agent_latencies),
"llm_calls": _calc_percentiles(llm_latencies),
"tool_executions": _calc_percentiles(tool_latencies),
},
"tokens": {
"total_prompt": sum(prompt_tokens) if prompt_tokens else 0,
"total_output": sum(output_tokens) if output_tokens else 0,
"total": sum(total_tokens) if total_tokens else 0,
"per_llm_call": _calc_percentiles(total_tokens) if total_tokens else {"p50": 0.0, "p95": 0.0, "p99": 0.0},
},
"tokens": tokens_info,
}


def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
"""Extract agent name, model, timing, and preview text from a trace."""
metadata: dict[str, Any] = {
"agent_name": None,
"agent_id": None,
"model": None,
"response_model": None,
"provider": None,
"start_time": None,
"user_input_preview": None,
"final_output_preview": None,
Expand All @@ -99,12 +118,19 @@ def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
if invocation_spans:
first_inv = invocation_spans[0]
metadata["agent_name"] = first_inv.get_tag(OTEL_GENAI_AGENT_NAME)
metadata["agent_id"] = first_inv.get_tag(OTEL_GENAI_AGENT_ID)
metadata["start_time"] = first_inv.start_time

llm_spans = extractor.find_llm_spans_in(first_inv)
if llm_spans:
metadata["model"] = llm_spans[0].get_tag(OTEL_GENAI_REQUEST_MODEL)

ext = extract_extended_model_info_from_attrs(llm_spans[0].tags)
if ext["response_model"]:
metadata["response_model"] = ext["response_model"]
if ext["provider"]:
metadata["provider"] = ext["provider"]

user_text = extract_user_text_from_attrs(llm_spans[0].tags)
if user_text:
metadata["user_input_preview"] = _truncate(user_text)
Expand All @@ -123,4 +149,11 @@ def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
metadata["model"] = model
break

if not metadata["provider"]:
for span in trace.all_spans:
ext = extract_extended_model_info_from_attrs(span.tags)
if ext["provider"]:
metadata["provider"] = ext["provider"]
break

return metadata
Loading