Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/zero-code-examples/openai-agents/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
openai-agents>=0.3.3
openai>=2.30.0
openai-agents>=0.13.0
opentelemetry-instrumentation-openai-agents-v2>=0.1.0

opentelemetry-sdk>=1.36.0
Expand Down
1 change: 1 addition & 0 deletions examples/zero-code-examples/openai-agents/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def main():
print(f"OTLP endpoint: {endpoint}")

os.environ.setdefault("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "span_and_event")
os.environ.setdefault("OTEL_SEMCONV_STABILITY_OPT_IN", "gen_ai_latest_experimental")

os.environ.setdefault(
"OTEL_RESOURCE_ATTRIBUTES",
Expand Down
3 changes: 3 additions & 0 deletions src/agentevals/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ class DebugLoadData(CamelModel):

class TraceConversionMetadata(CamelModel):
agent_name: str | None = None
agent_id: str | None = None
model: str | None = None
response_model: str | None = None
provider: str | None = None
start_time: int | None = None
user_input_preview: str | None = None
final_output_preview: str | None = None
Expand Down
85 changes: 83 additions & 2 deletions src/agentevals/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import json
import logging
from typing import Any, Protocol
from typing import Any, Protocol, TypedDict, TypeVar

from .loader.base import Span, Trace
from .trace_attrs import (
Expand All @@ -22,14 +22,26 @@
ADK_SCOPE_VALUE,
ADK_TOOL_CALL_ARGS,
ADK_TOOL_RESPONSE,
OTEL_ERROR_TYPE,
OTEL_GENAI_INPUT_MESSAGES,
OTEL_GENAI_OP,
OTEL_GENAI_OUTPUT_MESSAGES,
OTEL_GENAI_PROVIDER_NAME,
OTEL_GENAI_REQUEST_MAX_TOKENS,
OTEL_GENAI_REQUEST_MODEL,
OTEL_GENAI_REQUEST_TEMPERATURE,
OTEL_GENAI_RESPONSE_FINISH_REASONS,
OTEL_GENAI_RESPONSE_ID,
OTEL_GENAI_RESPONSE_MODEL,
OTEL_GENAI_SYSTEM,
OTEL_GENAI_TOOL_CALL_ARGUMENTS,
OTEL_GENAI_TOOL_CALL_ID,
OTEL_GENAI_TOOL_CALL_RESULT,
OTEL_GENAI_TOOL_DESCRIPTION,
OTEL_GENAI_TOOL_NAME,
OTEL_GENAI_TOOL_TYPE,
OTEL_GENAI_USAGE_CACHE_CREATION_TOKENS,
OTEL_GENAI_USAGE_CACHE_READ_TOKENS,
OTEL_GENAI_USAGE_INPUT_TOKENS,
OTEL_GENAI_USAGE_OUTPUT_TOKENS,
OTEL_SCOPE,
Expand Down Expand Up @@ -139,6 +151,65 @@ def extract_token_usage_from_attrs(
return 0, 0, model


_T = TypeVar("_T", int, float)


def _safe_cast(value: Any, target_type: type[_T], default: _T | None = None) -> _T | None:
"""Try to cast *value* to *target_type*, returning *default* on failure."""
if value is None:
return default
try:
return target_type(value)
except (TypeError, ValueError):
return default


def _parse_finish_reasons(raw: Any) -> list[str]:
"""Parse finish reasons from a list, JSON string, or plain string."""
if isinstance(raw, list):
return [str(r) for r in raw]
if isinstance(raw, str):
parsed = parse_json(raw)
if isinstance(parsed, list):
return [str(r) for r in parsed]
if raw:
return [raw]
return []


class ExtendedModelInfo(TypedDict):
    """Extended model and provider metadata extracted from one span's attributes.

    Returned by ``extract_extended_model_info_from_attrs``; every key is
    always present, with ``None``, ``0``, or ``[]`` when the corresponding
    span attribute is absent.
    """

    # gen_ai.request.model — model name sent in the request.
    request_model: str | None
    # gen_ai.response.model — model name reported back by the provider.
    response_model: str | None
    # gen_ai.provider.name, falling back to the deprecated gen_ai.system.
    provider: str | None
    # Normalized gen_ai.response.finish_reasons; empty list when absent.
    finish_reasons: list[str]
    # gen_ai.response.id.
    response_id: str | None
    # gen_ai.request.temperature, cast to float when possible.
    temperature: float | None
    # gen_ai.request.max_tokens, cast to int when possible.
    max_tokens: int | None
    # Prompt-cache token counts; default to 0 (not None) so they can be summed.
    cache_creation_tokens: int
    cache_read_tokens: int
    # error.type attribute, when the span recorded a failure.
    error_type: str | None


def extract_extended_model_info_from_attrs(attrs: dict[str, Any]) -> ExtendedModelInfo:
    """Extract extended model and provider metadata from span attributes.

    Every key of the returned ``ExtendedModelInfo`` is populated; missing
    attributes yield ``None`` (or ``0`` for cache token counts and ``[]``
    for finish reasons).

    Uses gen_ai.system as fallback for provider when gen_ai.provider.name is
    absent (backward compat with pre-v1.37.0 instrumentors).
    """
    # NOTE(review): the merged diff had GitHub review-UI text pasted inside
    # this dict literal; this is the cleaned-up body.
    return {
        "request_model": attrs.get(OTEL_GENAI_REQUEST_MODEL),
        "response_model": attrs.get(OTEL_GENAI_RESPONSE_MODEL),
        # Prefer the v1.37.0+ attribute; fall back to the deprecated one.
        "provider": attrs.get(OTEL_GENAI_PROVIDER_NAME) or attrs.get(OTEL_GENAI_SYSTEM),
        "finish_reasons": _parse_finish_reasons(attrs.get(OTEL_GENAI_RESPONSE_FINISH_REASONS)),
        "response_id": attrs.get(OTEL_GENAI_RESPONSE_ID),
        "temperature": _safe_cast(attrs.get(OTEL_GENAI_REQUEST_TEMPERATURE), float),
        "max_tokens": _safe_cast(attrs.get(OTEL_GENAI_REQUEST_MAX_TOKENS), int),
        "cache_creation_tokens": _safe_cast(attrs.get(OTEL_GENAI_USAGE_CACHE_CREATION_TOKENS), int, 0),
        "cache_read_tokens": _safe_cast(attrs.get(OTEL_GENAI_USAGE_CACHE_READ_TOKENS), int, 0),
        "error_type": attrs.get(OTEL_ERROR_TYPE),
    }


def extract_tool_call_from_attrs(
attrs: dict[str, Any], operation_name: str = "", span_id: str = ""
) -> dict[str, Any] | None:
Expand Down Expand Up @@ -171,7 +242,17 @@ def extract_tool_call_from_attrs(
if fallback_id:
tool_call_id = fallback_id

return {"id": tool_call_id, "name": tool_name, "args": args}
result: dict[str, Any] = {"id": tool_call_id, "name": tool_name, "args": args}

tool_type = attrs.get(OTEL_GENAI_TOOL_TYPE)
if tool_type:
result["type"] = tool_type

tool_description = attrs.get(OTEL_GENAI_TOOL_DESCRIPTION)
if tool_description:
result["description"] = tool_description

return result


def parse_tool_response_content(content: Any) -> dict:
Expand Down
49 changes: 47 additions & 2 deletions src/agentevals/streaming/ws_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
WSSpanReceivedEvent,
)
from ..converter import convert_traces
from ..extraction import extract_token_usage_from_attrs, is_llm_span, parse_tool_response_content
from ..extraction import (
extract_extended_model_info_from_attrs,
extract_token_usage_from_attrs,
is_llm_span,
parse_tool_response_content,
)
from ..loader.base import Trace
from ..loader.otlp import OtlpJsonLoader
from ..trace_attrs import OTEL_GENAI_INPUT_MESSAGES, OTEL_GENAI_REQUEST_MODEL
Expand Down Expand Up @@ -794,8 +799,17 @@ def _extract_model_info_from_trace(self, trace: Trace, invocation_idx: int) -> d
models_used: set[str] = set()
total_input_tokens = 0
total_output_tokens = 0
total_cache_creation_tokens = 0
total_cache_read_tokens = 0
first_provider: str | None = None
response_models: set[str] = set()
finish_reasons: set[str] = set()
error_types: set[str] = set()
first_temperature: float | None = None
first_max_tokens: int | None = None

llm_spans = [s for s in trace.all_spans if is_llm_span(s) or "call_llm" in s.operation_name]
llm_spans.sort(key=lambda s: s.start_time)

for span in llm_spans:
in_toks, out_toks, model = extract_token_usage_from_attrs(span.tags)
Expand All @@ -808,12 +822,43 @@ def _extract_model_info_from_trace(self, trace: Trace, invocation_idx: int) -> d
total_input_tokens += in_toks
total_output_tokens += out_toks

ext = extract_extended_model_info_from_attrs(span.tags)
if first_provider is None and ext["provider"]:
first_provider = ext["provider"]
if ext["response_model"]:
response_models.add(ext["response_model"])
finish_reasons.update(ext["finish_reasons"])
total_cache_creation_tokens += ext["cache_creation_tokens"]
total_cache_read_tokens += ext["cache_read_tokens"]
if ext["error_type"]:
error_types.add(ext["error_type"])
if first_temperature is None and ext["temperature"] is not None:
first_temperature = ext["temperature"]
if first_max_tokens is None and ext["max_tokens"] is not None:
first_max_tokens = ext["max_tokens"]

if models_used:
model_info["models"] = list(models_used)
model_info["models"] = sorted(models_used)
if total_input_tokens > 0:
model_info["inputTokens"] = total_input_tokens
if total_output_tokens > 0:
model_info["outputTokens"] = total_output_tokens
if first_provider:
model_info["provider"] = first_provider
if response_models:
model_info["responseModels"] = sorted(response_models)
if finish_reasons:
model_info["finishReasons"] = sorted(finish_reasons)
if total_cache_creation_tokens > 0:
model_info["cacheCreationTokens"] = total_cache_creation_tokens
if total_cache_read_tokens > 0:
model_info["cacheReadTokens"] = total_cache_read_tokens
if first_temperature is not None:
model_info["temperature"] = first_temperature
if first_max_tokens is not None:
model_info["maxTokens"] = first_max_tokens
if error_types:
model_info["errorTypes"] = sorted(error_types)

return model_info

Expand Down
39 changes: 39 additions & 0 deletions src/agentevals/trace_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

Single source of truth for all attribute names used across the converter,
extraction, streaming, and runner modules.

Covers OTel GenAI semantic conventions up to v1.40.0.
"""

# OTel scope
Expand All @@ -25,6 +27,43 @@
OTEL_GENAI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"
OTEL_GENAI_CONVERSATION_ID = "gen_ai.conversation.id"

# Provider and response metadata (v1.37.0+)
OTEL_GENAI_PROVIDER_NAME = "gen_ai.provider.name"
OTEL_GENAI_RESPONSE_MODEL = "gen_ai.response.model"
OTEL_GENAI_RESPONSE_ID = "gen_ai.response.id"
OTEL_GENAI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"

# Deprecated provider attribute (pre-v1.37.0, renamed to gen_ai.provider.name);
# kept as a read fallback for spans emitted by older instrumentors.
OTEL_GENAI_SYSTEM = "gen_ai.system"

# Agent identity (v1.31.0+)
OTEL_GENAI_AGENT_ID = "gen_ai.agent.id"
OTEL_GENAI_AGENT_DESCRIPTION = "gen_ai.agent.description"

# Tool metadata (v1.31.0+)
OTEL_GENAI_TOOL_DESCRIPTION = "gen_ai.tool.description"
OTEL_GENAI_TOOL_TYPE = "gen_ai.tool.type"

# Error classification (general OTel attribute — note it has no gen_ai prefix)
OTEL_ERROR_TYPE = "error.type"

# Request parameters
OTEL_GENAI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
OTEL_GENAI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
OTEL_GENAI_REQUEST_TOP_P = "gen_ai.request.top_p"
OTEL_GENAI_REQUEST_TOP_K = "gen_ai.request.top_k"

# Cache token usage (Anthropic/OpenAI prompt caching)
OTEL_GENAI_USAGE_CACHE_CREATION_TOKENS = "gen_ai.usage.cache_creation.input_tokens"
OTEL_GENAI_USAGE_CACHE_READ_TOKENS = "gen_ai.usage.cache_read.input_tokens"

# System/tool definitions (opt-in, v1.37.0+)
OTEL_GENAI_SYSTEM_INSTRUCTIONS = "gen_ai.system_instructions"
OTEL_GENAI_TOOL_DEFINITIONS = "gen_ai.tool.definitions"

# Output type
OTEL_GENAI_OUTPUT_TYPE = "gen_ai.output.type"

# ADK-specific custom attributes (gcp.vertex.agent.*)
ADK_LLM_REQUEST = "gcp.vertex.agent.llm_request"
ADK_LLM_RESPONSE = "gcp.vertex.agent.llm_response"
Expand Down
47 changes: 40 additions & 7 deletions src/agentevals/trace_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@

from .extraction import (
extract_agent_response_from_attrs,
extract_extended_model_info_from_attrs,
extract_token_usage_from_attrs,
extract_user_text_from_attrs,
get_extractor,
)
from .trace_attrs import OTEL_GENAI_AGENT_NAME, OTEL_GENAI_REQUEST_MODEL
from .trace_attrs import (
OTEL_GENAI_AGENT_ID,
OTEL_GENAI_AGENT_NAME,
OTEL_GENAI_REQUEST_MODEL,
)


def _truncate(text: str, max_length: int = 200) -> str:
Expand Down Expand Up @@ -41,6 +46,8 @@ def extract_performance_metrics(trace, extractor=None) -> dict[str, Any]:
prompt_tokens = []
output_tokens = []
total_tokens = []
cache_creation_tokens_total = 0
cache_read_tokens_total = 0

if extractor is None:
extractor = get_extractor(trace)
Expand All @@ -64,29 +71,41 @@ def extract_performance_metrics(trace, extractor=None) -> dict[str, Any]:
prompt_tokens.append(in_toks)
output_tokens.append(out_toks)
total_tokens.append(in_toks + out_toks)
ext = extract_extended_model_info_from_attrs(span.tags)
cache_creation_tokens_total += ext["cache_creation_tokens"]
cache_read_tokens_total += ext["cache_read_tokens"]
elif role == "tool":
tool_latencies.append(duration_ms)

tokens_info: dict[str, Any] = {
"total_prompt": sum(prompt_tokens) if prompt_tokens else 0,
"total_output": sum(output_tokens) if output_tokens else 0,
"total": sum(total_tokens) if total_tokens else 0,
"per_llm_call": _calc_percentiles(total_tokens) if total_tokens else {"p50": 0.0, "p95": 0.0, "p99": 0.0},
}
if cache_creation_tokens_total > 0:
tokens_info["cache_creation_tokens"] = cache_creation_tokens_total
if cache_read_tokens_total > 0:
tokens_info["cache_read_tokens"] = cache_read_tokens_total

return {
"latency": {
"overall": _calc_percentiles(agent_latencies),
"llm_calls": _calc_percentiles(llm_latencies),
"tool_executions": _calc_percentiles(tool_latencies),
},
"tokens": {
"total_prompt": sum(prompt_tokens) if prompt_tokens else 0,
"total_output": sum(output_tokens) if output_tokens else 0,
"total": sum(total_tokens) if total_tokens else 0,
"per_llm_call": _calc_percentiles(total_tokens) if total_tokens else {"p50": 0.0, "p95": 0.0, "p99": 0.0},
},
"tokens": tokens_info,
}


def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
"""Extract agent name, model, timing, and preview text from a trace."""
metadata: dict[str, Any] = {
"agent_name": None,
"agent_id": None,
"model": None,
"response_model": None,
"provider": None,
"start_time": None,
"user_input_preview": None,
"final_output_preview": None,
Expand All @@ -99,12 +118,19 @@ def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
if invocation_spans:
first_inv = invocation_spans[0]
metadata["agent_name"] = first_inv.get_tag(OTEL_GENAI_AGENT_NAME)
metadata["agent_id"] = first_inv.get_tag(OTEL_GENAI_AGENT_ID)
metadata["start_time"] = first_inv.start_time

llm_spans = extractor.find_llm_spans_in(first_inv)
if llm_spans:
metadata["model"] = llm_spans[0].get_tag(OTEL_GENAI_REQUEST_MODEL)

ext = extract_extended_model_info_from_attrs(llm_spans[0].tags)
if ext["response_model"]:
metadata["response_model"] = ext["response_model"]
if ext["provider"]:
metadata["provider"] = ext["provider"]

user_text = extract_user_text_from_attrs(llm_spans[0].tags)
if user_text:
metadata["user_input_preview"] = _truncate(user_text)
Expand All @@ -123,4 +149,11 @@ def extract_trace_metadata(trace, extractor=None) -> dict[str, Any]:
metadata["model"] = model
break

if not metadata["provider"]:
for span in trace.all_spans:
ext = extract_extended_model_info_from_attrs(span.tags)
if ext["provider"]:
metadata["provider"] = ext["provider"]
break

return metadata
Loading