diff --git a/noveum_customer_support_bt/README_workflow.md b/noveum_customer_support_bt/README_workflow.md new file mode 100644 index 0000000..db7a8b8 --- /dev/null +++ b/noveum_customer_support_bt/README_workflow.md @@ -0,0 +1,81 @@ +# Noveum Platform Score Upload Workflow + +## Prerequisites +- Virtual environment activated +- .env file configured with API credentials +- Processed dataset available +- Evaluation results CSV available + +## Step-by-Step Commands + +### Step 1: Setup Environment +```bash +cd /Users/mramanindia/work/NovaEval +source .venv/bin/activate +cd noveum_customer_support_bt +``` + +### Step 2: Create Dataset +```bash +python create_dataset.py --dataset-type agent --description "Customer Support Agent Evaluation Dataset" --pretty +``` +**Note**: After this step, update your .env file with the returned dataset slug if different. + +### Step 3: Create Dataset Version +```bash +python create_dataset_version.py --pretty +``` + +### Step 4: Upload Dataset Items +```bash +python upload_dataset.py --dataset-json processed_datasets/agent.rag_evaluation_metrics_dataset_processed_dataset.json --item-type conversation +``` + +### Step 5: Publish Dataset Version +```bash +python publish_dataset_version.py --pretty +``` + +### Step 6: Upload Evaluation Scores + +#### Option A: Upload All Scores Separately +```bash +# Task Progression Scores +python upload_scores.py demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv --item-key-col turn_id --score-col task_progression --reasoning-col task_progression_reasoning --scorer-id task_progression_scorer --scorer-version 1.0.0 + +# Context Relevancy Scores +python upload_scores.py demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv --item-key-col turn_id --score-col context_relevancy --reasoning-col context_relevancy_reasoning --scorer-id context_relevancy_scorer --scorer-version 1.0.0 + +# Role Adherence Scores +python upload_scores.py 
demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv --item-key-col turn_id --score-col role_adherence --reasoning-col role_adherence_reasoning --scorer-id role_adherence_scorer --scorer-version 1.0.0 + +# Tool Relevancy Scores +python upload_scores.py demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv --item-key-col turn_id --score-col tool_relevancy --reasoning-col tool_relevancy_reasoning --scorer-id tool_relevancy_scorer --scorer-version 1.0.0 + +# Parameter Correctness Scores +python upload_scores.py demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv --item-key-col turn_id --score-col parameter_correctness --reasoning-col parameter_correctness_reasoning --scorer-id parameter_correctness_scorer --scorer-version 1.0.0 +``` + +#### Option B: Test with Dry Run First +Add `--dry-run` flag to any upload command to test without actually uploading: +```bash +python upload_scores.py demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv --item-key-col turn_id --score-col task_progression --reasoning-col task_progression_reasoning --scorer-id task_progression_scorer --scorer-version 1.0.0 --dry-run +``` + +## Environment Variables Required +Make sure your .env file contains: +- NOVEUM_API_KEY +- NOVEUM_ORG_SLUG +- NOVEUM_DATASET_SLUG +- NOVEUM_DATASET_NAME +- LATEST_VERSION +- NOVEUM_PROJECT +- NOVEUM_ENVIRONMENT +- BETA (true/false) + +## Troubleshooting +- If dataset creation fails, check if dataset already exists +- If upload fails, verify the JSON format matches expected schema +- Use --pretty flag for better formatted output +- Check API responses for specific error messages + diff --git a/noveum_customer_support_bt/api_data.json b/noveum_customer_support_bt/api_data.json new file mode 100644 index 0000000..5bc3e2d --- /dev/null +++ b/noveum_customer_support_bt/api_data.json @@ -0,0 +1,84 @@ +{ + "items": [ + { + "item_key": 
"eda4fe22-9a2b-4b73-856b-f4f3309bf719", + "item_id": "item_1" + }, + { + "item_key": "0ffffba1-8a37-443c-8866-d53ffbfa7718", + "item_id": "item_2" + }, + { + "item_key": "f1f37bd7-0851-4659-b493-b80d3800d920", + "item_id": "item_3" + }, + { + "item_key": "43cdf081-4f01-49cd-b566-dbd1619e6cd2", + "item_id": "item_4" + }, + { + "item_key": "9a1983f4-09da-4b53-80e6-38de6878e0e7", + "item_id": "item_5" + }, + { + "item_key": "5d2517e1-220a-429d-9d59-f701bda25eed", + "item_id": "item_6" + }, + { + "item_key": "52aacb67-c361-4445-9b72-c157f79f47d6", + "item_id": "item_7" + }, + { + "item_key": "a81ca3a8-80aa-4c39-876e-8d40ea7a0aef", + "item_id": "item_8" + }, + { + "item_key": "230aad27-f3dd-4968-a45a-3c2f07ac28ed", + "item_id": "item_9" + }, + { + "item_key": "83c7dcce-3d89-4da1-8b3f-d419885d4cbc", + "item_id": "item_10" + }, + { + "item_key": "2218f641-604c-491a-9710-b51a9941b982", + "item_id": "item_11" + }, + { + "item_key": "255fd49c-84b4-4b18-887e-6308a412d535", + "item_id": "item_12" + }, + { + "item_key": "dc511122-c0b6-415c-9a49-c7b45132dd87", + "item_id": "item_13" + }, + { + "item_key": "04bebf38-a343-4563-80db-0154bef8d927", + "item_id": "item_14" + }, + { + "item_key": "5e043630-6493-42b5-beb8-79faa19bfa37", + "item_id": "item_15" + }, + { + "item_key": "7da9814d-a2e8-4c4e-b750-68b26bd5fd22", + "item_id": "item_16" + }, + { + "item_key": "16143f74-2831-4753-b33d-ce4b645093c5", + "item_id": "item_17" + }, + { + "item_key": "fc64e6cc-6739-4256-ac4a-7b80c3028233", + "item_id": "item_18" + }, + { + "item_key": "b7945c49-f584-4c70-972d-536a805d8a31", + "item_id": "item_19" + }, + { + "item_key": "f5c40ecf-36c0-45ba-9cc9-dc0329b0324b", + "item_id": "item_20" + } + ] +} \ No newline at end of file diff --git a/noveum_customer_support_bt/create_dataset.py b/noveum_customer_support_bt/create_dataset.py new file mode 100644 index 0000000..9aaf601 --- /dev/null +++ b/noveum_customer_support_bt/create_dataset.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +""" +Script to 
# Noveum API settings, read from the environment once at import time.
api_key = os.getenv('NOVEUM_API_KEY')
org_slug = os.getenv('NOVEUM_ORG_SLUG')
dataset_slug = os.getenv('NOVEUM_DATASET_SLUG')
# The display name has its own variable (NOVEUM_DATASET_NAME). It used to be
# read from NOVEUM_DATASET_SLUG, so the variable that validate_environment()
# reports as required was never consulted. Fall back to the slug so .env files
# that only define the slug keep working.
dataset_name = os.getenv('NOVEUM_DATASET_NAME') or dataset_slug
beta_env = os.getenv('BETA', 'false').lower() == 'true'


def validate_environment():
    """Check that every setting needed to create a dataset is present.

    Prints the names of any variables that are unset or empty.

    Returns:
        True when all required settings are non-empty, otherwise False.
    """
    required_vars = {
        'NOVEUM_API_KEY': api_key,
        'NOVEUM_ORG_SLUG': org_slug,
        'NOVEUM_DATASET_SLUG': dataset_slug,
        'NOVEUM_DATASET_NAME': dataset_name,
    }

    missing_vars = [var for var, value in required_vars.items() if not value]

    if missing_vars:
        print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
        print("Please set these variables in your .env file or environment")
        return False

    return True
def create_dataset(dataset_type: str, description: str = "", visibility: str = "org", environment: str = "") -> Optional[Dict[str, Any]]:
    """Create a new dataset through the Noveum API.

    Connection settings (API key, organization slug, dataset name and slug,
    BETA switch) are taken from the module-level values read from the
    environment.

    Args:
        dataset_type: Value sent as ``dataset_type``.
        description: Free-text description of the dataset.
        visibility: Visibility value sent to the API.
        environment: Environment label sent to the API.

    Returns:
        The decoded JSON response, or None when the request fails or the
        response body is not valid JSON.
    """
    # With BETA enabled the un-scoped endpoint is used; otherwise the
    # organization slug is part of the path.
    if beta_env:
        api_url = "https://noveum.ai/api/v1/datasets"
    else:
        api_url = f"https://noveum.ai/api/v1/organizations/{org_slug}/datasets"

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'Cookie': f'apiKeyCookie={api_key}',
    }

    request_data = {
        "name": dataset_name,
        # Send the configured slug rather than reusing the display name.
        "slug": dataset_slug,
        "description": description,
        "visibility": visibility,
        "dataset_type": dataset_type,
        "environment": environment,
    }

    print(f"Creating dataset at: {api_url}")
    print(f"Organization: {org_slug}")
    print(f"Dataset name: {dataset_name}")
    print(f"Dataset type: {dataset_type}")
    print(f"Description: {description}")
    print(f"Visibility: {visibility}")
    print(f"Environment: {environment}")

    try:
        response = requests.post(api_url, headers=headers, json=request_data, timeout=30)
        response.raise_for_status()

        data = response.json()
        print("Successfully created dataset")
        print(f"Response status: {response.status_code}")

        return data

    # ValueError covers a non-JSON body on requests versions whose
    # JSONDecodeError does not derive from RequestException.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error creating dataset: {e}")
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f"Response status: {error_response.status_code}")
            print(f"Response text: {error_response.text}")
        return None
def main():
    """Command-line entry point: create the dataset and save the API response.

    Returns:
        Process exit code: 0 on success, 1 when environment validation, the
        API call, or writing the output file fails.
    """
    valid_dataset_types = ['agent', 'conversational', 'g-eval', 'custom']

    parser = argparse.ArgumentParser(description='Create a new dataset in Noveum API')
    parser.add_argument('--dataset-type', type=str, default='agent',
                        choices=valid_dataset_types,
                        help=f'Type of the dataset. Must be one of: {", ".join(valid_dataset_types)} (default: agent)')
    parser.add_argument('--description', type=str, default="",
                        help='Description of the dataset (default: empty string)')
    parser.add_argument('--visibility', type=str, default="org",
                        help='Visibility of the dataset (default: org)')
    parser.add_argument('--environment', type=str, default="",
                        help='Environment for the dataset (default: empty string)')
    parser.add_argument('--pretty', action='store_true',
                        help='Pretty print the JSON response')
    parser.add_argument('--output', type=str, default="dataset_creation_response.json",
                        help='Output file to save the JSON response (default: dataset_creation_response.json)')

    args = parser.parse_args()

    # Remind the user up front that the slug returned by the API has to be
    # copied back into the .env file.
    print("⚠️ WARNING: Please update the dataset slug in your .env file after creating the dataset!")
    print(" The API will return a slug that you should set as NOVEUM_DATASET_SLUG in your .env file.")
    print()

    if not validate_environment():
        return 1

    data = create_dataset(
        dataset_type=args.dataset_type,
        description=args.description,
        visibility=args.visibility,
        environment=args.environment,
    )

    if data is None:
        return 1

    # IOError is an alias of OSError, so catching OSError is sufficient.
    try:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f"\nResponse saved to: {args.output}")
    except OSError as e:
        print(f"Error saving response to file: {e}")
        return 1

    if args.pretty:
        print("\nResponse data:")
        print(json.dumps(data, indent=2))
    else:
        print(f"\nResponse data: {json.dumps(data)}")

    # The slug is looked up at the top level first, then under "data".
    candidates = [data, data.get('data')] if isinstance(data, dict) else []
    for candidate in candidates:
        if isinstance(candidate, dict) and 'slug' in candidate:
            print("\nπŸ“ IMPORTANT: Update your .env file with:")
            print(f" NOVEUM_DATASET_SLUG={candidate['slug']}")
            break

    return 0


if __name__ == "__main__":
    # SystemExit instead of exit(): the latter is injected by the site module
    # for interactive use and is not guaranteed to exist in every runtime.
    raise SystemExit(main())
# Connection settings for the version-creation script, read at import time.
api_key = os.environ.get('NOVEUM_API_KEY')
org_slug = os.environ.get('NOVEUM_ORG_SLUG')
dataset_slug = os.environ.get('NOVEUM_DATASET_SLUG')
latest_version = os.environ.get('LATEST_VERSION')
beta_env = os.environ.get('BETA', 'false').lower() == 'true'


def validate_environment():
    """Report whether every setting needed to create a version is present.

    Prints the names of unset or empty variables.

    Returns:
        True when nothing is missing, otherwise False.
    """
    settings = (
        ('NOVEUM_API_KEY', api_key),
        ('NOVEUM_ORG_SLUG', org_slug),
        ('NOVEUM_DATASET_SLUG', dataset_slug),
        ('LATEST_VERSION', latest_version),
    )

    absent = []
    for label, setting in settings:
        if not setting:
            absent.append(label)

    if not absent:
        return True

    print(f"Error: Missing required environment variables: {', '.join(absent)}")
    print("Please set these variables in your .env file or environment")
    return False
def create_dataset_version(version: str) -> Optional[Dict[str, Any]]:
    """Create a new version of the configured dataset through the Noveum API.

    The API key, organization slug, dataset slug and BETA switch come from
    the module-level values read from the environment.

    Args:
        version: Version string sent as ``version``.

    Returns:
        The decoded JSON response, or None when the request fails or the
        response body is not valid JSON.
    """
    # With BETA enabled the organization is passed as a query parameter;
    # otherwise it is part of the path.
    if beta_env:
        api_url = f"https://noveum.ai/api/v1/datasets/{dataset_slug}/versions?organizationSlug={org_slug}"
    else:
        api_url = f"https://noveum.ai/api/v1/organizations/{org_slug}/datasets/{dataset_slug}/versions"

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'Cookie': f'apiKeyCookie={api_key}',
    }

    request_data = {
        "version": version,
    }

    print(f"Creating dataset version at: {api_url}")
    print(f"Organization: {org_slug}")
    print(f"Dataset: {dataset_slug}")
    print(f"Version: {version}")

    try:
        response = requests.post(api_url, headers=headers, json=request_data, timeout=30)
        response.raise_for_status()

        data = response.json()
        print("Successfully created dataset version")
        print(f"Response status: {response.status_code}")

        return data

    # ValueError covers a non-JSON body on requests versions whose
    # JSONDecodeError does not derive from RequestException.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error creating dataset version: {e}")
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f"Response status: {error_response.status_code}")
            print(f"Response text: {error_response.text}")
        return None


def main():
    """Command-line entry point: create the version and save the response.

    Returns:
        Process exit code: 0 on success, 1 when environment validation, the
        API call, or writing the output file fails.
    """
    parser = argparse.ArgumentParser(description='Create a new dataset version in Noveum API')
    parser.add_argument('--pretty', action='store_true',
                        help='Pretty print the JSON response')
    parser.add_argument('--output', type=str, default="dataset_version_response.json",
                        help='Output file to save the JSON response (default: dataset_version_response.json)')

    args = parser.parse_args()

    if not validate_environment():
        return 1

    # The version to create is the LATEST_VERSION value from the environment.
    data = create_dataset_version(version=latest_version)

    if data is None:
        return 1

    # IOError is an alias of OSError, so catching OSError is sufficient.
    try:
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f"\nResponse saved to: {args.output}")
    except OSError as e:
        print(f"Error saving response to file: {e}")
        return 1

    if args.pretty:
        print("\nResponse data:")
        print(json.dumps(data, indent=2))
    else:
        print(f"\nResponse data: {json.dumps(data)}")

    return 0


if __name__ == "__main__":
    # SystemExit instead of exit(): the latter is injected by the site module
    # for interactive use and is not guaranteed to exist in every runtime.
    raise SystemExit(main())
def list_dataset_files(directory: str = "split_datasets") -> List[str]:
    """
    List available JSON dataset files in the specified directory.

    Args:
        directory: Directory to search for dataset files

    Returns:
        Names of the entries ending in '.json', sorted alphabetically.
        An empty list when the directory is missing or cannot be read.
    """
    try:
        if not os.path.exists(directory):
            print(f"❌ Directory {directory} does not exist")
            return []

        # os.listdir() returns entries in arbitrary order; sort so the
        # listing is deterministic.
        json_files = sorted(f for f in os.listdir(directory) if f.endswith('.json'))
    except OSError as e:
        print(f"❌ Error listing files: {e}")
        return []

    print(f"πŸ“Š Found {len(json_files)} JSON files in {directory}/:")
    for file in json_files:
        print(f" - {file}")

    return json_files


def load_and_analyze_dataset(file_name: str) -> tuple[List[Dict[str, Any]], Dict[str, int]]:
    """
    Load JSON dataset file and analyze span types.

    Args:
        file_name: Path to the JSON dataset file

    Returns:
        Tuple of (spans_data, span_types_dict), where span_types_dict maps
        each span 'name' ('unknown' when absent) to its number of occurrences.
        ([], {}) when the file is missing, is not valid JSON, or does not
        contain a list of span objects.
    """
    try:
        with open(file_name, 'r', encoding='utf-8') as f:
            spans_data = json.load(f)

        # A JSON object or scalar at the top level is not a span list; reject
        # it before reporting a span count.
        if not isinstance(spans_data, list):
            print(f"❌ Error loading dataset: expected a JSON list of spans, got {type(spans_data).__name__}")
            return [], {}

        print(f"πŸ“Š Loaded {len(spans_data)} spans from {file_name}")
        print("\nπŸ” Available span types:")

        span_types: Dict[str, int] = {}
        for span in spans_data:
            span_name = span.get('name', 'unknown')
            span_types[span_name] = span_types.get(span_name, 0) + 1

        for span_type, count in span_types.items():
            print(f" - {span_type}: {count}")

        return spans_data, span_types

    except FileNotFoundError:
        print(f"❌ File {file_name} not found")
        return [], {}
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        return [], {}
    except Exception as e:
        # Best-effort loader: anything else (e.g. a non-dict span entry) is
        # reported and yields an empty result.
        print(f"❌ Error loading dataset: {e}")
        return [], {}
# Matches "name(params) -> return_type - description". The description ends at
# the next tool signature, at a blank line, or at the end of the text, so that
# prompt text following the tool list is not swallowed into the last tool.
_TOOL_SIGNATURE_RE = re.compile(
    r'(\w+)\(([^)]*)\)\s*->\s*(\w+)\s*-\s*(.+?)(?=\n\w+\(|\n\s*\n|$)',
    re.DOTALL,
)


def parse_tools_from_prompt(prompt: str) -> List[ToolSchema]:
    """
    Parse tool definitions from LLM prompts using regex.

    Expected format: tool_name(param: type = default) -> return_type - description

    Args:
        prompt: Prompt text that may contain tool signatures.

    Returns:
        One ToolSchema per signature found, in order of appearance; an empty
        list when the prompt contains none.
    """
    tools = []
    for tool_name, params_str, return_type, description in _TOOL_SIGNATURE_RE.findall(prompt):
        tools.append(
            ToolSchema(
                name=tool_name,
                description=description.strip(),
                args_schema=parse_params(params_str),
                return_schema={"type": return_type},
            )
        )

    return tools
def _split_params(params_str: str) -> List[str]:
    """Split a parameter list on commas that are not inside brackets or quotes."""
    parts = []
    current = []
    depth = 0
    quote = None
    for ch in params_str:
        if quote:
            if ch == quote:
                quote = None
        elif ch in '"\'':
            quote = ch
        elif ch in '([{':
            depth += 1
        elif ch in ')]}':
            depth = max(depth - 1, 0)
        elif ch == ',' and depth == 0:
            parts.append(''.join(current))
            current = []
            continue
        current.append(ch)
    parts.append(''.join(current))
    return [part.strip() for part in parts if part.strip()]


def parse_params(params_str: str) -> Dict[str, Any]:
    """
    Parse parameter string into schema dictionary.

    Format: param_name: type = default_value

    Commas nested in brackets (``Dict[str, int]``) or quotes (``"a, b"``) do
    not separate parameters. Entries without a ``:`` are ignored.

    Args:
        params_str: Text between the parentheses of a tool signature.

    Returns:
        Mapping of parameter name to {'type': ...}, plus 'default' (with
        surrounding quotes removed) when a default value is given.
    """
    if not params_str.strip():
        return {}

    schema = {}
    for param in _split_params(params_str):
        if ':' not in param:
            continue

        param_name, type_and_default = (piece.strip() for piece in param.split(':', 1))

        if '=' in type_and_default:
            type_part, default_part = type_and_default.split('=', 1)
            schema[param_name] = {
                'type': type_part.strip(),
                'default': default_part.strip().strip('"\''),
            }
        else:
            schema[param_name] = {'type': type_and_default}

    return schema


def identify_span_type(span: Dict[str, Any]) -> str:
    """
    Identify span type based on attributes.

    Checks are applied in order: agent, llm, tool. A span is an agent span
    when it has a 'chain.name' attribute, a known agent span name, or a name
    starting with 'agent.'; an llm span when it has an 'llm.model' attribute;
    a tool span when it has a 'tool.name' attribute or a known tool span name.

    Args:
        span: Span dictionary; 'attributes' and 'name' may be missing or null.

    Returns:
        'agent', 'llm', 'tool' or 'unknown'.
    """
    # "or" also covers keys that are present but null.
    attributes = span.get('attributes') or {}
    span_name = span.get('name') or ''

    agent_span_names = (
        'reddit_agent_run_1', 'reddit_agent_run_2',
        'agent.query_generation', 'agent.comment_generation',
        'agent.rag_evaluation_metrics', 'agent.web_search_evaluation_metrics',
        'agent.query_routing',
    )
    tool_span_names = ('post_validation', 'email_generation_and_sending')

    if ('chain.name' in attributes
            or span_name in agent_span_names
            or span_name.startswith('agent.')):
        return 'agent'

    if 'llm.model' in attributes:
        return 'llm'

    if 'tool.name' in attributes or span_name in tool_span_names:
        return 'tool'

    print('returning unknown type for span')
    print(span)
    return 'unknown'
def map_span_to_agent_data(span: Dict[str, Any], count_unknowns: Optional[Dict[str, int]] = None) -> AgentData:
    """
    Map a single trace span to the AgentData format.

    The span is classified with identify_span_type() and the fields relevant
    to that type (agent, llm or tool) are filled in; all other fields keep
    the defaults set below.

    Args:
        span: Span dictionary. 'attributes', 'events', 'metadata', 'name',
            'trace_id', 'span_id' and 'status' are read when present.
        count_unknowns: Optional mutable counter of the form {'count': n};
            its 'count' entry is incremented for each span of unknown type.

    Returns:
        AgentData built from the mapped fields.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # diff -- confirm against the checked-in file.

    # "or" also covers keys that are present but null.
    attributes = span.get('attributes') or {}
    events = span.get('events') or []
    span_id = span.get('span_id')
    span_type = identify_span_type(span)

    # Base mappings
    data = {
        'user_id': (span.get('metadata') or {}).get('user_id'),
        'task_id': span.get('trace_id'),
        'turn_id': span_id,
        'ground_truth': None,
        'expected_tool_call': None,
        'agent_name': span_type,
        'agent_role': span_type,
        'system_prompt': "You are a helpful customer support agent",
        'metadata': None,
        'exit_status': span.get('status'),
        'tools_available': [],
        'tool_calls': [],
        'parameters_passed': {},
        'tool_call_results': [],
        'retrieval_query': None,
        'retrieved_context': None,
        'agent_exit': False,
        'trace': None
    }

    # Span-specific mappings
    if span_type == 'agent':
        span_name = span.get('name') or ''

        if span_name.startswith('agent.rag_evaluation_metrics'):
            # RAG evaluation spans: no traditional tool calls, only a
            # retrieval query, its context, and the final response.
            data['agent_task'] = attributes.get('input_query', 'RAG evaluation task')
            data['agent_response'] = attributes.get('output_response', '')
            data['retrieval_query'] = [attributes.get('input_query', '')]
            data['retrieved_context'] = [[attributes.get('retrieval.context_retrieved', '')]]
            data['tools_available'] = [
                ToolSchema(
                    name="rag_evaluation",
                    description="Evaluates RAG system performance",
                    args_schema={},
                    return_schema={"type": "evaluation_result"}
                )
            ]
        elif span_name.startswith('agent.web_search_evaluation_metrics'):
            # Web search evaluation spans
            data['agent_task'] = attributes.get('input_query', 'Web search evaluation task')
            data['agent_response'] = attributes.get('output_response', '')
            data['retrieval_query'] = [attributes.get('input_query', '')]
            data['retrieved_context'] = [[attributes.get('web_search.search_results', '')]]
            data['tools_available'] = [
                ToolSchema(
                    name="web_search_evaluation",
                    description="Evaluates web search performance",
                    args_schema={},
                    return_schema={"type": "evaluation_result"}
                )
            ]
        elif span_name.startswith('agent.query_routing'):
            # Query routing spans
            data['agent_task'] = attributes.get('input_query', 'Query routing task')
            data['agent_response'] = attributes.get('routing_decision', '')
            data['tools_available'] = [
                ToolSchema(
                    name="query_routing",
                    description="Routes queries to appropriate handlers",
                    args_schema={},
                    return_schema={"type": "routing_decision"}
                )
            ]
        else:
            # Standard agent spans: task from the chain input
            chain_inputs = attributes.get('chain.inputs', {})
            if isinstance(chain_inputs, dict) and 'input' in chain_inputs:
                data['agent_task'] = chain_inputs['input']
            elif attributes.get("agent_task"):
                data['agent_task'] = attributes.get("agent_task")
            else:
                print('agent_task not found')

            # Agent response from the finish event values
            finish_values = attributes.get('agent.output.finish.return_values', {})
            if isinstance(finish_values, dict) and 'output' in finish_values:
                data['agent_response'] = finish_values['output']
            elif attributes.get("agent_response"):
                data['agent_response'] = attributes.get("agent_response")
            else:
                # f-string: a missing or non-string span_id must not turn a
                # diagnostic message into an exception.
                print(f"agent_response is not available {span_id}")

            # Tool call taken from the agent action
            tool_name = attributes.get('agent.output.action.tool')
            tool_input = attributes.get('agent.output.action.tool_input')

            if tool_name:
                tool_call = ToolCall(
                    tool_name=tool_name,
                    parameters={'input': tool_input} if tool_input else {},
                    call_id=span['span_id']
                )
                data['tool_calls'] = [tool_call]
                data['parameters_passed'] = {'input': tool_input} if tool_input else {}

                # Handle retrieval query for langchain_retriever
                if tool_name == 'langchain_retriever' and tool_input:
                    data['retrieval_query'] = [tool_input]

        # Agent exit status
        data['agent_exit'] = any(event.get('name') == 'agent_finish' for event in events)

        # Trace (dump events as JSON)
        if events:
            data['trace'] = json.dumps(events)

    elif span_type == 'llm':
        # Task is the first prompt; an absent or empty prompt list falls
        # back to the placeholder instead of raising IndexError.
        prompts = attributes.get('llm.input.prompts') or []
        data['agent_task'] = prompts[0] if prompts else 'input is not available'

        llm_responses = attributes.get('llm.output.response', [])
        if llm_responses:
            data['agent_response'] = llm_responses[0]
        else:
            print("llm_response is not available")

        # Parse tools from prompt
        if prompts:
            try:
                data['tools_available'] = parse_tools_from_prompt(prompts[0])
            except Exception:
                # Fallback to empty list if parsing fails
                data['tools_available'] = []

        data['parameters_passed'] = {}

    elif span_type == 'tool':
        # Agent response from tool output
        tool_output = attributes.get('tool.output.output')
        data['agent_task'] = f"This is a simple tool call, and the tool will execute as programmed. Its name is - {attributes.get('tool.name')}"
        if tool_output:
            data['agent_response'] = tool_output
        elif attributes.get("tool_response"):
            data['agent_response'] = attributes.get("tool_response")
        else:
            print(f"tool_output is not available {span_id}")

        # Tool call results
        tool_name = attributes.get('tool.name')
        if tool_name and tool_output is not None:
            succeeded = span.get('status') == 'ok'
            tool_result = ToolResult(
                call_id=span['span_id'],
                result=tool_output,
                success=succeeded,
                error_message=None if succeeded else 'Tool execution failed'
            )
            data['tool_call_results'] = [tool_result]

            # Handle retrieved context for langchain_retriever
            if tool_name == 'langchain_retriever':
                data['retrieved_context'] = [[tool_output]]

        # Parameters from tool input: every "tool.input.<name>" attribute
        data['parameters_passed'] = {
            key.replace('tool.input.', ''): value
            for key, value in attributes.items()
            if key.startswith('tool.input.')
        }
    else:
        if count_unknowns is not None and 'count' in count_unknowns:
            count_unknowns['count'] += 1
            print('Spans with unknown type: ' + str(count_unknowns['count']))
        else:
            print('Spans with unknown type detected')

    return AgentData(**data)
def convert_spans_to_agent_dataset(spans_data: List[Dict[str, Any]]) -> tuple[List[AgentData], List[str], AgentDataset]:
    """
    Turn raw spans into AgentData records and wrap them in an AgentDataset.

    Spans that fail to convert are skipped; their error messages are
    collected, and the first five are also printed.

    Args:
        spans_data: List of span dictionaries

    Returns:
        Tuple of (agent_data_list, errors, dataset)
    """
    print("πŸ”„ Converting spans to AgentData objects...")

    converted = []
    failures = []
    # Shared counter updated by map_span_to_agent_data for unknown span types.
    unknown_counter = {'count': 0}

    for index, raw_span in enumerate(spans_data):
        try:
            record = map_span_to_agent_data(raw_span, unknown_counter)
        except Exception as exc:
            failures.append(f"Span {index}: {exc!s}")
            # Show first 5 errors only
            if len(failures) <= 5:
                print(f"⚠️ Error processing span {index}: {exc}")
        else:
            converted.append(record)

    print(f"\nβœ… Successfully converted {len(converted)} spans to AgentData")
    if failures:
        print(f"❌ {len(failures)} spans had errors")

    agent_dataset = AgentDataset()
    agent_dataset.data = converted

    print(f"πŸ“Š AgentDataset created with {len(agent_dataset.data)} records")

    return converted, failures, agent_dataset
def analyze_dataset_statistics(dataset: AgentDataset) -> Dict[str, Any]:
    """
    Summarize a dataset: agent types, tool usage, and how many records carry
    a response, tool calls, or a retrieval query.

    Args:
        dataset: AgentDataset to analyze

    Returns:
        Dictionary containing statistics
    """
    print("πŸ“ˆ Dataset Statistics:")

    records = dataset.data

    # Occurrences per agent name, skipping records without one.
    type_counts = {}
    for record in records:
        name = record.agent_name
        if name:
            type_counts[name] = type_counts.get(name, 0) + 1

    # Occurrences per tool name; calls lacking a tool_name are ignored.
    usage_counts = {}
    for record in records:
        for call in record.tool_calls or ():
            if hasattr(call, 'tool_name'):
                usage_counts[call.tool_name] = usage_counts.get(call.tool_name, 0) + 1

    response_total = sum(1 for record in records if record.agent_response)
    tool_call_total = sum(1 for record in records if record.tool_calls)
    retrieval_total = sum(1 for record in records if record.retrieval_query)

    stats = {
        'agent_types': dict(type_counts),
        'records_with_responses': response_total,
        'records_with_tool_calls': tool_call_total,
        'records_with_retrieval': retrieval_total,
        'tool_usage': dict(usage_counts),
        'total_records': len(records)
    }

    print(f"\nAgent Types: {dict(type_counts)}")
    print(f"Records with responses: {response_total}")
    print(f"Records with tool calls: {tool_call_total}")
    print(f"Records with retrieval: {retrieval_total}")
    print(f"Tool usage: {dict(usage_counts)}")

    return stats
def setup_gemini_model(model_name: str = "gemini-2.0-flash-lite", temperature: float = 0.1, max_tokens: int = 1024) -> Optional[GeminiModel]:
    """
    Setup and initialize Gemini model with API key validation.

    Args:
        model_name: Gemini model name to use
        temperature: Temperature for generation
        max_tokens: Maximum tokens to generate

    Returns:
        GeminiModel instance, or None when GEMINI_API_KEY is unset or blank
        or the model cannot be constructed.
    """
    # A key that is present but empty (e.g. "GEMINI_API_KEY=" in .env) is as
    # unusable as a missing one, so check the value rather than membership.
    if not os.environ.get('GEMINI_API_KEY', '').strip():
        print("⚠️ GEMINI_API_KEY environment variable not set!")
        print("Please set it before running evaluation:")
        print("export GEMINI_API_KEY='your-api-key-here'")
        return None

    print("βœ… GEMINI_API_KEY found in environment")

    # Initialize Gemini model
    try:
        gemini_model = GeminiModel(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens
        )
    except Exception as e:
        # Setup helper is best-effort: report the failure and return None.
        print(f"❌ Error initializing Gemini model: {e}")
        return None

    print("βœ… Gemini model initialized")
    return gemini_model
def setup_agent_evaluator(dataset: AgentDataset, gemini_model: GeminiModel, output_dir: str = "./demo_results",
                          include_reasoning: bool = True, stream: bool = False) -> Optional[AgentEvaluator]:
    """
    Build an AgentEvaluator wired to the five agent scoring functions.

    Args:
        dataset: AgentDataset to evaluate
        gemini_model: Initialized GeminiModel
        output_dir: Directory for output files
        include_reasoning: Whether to include reasoning in results
        stream: Whether to stream results

    Returns:
        AgentEvaluator instance or None if setup failed
    """
    scorers = [
        task_progression_scorer,
        context_relevancy_scorer,
        role_adherence_scorer,
        tool_relevancy_scorer,
        parameter_correctness_scorer
    ]

    print(f"βœ… Initialized {len(scorers)} scoring functions:")
    for scorer in scorers:
        print(f" - {scorer.__name__}")

    evaluator_kwargs = dict(
        agent_dataset=dataset,
        models=[gemini_model],
        scoring_functions=scorers,
        output_dir=output_dir,
        stream=stream,
        include_reasoning=include_reasoning,
    )

    try:
        agent_evaluator = AgentEvaluator(**evaluator_kwargs)
    except Exception as e:
        print(f"❌ Error creating AgentEvaluator: {e}")
        return None

    print("\nβœ… AgentEvaluator created with Gemini model and scoring functions")
    return agent_evaluator
+ + Args: + dataset: AgentDataset to evaluate + evaluator: Initialized AgentEvaluator + sample_size: Number of samples to evaluate + file_name: Base name for output files + + Returns: + DataFrame with results or None if evaluation failed + """ + print("πŸš€ Running evaluation on sample data...") + + try: + # Create a smaller dataset for demo purposes + # Filter for records with agent responses or meaningful content + sample_data = [] + for data in dataset.data: + if (data.agent_response and data.agent_response.strip()) or data.agent_task: + sample_data.append(data) + if len(sample_data) >= sample_size: + break + + print(f"\nπŸ“Š Evaluating {len(sample_data)} sample records...") + + # Create a temporary dataset with just the sample data + sample_dataset = AgentDataset() + sample_dataset.data = sample_data + + # Create a new evaluator with the sample dataset + sample_evaluator = AgentEvaluator( + agent_dataset=sample_dataset, + models=evaluator.models, + scoring_functions=evaluator.scoring_functions, + output_dir=f"{evaluator.output_dir}/{file_name}", + stream=evaluator.stream, + include_reasoning=evaluator.include_reasoning + ) + + # Run the evaluation + sample_evaluator.run_all(save_every=1, file_type="csv") + + print("\nβœ… Evaluation completed!") + + # Read and display results + results_file = f"{evaluator.output_dir}/{file_name}/agent_evaluation_results.csv" + + if os.path.exists(results_file): + results_df = pd.read_csv(results_file) + print("\nπŸ“Š Results Summary:") + + # Calculate averages for each scorer + scorer_columns = [col for col in results_df.columns if col not in ['user_id', 'task_id', 'turn_id', 'agent_name'] and not col.endswith('_reasoning')] + + for col in scorer_columns: + if results_df[col].dtype in ['float64', 'int64']: + avg_score = results_df[col].mean() + print(f" - {col}: {avg_score:.2f}") + + # Show individual scores + print("\nπŸ” Individual Scores:") + for i, row in results_df.iterrows(): + print(f"\n Record {i+1} (Task: 
{row.get('task_id', 'N/A')}):") + for col in scorer_columns: + if pd.notna(row[col]): + print(f" - {col}: {row[col]}") + + return results_df + else: + print("❌ Results file not found") + return None + + except Exception as e: + print(f"❌ Error during evaluation: {e}") + print(f"Error type: {type(e).__name__}") + import traceback + traceback.print_exc() + return None + +print("βœ… run_evaluation function defined!") + +def analyze_agent_behavior_patterns(dataset: AgentDataset) -> Dict[str, Any]: + """ + Analyze agent behavior patterns including tool usage, task types, and response statistics. + + Args: + dataset: AgentDataset to analyze + + Returns: + Dictionary containing analysis results + """ + print("πŸ” Dataset Analysis:") + print("\n=== Agent Behavior Patterns ===") + + # Analyze tool usage patterns + tool_patterns = {} + task_types = {} + response_lengths = [] + + for data in dataset.data: + # Tool usage + if data.tool_calls: + for tool_call in data.tool_calls: + if hasattr(tool_call, 'tool_name'): + tool_name = tool_call.tool_name + if tool_name not in tool_patterns: + tool_patterns[tool_name] = {'count': 0, 'success_rate': 0} + tool_patterns[tool_name]['count'] += 1 + + # Task analysis + if data.agent_task: + # Simple categorization + task_lower = data.agent_task.lower() + if 'user_input' in task_lower: + task_types['user_input'] = task_types.get('user_input', 0) + 1 + elif 'exit' in task_lower: + task_types['exit_command'] = task_types.get('exit_command', 0) + 1 + else: + task_types['other'] = task_types.get('other', 0) + 1 + + # Response analysis + if data.agent_response: + response_lengths.append(len(data.agent_response)) + + print("\nπŸ“ˆ Tool Usage:") + for tool, stats in tool_patterns.items(): + print(f" - {tool}: {stats['count']} uses") + + print("\nπŸ“‹ Task Types:") + for task_type, count in task_types.items(): + print(f" - {task_type}: {count}") + + analysis_results = { + 'tool_patterns': tool_patterns, + 'task_types': task_types, + 
'response_lengths': response_lengths + } + + if response_lengths: + avg_response_length = sum(response_lengths) / len(response_lengths) + print("\nπŸ“ Response Statistics:") + print(f" - Average response length: {avg_response_length:.1f} characters") + print(f" - Min response length: {min(response_lengths)}") + print(f" - Max response length: {max(response_lengths)}") + + analysis_results['avg_response_length'] = avg_response_length + analysis_results['min_response_length'] = min(response_lengths) + analysis_results['max_response_length'] = max(response_lengths) + + return analysis_results + +print("βœ… analyze_agent_behavior_patterns function defined!") + +def export_processed_dataset(dataset: AgentDataset, file_name: str = "processed_agent_dataset") -> bool: + """ + Export processed AgentDataset to JSON and CSV formats. + + Args: + dataset: AgentDataset to export + file_name: Base name for export files + + Returns: + True if export successful, False otherwise + """ + print("πŸ’Ύ Exporting processed dataset...") + + success = True + + # Create directory if it doesn't exist + output_dir = Path(file_name).parent + output_dir.mkdir(parents=True, exist_ok=True) + try: + # Export to JSON + json_file = f'{file_name}.json' + dataset.export_to_json(json_file) + print(f"βœ… Exported to {json_file}") + + except Exception as e: + print(f"❌ JSON export error: {e}") + success = False + + try: + # Export to CSV (optional) + csv_file = f'{file_name}.csv' + dataset.export_to_csv(csv_file) + print(f"βœ… Exported to {csv_file}") + + except Exception as e: + print(f"❌ CSV export error: {e}") + success = False + + if success: + print("βœ… Export completed successfully!") + else: + print("❌ Some exports failed") + + return success + +print("βœ… export_processed_dataset function defined!") +def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None) -> None: + """ + Configure logging for the evaluation process. 
+ + Args: + log_level: Logging level (DEBUG, INFO, WARNING, ERROR) + log_file: Optional log file path + """ + log_format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' + + if log_file: + logging.basicConfig( + level=getattr(logging, log_level.upper()), + format=log_format, + handlers=[ + logging.FileHandler(log_file), + logging.StreamHandler() + ] + ) + else: + logging.basicConfig( + level=getattr(logging, log_level.upper()), + format=log_format + ) + + print(f"βœ… Logging configured at {log_level} level") + +print("βœ… setup_logging function defined!") + +def validate_environment() -> Dict[str, bool]: + """ + Check for required environment variables and dependencies. + + Returns: + Dictionary with validation results + """ + results = { + 'gemini_api_key': 'GEMINI_API_KEY' in os.environ, + 'pandas_available': True, + 'novaeval_available': True + } + + try: + import pandas # pylint: disable=import-outside-toplevel,unused-import + except ImportError: + results['pandas_available'] = False + + try: + from novaeval.agents.agent_data import AgentData as _AgentData # pylint: disable=import-outside-toplevel,unused-import + except ImportError: + results['novaeval_available'] = False + + print("πŸ” Environment validation:") + for key, value in results.items(): + status = "βœ…" if value else "❌" + print(f" {status} {key}: {value}") + + return results + +print("βœ… validate_environment function defined!") + +def print_demo_summary(file_name: str, spans_count: int, dataset_count: int, + has_results: bool = False) -> None: + """ + Print a summary of the demo execution. 
+ + Args: + file_name: Name of the processed file + spans_count: Number of spans processed + dataset_count: Number of AgentData records created + has_results: Whether evaluation results are available + """ + print("\nπŸŽ‰ Demo completed successfully!") + print("\nπŸ“‹ Summary:") + print(f" - Processed {spans_count} spans from {file_name}") + print(f" - Created {dataset_count} AgentData records") + print(" - Configured evaluation with Gemini model and 5 scorers") + if has_results: + print(" - Successfully evaluated sample data") + print(" - Exported processed dataset for future use") + +print("βœ… print_demo_summary function defined!") + + +def run_complete_agent_evaluation(selected_file: str, + sample_size: int = 25, + evaluation_name: str = "agent_evaluation", + model_name: str = "gemini-2.0-flash-lite", + temperature: float = 0.1, + max_tokens: int = 1024, + output_dir: str = "./evaluation_results") -> Dict[str, Any]: + """ + Complete agent evaluation pipeline in a single method call. 
+ + Args: + selected_file: Path to the JSON dataset file to evaluate + sample_size: Number of samples to evaluate (default: 25) + evaluation_name: Name for the evaluation run (default: "agent_evaluation") + model_name: Gemini model to use (default: "gemini-2.0-flash-lite") + temperature: Model temperature (default: 0.1) + max_tokens: Max tokens for model (default: 1024) + output_dir: Output directory for results (default: "./evaluation_results") + + Returns: + Dictionary containing all results and status information + """ + + print("πŸš€ Starting Complete Agent Evaluation Pipeline") + print(f"πŸ“ Processing file: {selected_file}") + print("=" * 60) + + # Initialize results tracking + results = { + 'success': False, + 'file_processed': selected_file, + 'spans_loaded': 0, + 'dataset_created': False, + 'dataset_size': 0, + 'evaluation_completed': False, + 'results_df': None, + 'export_success': False, + 'errors': [] + } + + try: + # Step 1: Setup logging and validate environment + print("\nπŸ“‹ Step 1: Environment Setup") + setup_logging(log_level="INFO") + env_status = validate_environment() + + if not env_status['novaeval_available']: + results['errors'].append("NovaEval not available") + print("❌ NovaEval not available. Please install it first.") + return results + elif not env_status['gemini_api_key']: + results['errors'].append("GEMINI_API_KEY not set") + print("❌ GEMINI_API_KEY not set. 
Evaluation cannot proceed.") + return results + else: + print("βœ… Environment ready for evaluation!") + + # Step 2: Load and analyze dataset + print("\nπŸ“‹ Step 2: Loading Dataset") + spans_data, span_types = load_and_analyze_dataset(selected_file) + + if not spans_data: + results['errors'].append("Failed to load dataset") + print("❌ Failed to load dataset") + return results + + results['spans_loaded'] = len(spans_data) + print(f"βœ… Dataset loaded: {len(spans_data)} spans") + + # Step 3: Convert to AgentDataset format + print("\nπŸ“‹ Step 3: Converting to AgentDataset Format") + agent_data_list, conversion_errors, dataset = convert_spans_to_agent_dataset(spans_data) + + if not dataset or not dataset.data: + results['errors'].append("Failed to create dataset") + results['errors'].extend(conversion_errors[:5]) # Add first 5 conversion errors + print("❌ Failed to create dataset") + return results + + results['dataset_created'] = True + results['dataset_size'] = len(dataset.data) + print(f"βœ… AgentDataset created: {len(dataset.data)} records") + + # Step 4: Dataset analysis + print("\nπŸ“‹ Step 4: Dataset Analysis") + stats = analyze_dataset_statistics(dataset) + behavior_analysis = analyze_agent_behavior_patterns(dataset) + + # Step 5: Setup evaluation components + print("\nπŸ“‹ Step 5: Setting up Evaluation") + gemini_model = setup_gemini_model( + model_name=model_name, + temperature=temperature, + max_tokens=max_tokens + ) + + if not gemini_model: + results['errors'].append("Failed to setup Gemini model") + print("❌ Failed to setup Gemini model") + return results + + evaluator = setup_agent_evaluator( + dataset=dataset, + gemini_model=gemini_model, + output_dir=output_dir, + include_reasoning=True, + stream=False + ) + + if not evaluator: + results['errors'].append("Failed to setup evaluator") + print("❌ Failed to setup evaluator") + return results + + print("βœ… Evaluation components ready!") + + # Step 6: Run evaluation + print("\nπŸ“‹ Step 6: Running 
Evaluation") + print(f"🎯 Evaluating {sample_size} samples...") + + results_df = run_evaluation( + dataset=dataset, + evaluator=evaluator, + sample_size=sample_size, + file_name=evaluation_name + ) + + if results_df is not None: + results['evaluation_completed'] = True + results['results_df'] = results_df + print("βœ… Evaluation completed successfully!") + else: + results['errors'].append("Evaluation failed") + print("❌ Evaluation failed") + + # Step 7: Export processed dataset + print("\nπŸ“‹ Step 7: Exporting Dataset") + export_success = export_processed_dataset( + dataset=dataset, + file_name=f"./processed_datasets/{evaluation_name}_processed_dataset" + ) + + results['export_success'] = export_success + + # Final summary + print("\n" + "=" * 60) + print("πŸŽ‰ EVALUATION PIPELINE COMPLETED!") + print(f"πŸ“Š Final Results:") + print(f" - File processed: {selected_file}") + print(f" - Spans loaded: {results['spans_loaded']}") + print(f" - Dataset size: {results['dataset_size']}") + print(f" - Evaluation completed: {results['evaluation_completed']}") + print(f" - Export successful: {results['export_success']}") + + if results['evaluation_completed']: + print(f" - Results saved to: {output_dir}/{evaluation_name}/") + + if results['errors']: + print(f" - Errors encountered: {len(results['errors'])}") + + results['success'] = results['evaluation_completed'] and results['export_success'] + + return results + + except Exception as e: + error_msg = f"Pipeline failed with error: {str(e)}" + results['errors'].append(error_msg) + print(f"❌ {error_msg}") + import traceback + traceback.print_exc() + return results + +print("βœ… run_complete_agent_evaluation function defined!") \ No newline at end of file diff --git a/noveum_customer_support_bt/e2e_noveum_user_bot.ipynb b/noveum_customer_support_bt/e2e_noveum_user_bot.ipynb new file mode 100644 index 0000000..59871f6 --- /dev/null +++ b/noveum_customer_support_bt/e2e_noveum_user_bot.ipynb @@ -0,0 +1,2046 @@ +{ + "cells": [ + { + 
"cell_type": "code", + "execution_count": null, + "id": "9eb0a8a7", + "metadata": {}, + "outputs": [], + "source": [ + "# Fix: Upload Dataset with Proper Environment Loading\n", + "!cd /Users/mramanindia/work/NovaEval/noveum_customer_support_bt && source .env && python upload_dataset.py --dataset-json split_datasets/agent.rag_evaluation_metrics_dataset.json --item-type conversation\n" + ] + }, + { + "cell_type": "markdown", + "id": "780ff8d0", + "metadata": {}, + "source": [ + "# Noveum AI Agent with RAG + Web Search\n", + "\n", + "An intelligent conversational agent that dynamically routes queries between **RAG (Retrieval-Augmented Generation)** for Noveum.ai-specific information and **Web Search** for external knowledge, providing comprehensive answers with full observability.\n", + "\n", + "## πŸš€ What This Agent Does\n", + "\n", + "### Core Functionality\n", + "- **Intelligent Query Routing**: Automatically determines whether to use RAG or Web Search based on query content\n", + "- **Dual Knowledge Sources**: \n", + " - **RAG Mode**: Answers questions about Noveum.ai platform using scraped documentation\n", + " - **Web Search Mode**: Handles external queries using real-time web search\n", + "- **Comprehensive Tracing**: Full observability with detailed metrics and performance tracking\n", + "- **Modular Architecture**: Clean separation of concerns for easy maintenance and extension\n", + "\n", + "### Key Capabilities\n", + "- 🧠 **Document Intelligence**: Scrapes and indexes Noveum.ai website content for semantic search\n", + "- 🌐 **Real-time Web Search**: Uses DuckDuckGo for current events and external knowledge\n", + "- 🎯 **Smart Classification**: LLM-powered query routing with keyword fallback\n", + "- πŸ“Š **Performance Monitoring**: Detailed metrics on response quality, latency, and token usage\n", + "- πŸ”„ **Scalable Design**: Easy to extend with new data sources or routing logic\n", + "\n", + "## πŸ“‹ Prerequisites & Requirements\n", + "\n", + "### 
Required Environment Variables\n", + "```bash\n", + "NOVEUM_API_KEY=your_noveum_api_key_here\n", + "OPENAI_API_KEY=your_openai_api_key_here\n", + "```\n", + "\n", + "### Required Python Packages\n", + "- `requests` - HTTP requests for web scraping\n", + "- `beautifulsoup4` - HTML parsing\n", + "- `trafilatura` - Advanced text extraction\n", + "- `langchain` - LLM framework and vector operations\n", + "- `langchain-openai` - OpenAI integration\n", + "- `langchain-community` - Community tools (FAISS, DuckDuckGo)\n", + "- `noveum-trace` - Observability and tracing\n", + "- `python-dotenv` - Environment variable management\n", + "\n", + "### System Requirements\n", + "- Python 3.8+\n", + "- Internet connection for web scraping and API calls\n", + "- ~500MB disk space for vector store and scraped data\n", + "\n", + "## πŸ—οΈ Architecture Overview\n", + "\n", + "### 1. **Website Scraper** (`NoveumWebsiteScraper`)\n", + "- Recursively scrapes noveum.ai website and sub-pages\n", + "- Extracts clean text content using trafilatura\n", + "- Discovers internal links automatically\n", + "- Saves scraped data to JSON for persistence\n", + "\n", + "### 2. **RAG System** (`NoveumRAGSystem`)\n", + "- Loads scraped documents and creates vector embeddings\n", + "- Uses FAISS for fast similarity search\n", + "- Generates context-aware responses using OpenAI GPT-4o-mini\n", + "- Tracks retrieval effectiveness and response quality\n", + "\n", + "### 3. **Web Search System** (`NoveumWebSearchSystem`)\n", + "- Integrates DuckDuckGo search for external queries\n", + "- Synthesizes information from multiple web sources\n", + "- Handles real-time information and current events\n", + "- Formats search results into coherent responses\n", + "\n", + "### 4. 
**Query Router** (`NoveumQueryRouter`)\n", + "- **Keyword-based classification**: Matches queries against predefined keyword lists\n", + "- **LLM-based classification**: Uses GPT-4o-mini for complex query analysis\n", + "- **Confidence scoring**: Evaluates routing decision quality\n", + "- **Fallback handling**: Defaults to Web Search for ambiguous queries\n", + "\n", + "### 5. **Main Agent** (`NoveumAIAgent`)\n", + "- Orchestrates all components\n", + "- Manages system initialization and data loading\n", + "- Provides unified interface for query processing\n", + "- Handles error recovery and response formatting\n", + "\n", + "## 🎯 How to Use\n", + "\n", + "### Quick Start\n", + "```python\n", + "# 1. Initialize the system (first time only)\n", + "noveum_agent.initialize_system(force_scrape=True)\n", + "\n", + "# 2. Ask questions\n", + "response = noveum_agent.process_query(\"What is Noveum and what does it do?\")\n", + "noveum_agent.display_response(response)\n", + "\n", + "# 3. Or use convenience function\n", + "ask_question(\"How do I integrate Noveum Trace?\")\n", + "```\n", + "\n", + "### Advanced Usage\n", + "```python\n", + "# Run full demo with 20 test queries\n", + "demo_noveum_agent()\n", + "\n", + "# Process queries programmatically\n", + "response = noveum_agent.process_query(\"What are the latest AI news?\")\n", + "print(f\"Mode: {response['mode']}\")\n", + "print(f\"Answer: {response['answer']}\")\n", + "print(f\"Sources: {response['sources']}\")\n", + "```\n", + "\n", + "### Query Types\n", + "\n", + "#### RAG Queries (Noveum-specific)\n", + "- \"What is Noveum and what does it do?\"\n", + "- \"How do I integrate Noveum Trace?\"\n", + "- \"What are Noveum's pricing plans?\"\n", + "- \"What features does Noveum Trace offer?\"\n", + "- \"How do I set up observability with Noveum?\"\n", + "\n", + "#### Web Search Queries (External knowledge)\n", + "- \"What are the latest AI news today?\"\n", + "- \"What's the weather like today?\"\n", + "- \"Tell me 
about recent developments in machine learning\"\n", + "- \"What are the current trends in observability tools?\"\n", + "- \"What happened in tech news this week?\"\n", + "\n", + "## πŸ“Š Observability & Monitoring\n", + "\n", + "### Traced Operations\n", + "- **System Initialization**: Website scraping and vector store creation\n", + "- **Query Processing**: End-to-end query handling with performance metrics\n", + "- **RAG Operations**: Document retrieval, context generation, and response creation\n", + "- **Web Search Operations**: Search execution, result synthesis, and response generation\n", + "- **Query Routing**: Classification decision making and confidence scoring\n", + "\n", + "### Key Metrics Tracked\n", + "- **Performance**: Response latency, processing time, token usage\n", + "- **Quality**: Response length, source diversity, context utilization\n", + "- **Routing**: Classification confidence, keyword scores, decision rationale\n", + "- **Model Usage**: Token consumption, cost estimation, efficiency scores\n", + "- **Retrieval**: Document relevance, context quality, source effectiveness\n", + "\n", + "### Noveum Trace Integration\n", + "- All operations are automatically traced with detailed spans\n", + "- Comprehensive attribute tracking for debugging and optimization\n", + "- Real-time monitoring through Noveum.ai dashboard\n", + "- Export capabilities for further analysis\n", + "\n", + "## πŸ”§ Configuration\n", + "\n", + "### Default Settings\n", + "```python\n", + "CONFIG = {\n", + " \"noveum_base_url\": \"https://noveum.ai\",\n", + " \"max_pages_to_scrape\": 50,\n", + " \"chunk_size\": 1000,\n", + " \"chunk_overlap\": 200,\n", + " \"max_search_results\": 5,\n", + " \"rag_threshold\": 0.7,\n", + " \"noveum_docs_file\": \"noveum_docs.json\",\n", + " \"vector_store_path\": \"noveum_vectorstore\"\n", + "}\n", + "```\n", + "\n", + "### Customization Options\n", + "- **Scraping**: Adjust `max_pages_to_scrape` for more/less content\n", + "- **RAG**: 
Modify `chunk_size` and `chunk_overlap` for different text splitting\n", + "- **Search**: Change `max_search_results` for more/fewer sources\n", + "- **Routing**: Add keywords to `rag_keywords` or `web_keywords` lists\n", + "\n", + "## 🚨 Error Handling\n", + "\n", + "### Common Issues\n", + "- **API Key Missing**: Ensure `NOVEUM_API_KEY` and `OPENAI_API_KEY` are set\n", + "- **Network Errors**: Check internet connection for scraping and API calls\n", + "- **Vector Store Issues**: Delete `noveum_vectorstore` folder to regenerate\n", + "- **Scraping Failures**: Set `force_scrape=True` to re-scrape website\n", + "\n", + "### Recovery Strategies\n", + "- Automatic fallback to Web Search for RAG failures\n", + "- Graceful error handling with informative messages\n", + "- Retry mechanisms for transient network issues\n", + "- Detailed error logging for debugging\n", + "\n", + "## πŸ”„ Maintenance\n", + "\n", + "### Regular Tasks\n", + "- **Update Scraped Content**: Run with `force_scrape=True` periodically\n", + "- **Monitor Performance**: Check Noveum Trace dashboard for metrics\n", + "- **Review Routing**: Analyze query classification accuracy\n", + "- **Update Keywords**: Add new terms to routing keyword lists\n", + "\n", + "### Scaling Considerations\n", + "- **Vector Store**: Can be shared across multiple agent instances\n", + "- **Scraped Data**: JSON file can be versioned and distributed\n", + "- **API Limits**: Monitor OpenAI token usage and costs\n", + "- **Performance**: Consider caching for frequently asked questions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e27db4ec", + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install -r ./noveum_agent_requirements.txt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e7a810bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… All imports loaded successfully!\n" + ] + } + ], + "source": [ + "# Cell 1: 
Setup & Imports\n", + "import os\n", + "import json\n", + "import time\n", + "from typing import List, Dict, Any, Optional, Tuple\n", + "from urllib.parse import urljoin, urlparse\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import trafilatura\n", + "\n", + "# LangChain ecosystem\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_community.vectorstores import FAISS\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.schema import Document\n", + "from langchain_community.tools import DuckDuckGoSearchRun\n", + "\n", + "# Noveum Trace integration\n", + "import noveum_trace\n", + "from noveum_trace.context_managers import trace_operation, trace_agent\n", + "\n", + "# Load environment variables\n", + "try:\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + "except ImportError:\n", + " print(\"python-dotenv not installed. Environment variables will be read from system only.\")\n", + "\n", + "print(\"βœ… All imports loaded successfully!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2209736e", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "## set openai api key\n", + "## set gemini api key\n", + "## set noveum api key\n", + "## set environment\n", + "## set project'\n", + "\n", + "# These are required for the project\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0dc8f95", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… Noveum Trace initialized and configuration loaded!\n", + "πŸ”§ Configuration: {'noveum_base_url': 'https://noveum.ai', 'max_pages_to_scrape': 50, 'chunk_size': 1000, 'chunk_overlap': 200, 'max_search_results': 5, 'rag_threshold': 0.7, 'noveum_docs_file': 'noveum_docs.json', 'vector_store_path': 'noveum_vectorstore'}\n" + ] + } + ], + "source": [ + "# Cell 2: Noveum Trace Integration & Configuration\n", + "# 
Initialize the Noveum Trace SDK\n", + "noveum_trace.init(\n", + " project=\"customer_support_agent\",\n", + " api_key=os.getenv(\"NOVEUM_API_KEY\"),\n", + " environment=\"dev-aman\",\n", + ")\n", + "\n", + "# Configuration\n", + "CONFIG = {\n", + " \"noveum_base_url\": \"https://noveum.ai\",\n", + " \"max_pages_to_scrape\": 50,\n", + " \"chunk_size\": 1000,\n", + " \"chunk_overlap\": 200,\n", + " \"max_search_results\": 5,\n", + " \"rag_threshold\": 0.7, # Similarity threshold for RAG retrieval\n", + " \"noveum_docs_file\": \"noveum_docs.json\",\n", + " \"vector_store_path\": \"noveum_vectorstore\"\n", + "}\n", + "\n", + "# Initialize LLM and embeddings\n", + "llm = ChatOpenAI(\n", + " model=\"gpt-4o-mini\",\n", + " temperature=0.1,\n", + " api_key=os.getenv(\"OPENAI_API_KEY\")\n", + ")\n", + "\n", + "embeddings = OpenAIEmbeddings(\n", + " model=\"text-embedding-3-small\",\n", + " api_key=os.getenv(\"OPENAI_API_KEY\")\n", + ")\n", + "\n", + "# Initialize web search tool\n", + "web_search = DuckDuckGoSearchRun()\n", + "\n", + "print(\"βœ… Noveum Trace initialized and configuration loaded!\")\n", + "print(f\"πŸ”§ Configuration: {CONFIG}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "c765c0cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… Website scraper initialized!\n" + ] + } + ], + "source": [ + "# Cell 3: Website Scraper - Extract content from noveum.ai and sub-URLs\n", + "class NoveumWebsiteScraper:\n", + " def __init__(self, base_url: str, max_pages: int = 50):\n", + " self.base_url = base_url\n", + " self.max_pages = max_pages\n", + " self.scraped_urls = set()\n", + " self.scraped_content = []\n", + " self.session = requests.Session()\n", + " self.session.headers.update({\n", + " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'\n", + " })\n", + " \n", + " def is_valid_url(self, url: str) -> 
bool:\n", + " \"\"\"Check if URL is valid and belongs to noveum.ai domain\"\"\"\n", + " try:\n", + " parsed = urlparse(url)\n", + " return (\n", + " parsed.netloc in ['noveum.ai', 'www.noveum.ai'] and\n", + " not any(ext in url.lower() for ext in ['.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.xml', '.txt']) and\n", + " '#' not in url\n", + " )\n", + " except:\n", + " return False\n", + " \n", + " def extract_text_content(self, html_content: str, url: str) -> str:\n", + " \"\"\"Extract clean text content from HTML\"\"\"\n", + " try:\n", + " # Use trafilatura for better text extraction\n", + " extracted = trafilatura.extract(html_content)\n", + " if extracted:\n", + " return extracted.strip()\n", + " \n", + " # Fallback to BeautifulSoup\n", + " soup = BeautifulSoup(html_content, 'html.parser')\n", + " \n", + " # Remove script and style elements\n", + " for script in soup([\"script\", \"style\"]):\n", + " script.decompose()\n", + " \n", + " # Get text and clean up\n", + " text = soup.get_text()\n", + " lines = (line.strip() for line in text.splitlines())\n", + " chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n", + " text = ' '.join(chunk for chunk in chunks if chunk)\n", + " \n", + " return text.strip()\n", + " except Exception as e:\n", + " print(f\"Error extracting text from {url}: {e}\")\n", + " return \"\"\n", + " \n", + " def find_internal_links(self, html_content: str, current_url: str) -> List[str]:\n", + " \"\"\"Find all internal links from the current page\"\"\"\n", + " try:\n", + " soup = BeautifulSoup(html_content, 'html.parser')\n", + " links = []\n", + " \n", + " for link in soup.find_all('a', href=True):\n", + " href = link['href']\n", + " full_url = urljoin(current_url, href)\n", + " \n", + " if self.is_valid_url(full_url) and full_url not in self.scraped_urls:\n", + " links.append(full_url)\n", + " \n", + " return links\n", + " except Exception as e:\n", + " print(f\"Error finding links in {current_url}: {e}\")\n", + " 
return []\n", + " \n", + " def scrape_page(self, url: str) -> Optional[Dict[str, Any]]:\n", + " \"\"\"Scrape a single page and return content\"\"\"\n", + " try:\n", + " print(f\"πŸ” Scraping: {url}\")\n", + " response = self.session.get(url, timeout=10)\n", + " response.raise_for_status()\n", + " \n", + " # Extract text content\n", + " text_content = self.extract_text_content(response.text, url)\n", + " \n", + " if not text_content or len(text_content) < 100: # Skip pages with too little content\n", + " print(f\"⚠️ Skipping {url} - insufficient content\")\n", + " return None\n", + " \n", + " # Find internal links\n", + " internal_links = self.find_internal_links(response.text, url)\n", + " \n", + " page_data = {\n", + " \"url\": url,\n", + " \"title\": self.extract_title(response.text),\n", + " \"content\": text_content,\n", + " \"content_length\": len(text_content),\n", + " \"internal_links\": internal_links,\n", + " \"scraped_at\": time.time()\n", + " }\n", + " \n", + " print(f\"βœ… Scraped {url} - {len(text_content)} chars, {len(internal_links)} internal links\")\n", + " return page_data\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Error scraping {url}: {e}\")\n", + " return None\n", + " \n", + " def extract_title(self, html_content: str) -> str:\n", + " \"\"\"Extract page title\"\"\"\n", + " try:\n", + " soup = BeautifulSoup(html_content, 'html.parser')\n", + " title_tag = soup.find('title')\n", + " return title_tag.get_text().strip() if title_tag else \"Untitled\"\n", + " except:\n", + " return \"Untitled\"\n", + " \n", + " def scrape_website(self) -> List[Dict[str, Any]]:\n", + " \"\"\"Main scraping function - scrape noveum.ai recursively\"\"\"\n", + " print(f\"πŸš€ Starting to scrape {self.base_url}\")\n", + " \n", + " urls_to_scrape = [self.base_url]\n", + " self.scraped_urls.add(self.base_url)\n", + " \n", + " with trace_operation(\"noveum_website_scraping\") as scrape_span:\n", + " scrape_span.set_attributes({\n", + " \"scraper.base_url\": 
def scrape_website(self) -> List[Dict[str, Any]]:
    """Crawl the site breadth-first from ``self.base_url``.

    Pages are fetched with ``scrape_page`` until the queue is empty or
    ``self.max_pages`` pages have been collected. Newly discovered internal
    links are queued once each. The crawl runs inside a
    ``noveum_website_scraping`` trace span.

    Returns:
        ``self.scraped_content``, the list of page dicts collected so far.
    """
    print(f"πŸš€ Starting to scrape {self.base_url}")

    urls_to_scrape = [self.base_url]
    self.scraped_urls.add(self.base_url)

    with trace_operation("noveum_website_scraping") as scrape_span:
        scrape_span.set_attributes({
            "scraper.base_url": self.base_url,
            "scraper.max_pages": self.max_pages,
            "input_query": f"Scrape website: {self.base_url}",
        })

        while urls_to_scrape and len(self.scraped_content) < self.max_pages:
            current_url = urls_to_scrape.pop(0)

            page_data = self.scrape_page(current_url)

            if page_data:
                self.scraped_content.append(page_data)

                # Queue unseen links; the cap of 100 pending URLs bounds the crawl.
                for link in page_data["internal_links"]:
                    if link not in self.scraped_urls and len(urls_to_scrape) < 100:
                        urls_to_scrape.append(link)
                        self.scraped_urls.add(link)

                scrape_span.add_event("page_scraped", {
                    "input_query": f"Scrape page: {current_url}",
                    "output_response": f"Page scraped successfully: {page_data['content_length']} characters, {len(page_data['internal_links'])} internal links found",
                    "url": current_url,
                    "content_length": page_data["content_length"],
                    "internal_links_found": len(page_data["internal_links"])
                })

            # Small delay between requests to be respectful to the server.
            time.sleep(0.5)

        # The summary is recorded only now, after the crawl, so it reports
        # the real totals rather than the empty state at start-up.
        total_chars = sum(page["content_length"] for page in self.scraped_content)
        scrape_span.set_attributes({
            "scraper.pages_scraped": len(self.scraped_content),
            "scraper.total_urls_found": len(self.scraped_urls),
            "scraper.total_content_length": total_chars,
            "output_response": f"Scraping completed: {len(self.scraped_content)} pages scraped, {total_chars} total characters extracted"
        })

    print(f"βœ… Scraping complete! Scraped {len(self.scraped_content)} pages")
    return self.scraped_content


def save_to_json(self, filename: str) -> None:
    """Write ``self.scraped_content`` to *filename* as indented UTF-8 JSON.

    Args:
        filename: Path of the JSON file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(self.scraped_content, f, indent=2, ensure_ascii=False)
    print(f"πŸ’Ύ Saved scraped content to {filename}")
def create_vectorstore(self, documents: List[Document]) -> None:
    """Chunk *documents*, embed them into a FAISS index and save it to disk.

    Does nothing except print a message when *documents* is empty. The index
    is stored on ``self.vectorstore`` and written to
    ``self.config["vector_store_path"]``.
    """
    if len(documents) == 0:
        print("❌ No documents to create vector store")
        return None

    print("πŸ”„ Creating vector store...")

    chunks = self.text_splitter.split_documents(documents)
    print(f"πŸ“„ Split into {len(chunks)} chunks")

    self.vectorstore = FAISS.from_documents(chunks, self.embeddings)

    # Persist the index so later sessions can reload it.
    store_path = self.config["vector_store_path"]
    self.vectorstore.save_local(store_path)
    print(f"πŸ’Ύ Vector store saved to {store_path}")


def load_vectorstore(self) -> bool:
    """Reload a previously saved FAISS index into ``self.vectorstore``.

    Returns:
        ``True`` when the index was loaded, ``False`` on any error.
    """
    # NOTE(review): allow_dangerous_deserialization=True unpickles the stored
    # index; only point vector_store_path at files this notebook wrote itself.
    try:
        store_path = self.config["vector_store_path"]
        loaded = FAISS.load_local(
            store_path,
            self.embeddings,
            allow_dangerous_deserialization=True,
        )
        self.vectorstore = loaded
        print(f"βœ… Loaded existing vector store from {store_path}")
        return True
    except Exception as err:
        print(f"❌ Error loading vector store: {err}")
        return False


def search_relevant_docs(self, query: str, k: int = 5) -> List[Document]:
    """Return up to *k* documents most similar to *query*.

    Returns an empty list when no vector store is loaded or the search fails.
    """
    store = self.vectorstore
    if not store:
        print("❌ Vector store not initialized")
        return []

    try:
        matches = store.similarity_search(query, k=k)
    except Exception as err:
        print(f"❌ Error searching documents: {err}")
        return []

    # No score threshold is applied; this similarity search returns documents only.
    print(f"πŸ” Found {len(matches)} relevant documents for query: '{query}'")
    return matches
def retrieve_context(self, query: str, max_docs: int = 5) -> str:
    """Build the context string handed to the LLM for *query*.

    Each matched document becomes a ``Source N (url):`` header followed by
    the first 500 characters of its content and a trailing ellipsis;
    entries are separated by a blank line.

    Args:
        query: The user's question.
        max_docs: Maximum number of documents to request from the search.

    Returns:
        The formatted context, or a fixed "nothing found" sentence when the
        search returns no documents.
    """
    matches = self.search_relevant_docs(query, max_docs)
    if len(matches) == 0:
        return "No relevant information found in Noveum documentation."

    def _render(position, doc):
        source_url = doc.metadata.get('url', 'Unknown URL')
        excerpt = doc.page_content[:500]
        return f"Source {position} ({source_url}):\n{excerpt}..."

    return "\n\n".join(
        _render(position, doc) for position, doc in enumerate(matches, 1)
    )
def generate_rag_response(self, query: str) -> Dict[str, Any]:
    """Answer *query* from retrieved Noveum documentation and trace the run.

    Retrieves context with ``retrieve_context``, invokes ``self.llm`` on a
    RAG prompt and records three spans: the enclosing ``rag_agent`` span, a
    ``model_details`` span (parameters, latency, token usage) and an
    ``other_details`` span (length-based retrieval and response heuristics).

    Args:
        query: The user's question.

    Returns:
        A dict with ``answer``, ``context``, ``mode`` ("RAG"), ``sources``
        (URLs of the matched documents) and ``model_info`` (model name,
        token counts and latency in seconds).
    """

    def _usage_value(source, *names):
        """Return the first truthy counter among *names* from a dict or object."""
        for name in names:
            if isinstance(source, dict):
                value = source.get(name)
            else:
                value = getattr(source, name, None)
            if value:
                return value
        return 0

    # Use the injected config, as the web-search system does, not the global.
    max_results = self.config["max_search_results"]
    query_preview = f"{query[:100]}{'...' if len(query) > 100 else ''}"

    with trace_agent(
        agent_type="rag_agent",
        operation="llm-rag",
        capabilities=["document_retrieval", "context_generation", "response_generation"],
        attributes={
            "agent.id": "noveum_rag_agent",
            "input_query": query,
            "query_length": len(query)
        }
    ) as rag_span:

        context = self.retrieve_context(query, max_results)

        rag_prompt = f"""You are a helpful assistant for Noveum.ai. Answer the user's question based on the provided context from Noveum's documentation.

Context from Noveum documentation:
{context}

User Question: {query}

Instructions:
1. Answer based primarily on the provided context
2. If the context doesn't contain enough information, say so clearly
3. Be specific and cite sources when possible
4. Keep responses concise but informative
5. If the question is not related to Noveum, politely redirect to ask about Noveum

Answer:"""

        # Model parameters, read defensively because LLM wrappers differ.
        model_name = getattr(self.llm, 'model_name', 'unknown')
        model_temperature = getattr(self.llm, 'temperature', 0.0)
        model_max_tokens = getattr(self.llm, 'max_tokens', None)
        model_top_p = getattr(self.llm, 'top_p', None)
        model_frequency_penalty = getattr(self.llm, 'frequency_penalty', None)
        model_presence_penalty = getattr(self.llm, 'presence_penalty', None)

        # Model Details Span - model parameters, latency and token usage.
        with trace_agent(
            agent_type="model_details",
            operation="llm_model_execution",
            capabilities=["model_invocation", "parameter_tracking", "latency_measurement"],
            attributes={
                "agent.id": "noveum_model_details",
                "input_query": f"Model execution for query: {query_preview}",
                "query_length": len(query)
            }
        ) as model_span:

            model_start_time = time.time()
            response = self.llm.invoke(rag_prompt)
            model_end_time = time.time()
            model_latency = model_end_time - model_start_time

            # Fall back to str(response) when there is no usable ``content``.
            answer = getattr(response, "content", None) or str(response)
            answer_preview = f"{answer[:200]}{'...' if len(answer) > 200 else ''}"

            # Token usage may be a dict (LangChain ``usage_metadata`` and
            # ``response_metadata["token_usage"]``) or an object with
            # attributes, so both shapes are read.
            usage = getattr(response, "usage_metadata", None)
            response_metadata = getattr(response, "response_metadata", None)
            if usage:
                counters = usage
            elif response_metadata:
                counters = response_metadata.get("token_usage") or {}
            else:
                counters = getattr(response, "token_usage", None) or {}

            prompt_tokens = _usage_value(counters, "input_tokens", "prompt_tokens")
            completion_tokens = _usage_value(counters, "output_tokens", "completion_tokens")
            total_tokens = _usage_value(counters, "total_tokens") or prompt_tokens + completion_tokens

            # Nothing reported: estimate at ~4 characters per token.
            if total_tokens == 0:
                prompt_tokens = len(rag_prompt) // 4
                completion_tokens = len(answer) // 4
                total_tokens = prompt_tokens + completion_tokens

            model_span.set_attributes({
                # Input metrics
                "input_query": f"Model execution for query: {query_preview}",
                "query_length": len(query),
                "query_type": "rag_model_query",

                # Model parameters and configuration
                "model.name": model_name,
                "model.temperature": model_temperature,
                "model.max_tokens": model_max_tokens,
                "model.top_p": model_top_p,
                "model.frequency_penalty": model_frequency_penalty,
                "model.presence_penalty": model_presence_penalty,
                "model.provider": "openai",
                "model.type": "chat_completion",
                "model.version": "gpt-4o-mini",

                # Latency and performance metrics
                "model.latency_seconds": model_latency,
                "model.latency_ms": model_latency * 1000,
                "model.start_time": model_start_time,
                "model.end_time": model_end_time,
                "model.performance_tier": "fast" if model_latency < 2.0 else "medium" if model_latency < 5.0 else "slow",

                # Token usage and cost metrics
                "model.prompt_tokens": prompt_tokens,
                "model.completion_tokens": completion_tokens,
                "model.total_tokens": total_tokens,
                "model.tokens_per_second": total_tokens / model_latency if model_latency > 0 else 0,
                "model.estimated_cost": total_tokens * 0.00003,  # Rough cost estimate
                "model.efficiency_score": len(answer) / total_tokens if total_tokens > 0 else 0,

                # Response characteristics
                "model.response_length": len(answer),
                "model.response_quality": "high" if len(answer) > 200 else "medium" if len(answer) > 100 else "low",
                "model.output_response": f"Model Response: {answer_preview}",

                # Model configuration details
                "model.config": {
                    "name": model_name,
                    "temperature": model_temperature,
                    "max_tokens": model_max_tokens,
                    "top_p": model_top_p,
                    "frequency_penalty": model_frequency_penalty,
                    "presence_penalty": model_presence_penalty,
                    "provider": "openai",
                    "type": "chat_completion"
                }
            })

        # Other Details Span - retrieval, response quality and evaluation metrics.
        with trace_agent(
            agent_type="other_details",
            operation="rag_evaluation_metrics",
            capabilities=["retrieval_analysis", "response_evaluation", "quality_assessment"],
            attributes={
                "agent.id": "noveum_other_details",
                "input_query": f"Evaluation for query: {query_preview}",
                "query_length": len(query)
            }
        ) as rag_node:

            context_length = len(context)
            answer_length = len(answer)
            # Heuristic: counts the word "Source" in the context, which
            # includes any occurrence inside the document text itself.
            sources_count = context.count("Source")

            rag_node.set_attributes({
                # Input metrics
                "input_query": f"Evaluation for query: {query_preview}",
                "query_length": len(query),
                "query_type": "rag_evaluation_query",

                # Retrieval metrics
                "retrieval.context_retrieved": f"Context: {context[:300]}{'...' if len(context) > 300 else ''}",
                "retrieval.context_length": context_length,
                "retrieval.sources_count": sources_count,
                "retrieval.context_quality": "high" if context_length > 500 else "medium" if context_length > 200 else "low",
                "retrieval.effectiveness": sources_count / 5.0,  # Normalized to max expected sources
                "retrieval.context_utilization": context_length / 1000.0,  # Normalized context usage

                # Prompt engineering metrics
                "prompt.complete_prompt": rag_prompt,
                "prompt.prompt_length": len(rag_prompt),
                "prompt.context_injection": f"Context injected: {context[:200]}{'...' if len(context) > 200 else ''}",
                "prompt.instruction_following": "rag_optimized",

                # Response quality metrics
                "response.answer_length": answer_length,
                "response.answer_completeness": "complete" if answer_length > 100 else "brief",
                "response.response_quality": "high" if answer_length > 200 and sources_count > 2 else "medium" if answer_length > 100 else "low",
                "response.source_citation": sources_count,
                "response.context_utilization": answer_length / context_length if context_length > 0 else 0,
                "output_response": f"RAG Answer: {answer_preview}",

                # Evaluation metrics
                "evaluation.retrieval_effectiveness": sources_count / 5.0,
                "evaluation.response_completeness": "complete" if answer_length > 150 else "partial",
                "evaluation.source_diversity": sources_count,
                "evaluation.context_relevance": "high" if context_length > 500 else "medium" if context_length > 200 else "low",
                "evaluation.overall_quality": "high" if answer_length > 200 and sources_count > 2 and context_length > 500 else "medium" if answer_length > 100 and sources_count > 1 else "low",
                "evaluation.ready_for_production": True,

                # RAG-specific metrics
                "rag.retrieval_strategy": "semantic_similarity",
                "rag.vector_search_results": sources_count,
                "rag.context_synthesis": "multi_source" if sources_count > 1 else "single_source",
                "rag.document_coverage": sources_count / 5.0,  # Normalized coverage
                "rag.information_density": answer_length / context_length if context_length > 0 else 0
            })

        # Main RAG span attributes (simplified)
        rag_span.set_attributes({
            "input_query": query,
            "query_length": len(query),
            "query_type": "rag_query",
            "output_response": f"RAG Answer: {answer_preview}",
            "rag.context_length": context_length,
            "rag.sources_count": sources_count,
            "rag.answer_length": answer_length,
            "rag.mode": "retrieval_augmented_generation"
        })

        # NOTE(review): this repeats the vector search that retrieve_context
        # already ran; returning the documents from retrieve_context would
        # remove the second lookup.
        source_docs = self.search_relevant_docs(query, max_results)

        return {
            "answer": answer,
            "context": context,
            "mode": "RAG",
            "sources": [doc.metadata.get('url', 'Unknown') for doc in source_docs],
            "model_info": {
                "name": model_name,
                "tokens_used": total_tokens,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "latency": model_latency
            }
        }
def search_web(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Run a web search and return at most *max_results* result dicts.

    When the search tool returns plain text, each non-empty line becomes a
    result with ``title``, ``snippet`` and ``url`` keys, where ``url`` is the
    DuckDuckGo search URL for the query. Any other return value is assumed
    to be a sequence and is sliced to *max_results*.

    Args:
        query: Search terms.
        max_results: Upper bound on the number of results returned.

    Returns:
        The list of results; empty when the search raises.
    """
    from urllib.parse import quote_plus

    try:
        search_results = self.web_search.run(query)

        if isinstance(search_results, str):
            # Percent-encode the query so '&', '#', '?' cannot corrupt the URL.
            search_url = f"https://duckduckgo.com/?q={quote_plus(query)}"
            # Drop blank lines before applying the limit so callers get up
            # to max_results real snippets, numbered without gaps.
            snippets = [line.strip() for line in search_results.split('\n') if line.strip()]
            results = [
                {
                    "title": f"Search Result {position}",
                    "snippet": snippet,
                    "url": search_url
                }
                for position, snippet in enumerate(snippets[:max_results], 1)
            ]
        else:
            # Already structured (list-like) output.
            results = search_results[:max_results]

        print(f"πŸ” Found {len(results)} web search results for: '{query}'")
        return results

    except Exception as e:
        # Best-effort: a failed search yields an empty result list.
        print(f"❌ Error performing web search: {e}")
        return []


def format_search_context(self, search_results: List[Dict[str, Any]]) -> str:
    """Render search results as the context block for the LLM prompt.

    Args:
        search_results: Result dicts with optional ``title``, ``snippet``
            and ``url`` keys; missing keys get placeholder text.

    Returns:
        One ``Source N - title`` entry per result, separated by blank lines,
        or "No search results found." for an empty list.
    """
    if not search_results:
        return "No search results found."

    context_parts = []
    for i, result in enumerate(search_results, 1):
        title = result.get('title', f'Result {i}')
        snippet = result.get('snippet', 'No description available')
        url = result.get('url', 'No URL available')
        context_parts.append(f"Source {i} - {title}:\n{snippet}\nURL: {url}")

    return "\n\n".join(context_parts)
def generate_web_response(self, query: str) -> Dict[str, Any]:
    """Answer *query* from live web-search results and trace the run.

    Searches with ``search_web``, formats the hits with
    ``format_search_context``, invokes ``self.llm`` and records three
    spans: the enclosing ``web_search_agent`` span, a ``model_details`` span
    (parameters, latency, token usage) and an ``other_details`` span
    (length-based search and response heuristics).

    Args:
        query: The user's question.

    Returns:
        A dict with ``answer``, ``context``, ``mode`` ("Web Search"),
        ``sources`` (result URLs) and ``model_info`` (model name, token
        counts and latency in seconds).
    """

    def _usage_value(source, *names):
        """Return the first truthy counter among *names* from a dict or object."""
        for name in names:
            if isinstance(source, dict):
                value = source.get(name)
            else:
                value = getattr(source, name, None)
            if value:
                return value
        return 0

    query_preview = f"{query[:100]}{'...' if len(query) > 100 else ''}"

    with trace_agent(
        agent_type="web_search_agent",
        operation="web_search_generation",
        capabilities=["web_search", "content_synthesis", "response_generation"],
        attributes={
            "agent.id": "noveum_web_search_agent",
            "input_query": query,
            "query_length": len(query)
        }
    ) as web_span:

        search_results = self.search_web(query, self.config["max_search_results"])
        context = self.format_search_context(search_results)

        web_prompt = f"""You are a helpful assistant. Answer the user's question based on the provided web search results.

Web Search Results:
{context}

User Question: {query}

Instructions:
1. Answer based on the provided web search results
2. Synthesize information from multiple sources when relevant
3. Be informative and accurate
4. If the results don't contain enough information, say so clearly
5. Keep responses concise but comprehensive
6. Cite sources when possible

Answer:"""

        # Model parameters, read defensively because LLM wrappers differ.
        model_name = getattr(self.llm, 'model_name', 'unknown')
        model_temperature = getattr(self.llm, 'temperature', 0.0)
        model_max_tokens = getattr(self.llm, 'max_tokens', None)
        model_top_p = getattr(self.llm, 'top_p', None)
        model_frequency_penalty = getattr(self.llm, 'frequency_penalty', None)
        model_presence_penalty = getattr(self.llm, 'presence_penalty', None)

        # Model Details Span - model parameters, latency and token usage.
        with trace_agent(
            agent_type="model_details",
            operation="llm_model_execution",
            capabilities=["model_invocation", "parameter_tracking", "latency_measurement"],
            attributes={
                "agent.id": "noveum_model_details",
                "input_query": f"Model execution for query: {query_preview}",
                "query_length": len(query)
            }
        ) as model_span:

            model_start_time = time.time()
            response = self.llm.invoke(web_prompt)
            model_end_time = time.time()
            model_latency = model_end_time - model_start_time

            # The response may be an SDK message, a plain dict or something else.
            if hasattr(response, 'content'):
                answer = response.content
            elif isinstance(response, dict):
                answer = response.get('content', '')
            else:
                answer = str(response)
            # Normalise a missing answer so the length metrics cannot raise.
            if answer is None:
                answer = ""
            answer_preview = f"{answer[:200]}{'...' if len(answer) > 200 else ''}"

            # Token usage may be a dict (LangChain ``usage_metadata`` and
            # ``response_metadata["token_usage"]``) or an object with
            # attributes, so both shapes are read.
            usage = getattr(response, "usage_metadata", None)
            response_metadata = getattr(response, "response_metadata", None)
            if usage:
                counters = usage
            elif response_metadata:
                counters = response_metadata.get("token_usage") or {}
            else:
                counters = getattr(response, "token_usage", None) or {}

            prompt_tokens = _usage_value(counters, "input_tokens", "prompt_tokens")
            completion_tokens = _usage_value(counters, "output_tokens", "completion_tokens")
            total_tokens = _usage_value(counters, "total_tokens") or prompt_tokens + completion_tokens

            # Nothing reported: estimate at ~4 characters per token.
            if total_tokens == 0:
                prompt_tokens = len(web_prompt) // 4
                completion_tokens = len(answer) // 4
                total_tokens = prompt_tokens + completion_tokens

            model_span.set_attributes({
                # Input metrics
                "input_query": f"Model execution for query: {query_preview}",
                "query_length": len(query),
                "query_type": "web_search_model_query",

                # Model parameters and configuration
                "model.name": model_name,
                "model.temperature": model_temperature,
                "model.max_tokens": model_max_tokens,
                "model.top_p": model_top_p,
                "model.frequency_penalty": model_frequency_penalty,
                "model.presence_penalty": model_presence_penalty,
                "model.provider": "openai",
                "model.type": "chat_completion",
                "model.version": "gpt-4o-mini",

                # Latency and performance metrics
                "model.latency_seconds": model_latency,
                "model.latency_ms": model_latency * 1000,
                "model.start_time": model_start_time,
                "model.end_time": model_end_time,
                "model.performance_tier": "fast" if model_latency < 2.0 else "medium" if model_latency < 5.0 else "slow",

                # Token usage and cost metrics
                "model.prompt_tokens": prompt_tokens,
                "model.completion_tokens": completion_tokens,
                "model.total_tokens": total_tokens,
                "model.tokens_per_second": total_tokens / model_latency if model_latency > 0 else 0,
                "model.estimated_cost": total_tokens * 0.00003,  # Rough cost estimate
                "model.efficiency_score": len(answer) / total_tokens if total_tokens > 0 else 0,

                # Response characteristics
                "model.response_length": len(answer),
                "model.response_quality": "high" if len(answer) > 200 else "medium" if len(answer) > 100 else "low",
                "model.output_response": f"Model Response: {answer_preview}",

                # Model configuration details
                "model.config": {
                    "name": model_name,
                    "temperature": model_temperature,
                    "max_tokens": model_max_tokens,
                    "top_p": model_top_p,
                    "frequency_penalty": model_frequency_penalty,
                    "presence_penalty": model_presence_penalty,
                    "provider": "openai",
                    "type": "chat_completion"
                }
            })

        # Other Details Span - web search, response quality and evaluation metrics.
        with trace_agent(
            agent_type="other_details",
            operation="web_search_evaluation_metrics",
            capabilities=["web_search_analysis", "response_evaluation", "quality_assessment"],
            attributes={
                "agent.id": "noveum_other_details",
                "input_query": f"Evaluation for query: {query_preview}",
                "query_length": len(query)
            }
        ) as other_span:

            search_results_count = len(search_results)
            context_length = len(context)
            answer_length = len(answer)

            other_span.set_attributes({
                # Input metrics
                "input_query": f"Evaluation for query: {query_preview}",
                "query_length": len(query),
                "query_type": "web_search_evaluation_query",

                # Web search metrics
                "web_search.results_count": search_results_count,
                "web_search.context_length": context_length,
                "web_search.context_synthesized": f"Context from {search_results_count} web sources",
                "web_search.search_effectiveness": search_results_count / 5.0,  # Normalized to max expected results
                "web_search.context_quality": "high" if context_length > 800 else "medium" if context_length > 400 else "low",
                "web_search.source_diversity": search_results_count,
                "web_search.information_synthesis": "high" if search_results_count > 3 and answer_length > 200 else "medium" if search_results_count > 1 else "low",
                "web_search.external_knowledge_utilization": context_length / 1000.0,  # Normalized context usage

                # Prompt engineering metrics
                "prompt.complete_prompt": web_prompt,
                "prompt.prompt_length": len(web_prompt),
                "prompt.context_injection": f"Web context injected: {context[:200]}{'...' if len(context) > 200 else ''}",
                "prompt.instruction_following": "web_search_optimized",

                # Response quality metrics
                "response.answer_length": answer_length,
                "response.answer_completeness": "complete" if answer_length > 150 else "brief",
                "response.response_quality": "high" if answer_length > 300 and search_results_count > 3 else "medium" if answer_length > 150 else "low",
                "response.source_citation": search_results_count,
                "response.context_utilization": answer_length / context_length if context_length > 0 else 0,
                "output_response": f"Web Search Answer: {answer_preview}" if answer else "No answer generated",

                # Evaluation metrics
                "evaluation.search_effectiveness": search_results_count / 5.0,
                "evaluation.response_completeness": "complete" if answer_length > 200 else "partial",
                "evaluation.source_diversity": search_results_count,
                "evaluation.context_relevance": "high" if context_length > 800 else "medium" if context_length > 400 else "low",
                "evaluation.overall_quality": "high" if answer_length > 300 and search_results_count > 3 and context_length > 800 else "medium" if answer_length > 150 and search_results_count > 1 else "low",
                "evaluation.ready_for_production": True,

                # Web search specific metrics
                "web_search.search_strategy": "duckduckgo_api",
                "web_search.real_time_data": True,
                "web_search.external_sources": search_results_count,
                "web_search.information_freshness": "current",
                "web_search.knowledge_synthesis": "multi_source" if search_results_count > 1 else "single_source"
            })

        # Main Web Search span attributes (simplified)
        web_span.set_attributes({
            "input_query": query,
            "query_length": len(query),
            "query_type": "web_search_query",
            "output_response": f"Web Search Answer: {answer_preview}" if answer else "No answer generated",
            "web_search.results_count": search_results_count,
            "web_search.context_length": context_length,
            "web_search.response_length": answer_length,
            "web_search.mode": "external_web_search"
        })

        return {
            "answer": answer,
            "context": context,
            "mode": "Web Search",
            "sources": [result.get('url', 'Unknown') for result in search_results],
            "model_info": {
                "name": model_name,
                "tokens_used": total_tokens,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "latency": model_latency
            }
        }
# Cell 6: Query Router - Intelligent decision making between RAG and Web Search
class NoveumQueryRouter:
    """Decide whether a query is answered by the RAG system or by Web Search.

    Classification is keyword-first: a query that mentions "noveum" always
    goes to RAG; otherwise RAG and Web Search keyword hits are compared, and
    the LLM is consulted only when no keyword matches at all.

    ``trace_agent``, ``rag_system`` and ``web_search_system`` are not defined
    in this class; they are resolved as notebook-level globals at call time.
    """

    # Prompt sent to the LLM when keyword matching is inconclusive.
    _CLASSIFICATION_PROMPT = (
        "Classify the following user query to determine the best response method:\n"
        "\n"
        'Query: "{query}"\n'
        "\n"
        "Choose between:\n"
        "- RAG: Use when the query is about Noveum.ai platform, products, features, documentation, or internal information\n"
        "- Web Search: Use when the query is about recent events, news, general knowledge, or external topics\n"
        "\n"
        'Respond with only "RAG" or "Web Search".'
    )

    def __init__(self, llm, config):
        """Store the LLM client and config and define the routing keywords.

        Args:
            llm: Object exposing ``invoke(prompt)``; used only for the LLM
                fallback classification.
            config: Configuration mapping (stored, not read by this class).
        """
        self.llm = llm
        self.config = config

        # Keywords that suggest RAG should be used
        self.rag_keywords = [
            "noveum", "platform", "product", "feature", "api", "documentation",
            "trace", "observability", "monitoring", "agent", "system", "tool",
            "integration", "setup", "configuration", "usage", "guide", "tutorial",
            "pricing", "plan", "subscription", "account", "dashboard", "metrics"
        ]

        # Keywords that suggest Web Search should be used
        self.web_keywords = [
            "recent", "latest", "news", "update", "announcement", "release",
            "today", "yesterday", "this week", "this month", "current",
            "trending", "popular", "viral", "breaking", "live", "real-time",
            "weather", "stock", "price", "market", "cryptocurrency", "bitcoin",
            "election", "politics", "sports", "entertainment", "celebrity"
        ]

    @staticmethod
    def _count_keyword_hits(keywords, text: str) -> int:
        """Count the keywords that occur in *text* at the start of a word.

        Anchoring at a word start keeps plurals and suffixes matching
        ("features", "apis") while rejecting accidental substrings such as
        "api" inside "rapid" or "capital".
        """
        import re  # local import so the cell does not depend on a file-level one

        return sum(
            1
            for keyword in keywords
            if re.search(r"(?<!\w)" + re.escape(keyword), text)
        )

    def _keyword_scores(self, query_lower: str) -> Tuple[int, int]:
        """Return ``(rag_score, web_score)`` for an already lower-cased query."""
        return (
            self._count_keyword_hits(self.rag_keywords, query_lower),
            self._count_keyword_hits(self.web_keywords, query_lower),
        )

    @staticmethod
    def _extract_token_usage(response, prompt_text: str, result_text: str) -> Tuple[int, int, int]:
        """Return ``(prompt_tokens, completion_tokens, total_tokens)``.

        Usage is read from ``response.usage_metadata`` first, then from
        ``response.response_metadata["token_usage"]``. Both mapping-style and
        attribute-style containers are accepted. When nothing is reported the
        counts are estimated at roughly four characters per token.
        """
        def read(container, *names):
            # First truthy value found under any of the candidate names.
            for name in names:
                if isinstance(container, dict):
                    value = container.get(name)
                else:
                    value = getattr(container, name, None)
                if value:
                    return value
            return 0

        prompt_tokens = completion_tokens = total_tokens = 0
        usage = getattr(response, "usage_metadata", None)
        metadata = getattr(response, "response_metadata", None)

        if usage:
            prompt_tokens = read(usage, "input_tokens", "prompt_tokens")
            completion_tokens = read(usage, "output_tokens", "completion_tokens")
            total_tokens = read(usage, "total_tokens")
        elif metadata:
            token_usage = read(metadata, "token_usage") or {}
            prompt_tokens = read(token_usage, "prompt_tokens")
            completion_tokens = read(token_usage, "completion_tokens")
            total_tokens = read(token_usage, "total_tokens")

        if total_tokens == 0:
            if prompt_tokens or completion_tokens:
                # Parts were reported without a total: derive it.
                total_tokens = prompt_tokens + completion_tokens
            else:
                # If still no tokens found, estimate
                prompt_tokens = len(prompt_text) // 4
                completion_tokens = len(result_text) // 4
                total_tokens = prompt_tokens + completion_tokens

        return prompt_tokens, completion_tokens, total_tokens

    def classify_query(self, query: str) -> str:
        """Classify query to determine whether to use RAG or Web Search.

        Returns:
            ``"RAG"`` or ``"Web Search"``. Ties between the two keyword
            scores resolve to ``"RAG"``.
        """
        query_lower = query.lower()

        # Check for explicit mentions of Noveum
        if "noveum" in query_lower:
            return "RAG"

        rag_score, web_score = self._keyword_scores(query_lower)

        # If both scores are 0, use LLM-based classification
        if rag_score == 0 and web_score == 0:
            return self._llm_classify_query(query)

        # Return the mode with higher score
        return "RAG" if rag_score >= web_score else "Web Search"

    def _llm_classify_query(self, query: str) -> str:
        """Use LLM to classify query when keyword matching is inconclusive.

        The call is wrapped in a ``model_details`` trace span that records
        model parameters, latency and token usage. Any exception, including
        one raised while recording metrics, is printed and results in
        ``"Web Search"``.
        """
        try:
            classification_prompt = self._CLASSIFICATION_PROMPT.format(query=query)
            query_preview = (
                f"Model execution for classification: {query[:100]}{'...' if len(query) > 100 else ''}"
            )

            # Extract model parameters for tracking
            model_name = getattr(self.llm, 'model_name', 'unknown')
            model_temperature = getattr(self.llm, 'temperature', 0.0)
            model_max_tokens = getattr(self.llm, 'max_tokens', None)
            model_top_p = getattr(self.llm, 'top_p', None)
            model_frequency_penalty = getattr(self.llm, 'frequency_penalty', None)
            model_presence_penalty = getattr(self.llm, 'presence_penalty', None)

            # Model Details Span for classification
            with trace_agent(
                agent_type="model_details",
                operation="llm_model_execution",
                capabilities=["model_invocation", "parameter_tracking", "latency_measurement"],
                attributes={
                    "agent.id": "noveum_model_details",
                    "input_query": query_preview,
                    "query_length": len(query)
                }
            ) as model_span:

                # Wall-clock timestamps: they are also exported as span attributes.
                model_start_time = time.time()
                response = self.llm.invoke(classification_prompt)
                model_end_time = time.time()
                model_latency = model_end_time - model_start_time

                if hasattr(response, 'content'):
                    result = response.content.strip().upper()
                else:
                    result = str(response).strip().upper()

                prompt_tokens, completion_tokens, total_tokens = self._extract_token_usage(
                    response, classification_prompt, result
                )

                # Set model details span attributes
                model_span.set_attributes({
                    # Input metrics
                    "input_query": query_preview,
                    "query_length": len(query),
                    "query_type": "classification_model_query",

                    # Model parameters and configuration
                    "model.name": model_name,
                    "model.temperature": model_temperature,
                    "model.max_tokens": model_max_tokens,
                    "model.top_p": model_top_p,
                    "model.frequency_penalty": model_frequency_penalty,
                    "model.presence_penalty": model_presence_penalty,
                    "model.provider": "openai",
                    "model.type": "chat_completion",
                    "model.version": "gpt-4o-mini",

                    # Latency and performance metrics
                    "model.latency_seconds": model_latency,
                    "model.latency_ms": model_latency * 1000,
                    "model.start_time": model_start_time,
                    "model.end_time": model_end_time,
                    "model.performance_tier": "fast" if model_latency < 1.0 else "medium" if model_latency < 3.0 else "slow",

                    # Token usage and cost metrics
                    "model.prompt_tokens": prompt_tokens,
                    "model.completion_tokens": completion_tokens,
                    "model.total_tokens": total_tokens,
                    "model.tokens_per_second": total_tokens / model_latency if model_latency > 0 else 0,
                    "model.estimated_cost": total_tokens * 0.00003,  # Rough cost estimate
                    "model.efficiency_score": len(result) / total_tokens if total_tokens > 0 else 0,

                    # Response characteristics
                    "model.response_length": len(result),
                    "model.response_quality": "high" if len(result) > 10 else "medium" if len(result) > 5 else "low",
                    "model.output_response": f"Classification Result: {result}",

                    # Model configuration details
                    "model.config": {
                        "name": model_name,
                        "temperature": model_temperature,
                        "max_tokens": model_max_tokens,
                        "top_p": model_top_p,
                        "frequency_penalty": model_frequency_penalty,
                        "presence_penalty": model_presence_penalty,
                        "provider": "openai",
                        "type": "chat_completion"
                    }
                })

            # Log classification details for debugging
            print(f"🔍 LLM Classification - Model: {model_name}, Tokens: {total_tokens}, Result: {result}")

            if "RAG" in result:
                return "RAG"
            # "WEB"/"SEARCH" and anything unclear both default to Web Search.
            return "Web Search"

        except Exception as e:
            print(f"❌ Error in LLM classification: {e}")
            # Default to Web Search on error
            return "Web Search"

    def route_query(self, query: str) -> Tuple[str, Dict[str, Any]]:
        """Route query to appropriate system and return response.

        Returns:
            ``(mode, response)`` where ``mode`` is ``"RAG"`` or
            ``"Web Search"`` and ``response`` is whatever the selected
            system's generate method returned.
        """
        with trace_agent(
            agent_type="query_router",
            operation="query_routing",
            capabilities=["query_classification", "routing_decision"],
            attributes={
                "agent.id": "noveum_query_router",
                "input_query": query,
                "query_length": len(query)
            }
        ) as router_span:

            # Classify the query
            mode = self.classify_query(query)

            # Calculate routing evaluation metrics from the same keyword
            # lists the classifier uses, so the two can never drift apart.
            query_lower = query.lower()
            mentions_noveum = "noveum" in query_lower
            rag_score, web_score = self._keyword_scores(query_lower)
            total_hits = rag_score + web_score
            confidence_score = abs(rag_score - web_score) / max(total_hits, 1)

            confidence_band = "high" if confidence_score > 0.5 else "medium" if confidence_score > 0.2 else "low"
            # The LLM is consulted only when nothing matched and "noveum" is absent.
            llm_used = total_hits == 0 and not mentions_noveum
            word_count = max(len(query.split()), 1)  # guard: empty/whitespace query
            keyword_count = max(len(self.rag_keywords) + len(self.web_keywords), 1)
            evaluation_query = f"Routing evaluation for query: {query[:100]}{'...' if len(query) > 100 else ''}"

            # Other Details Span - Track routing analysis and decision metrics
            with trace_agent(
                agent_type="other_details",
                operation="routing_evaluation_metrics",
                capabilities=["routing_analysis", "decision_evaluation", "quality_assessment"],
                attributes={
                    "agent.id": "noveum_other_details",
                    "input_query": evaluation_query,
                    "query_length": len(query)
                }
            ) as other_span:

                # Set other details span attributes
                other_span.set_attributes({
                    # Input metrics
                    "input_query": evaluation_query,
                    "query_length": len(query),
                    "query_type": "routing_evaluation_query",

                    # Classification metrics
                    "classification.mode": mode,
                    "classification.rag_keyword_score": rag_score,
                    "classification.web_keyword_score": web_score,
                    "classification.confidence_score": confidence_score,
                    "classification.confidence_level": confidence_band,
                    "classification.method": "llm_based" if llm_used else "keyword_based",

                    # Query analysis metrics
                    "query.complexity": "complex" if len(query) > 50 else "medium" if len(query) > 20 else "simple",
                    "query.intent": "noveum_specific" if mentions_noveum else "general_knowledge" if web_score > rag_score else "documentation",
                    "query.keyword_density": total_hits / word_count,
                    "query.domain_affinity": "noveum" if rag_score > web_score else "general" if web_score > rag_score else "neutral",

                    # Routing decision metrics
                    "routing.decision": f"Routed to {mode} based on analysis",
                    "routing.rationale": f"RAG score: {rag_score}, Web score: {web_score}, Confidence: {confidence_score:.2f}",
                    "routing.expected_performance": confidence_band,
                    "routing.alternative_mode": "Web Search" if mode == "RAG" else "RAG",
                    "routing.decision_confidence": confidence_score,

                    # Evaluation metrics
                    "evaluation.routing_accuracy": confidence_band,
                    "evaluation.keyword_coverage": total_hits / keyword_count,
                    "evaluation.query_understanding": "clear" if confidence_score > 0.5 else "ambiguous" if confidence_score > 0.2 else "unclear",
                    "evaluation.ready_for_production": True,

                    # Router-specific metrics
                    "router.classification_strategy": "hybrid_keyword_llm",
                    "router.keyword_matching": "bypassed" if llm_used else "used",
                    "router.llm_fallback": "used" if llm_used else "not_needed",
                    "router.decision_time": "llm_required" if llm_used else "instant",
                    "output_response": f"Routing Decision: {mode} (Confidence: {confidence_score:.2f})"
                })

            # Set main router span attributes (simplified)
            router_span.set_attributes({
                "input_query": query,
                "query_length": len(query),
                "query_type": "routing_query",
                "output_response": f"Routed to {mode} for query processing",
                "router.classification": mode,
                "router.confidence_score": confidence_score,
                "router.rag_keyword_score": rag_score,
                "router.web_keyword_score": web_score,
                "router.mode": "intelligent_routing"
            })

            # Route to appropriate system (both are notebook-level globals)
            if mode == "RAG":
                print(f"🧠 Routing to RAG system for: '{query}'")
                response = rag_system.generate_rag_response(query)
            else:
                print(f"🌐 Routing to Web Search for: '{query}'")
                response = web_search_system.generate_web_response(query)

            return mode, response
# Cell 7: Main Executor - Orchestrates the complete agent workflow
class NoveumAIAgent:
    """Top-level agent: prepares the RAG data, routes queries, prints answers.

    ``trace_operation`` is not defined in this class; it is resolved as a
    notebook-level global at call time.
    """

    def __init__(self, scraper, rag_system, web_search_system, query_router, config):
        """Store collaborators; the agent starts uninitialized.

        Args:
            scraper: Provides ``scrape_website()`` and ``save_to_json(path)``.
            rag_system: Provides ``load_documents_from_json(path)``,
                ``load_vectorstore()``, ``create_vectorstore(documents)`` and
                a ``vectorstore`` attribute.
            web_search_system: Stored for reference; not called by this class.
            query_router: Provides ``route_query(query)``.
            config: Mapping read for ``"noveum_docs_file"`` and
                ``"noveum_base_url"``.
        """
        self.scraper = scraper
        self.rag_system = rag_system
        self.web_search_system = web_search_system
        self.query_router = query_router
        self.config = config
        self.is_initialized = False

    def initialize_system(self, force_scrape: bool = False) -> bool:
        """Initialize the system by setting up RAG with scraped data.

        Scrapes the website when ``force_scrape`` is set or the docs file is
        missing, then loads the documents and loads or creates the vector
        store.

        Returns:
            ``True`` on success, ``False`` when scraping or document loading
            produced nothing.
        """
        print("🚀 Initializing Noveum AI Agent...")

        with trace_operation("system_initialization") as init_span:
            init_span.set_attributes({
                "system.force_scrape": force_scrape,
                "system.config": self.config,
                "input_query": f"Initialize system with force_scrape={force_scrape}",
                "output_response": "System initialization: RAG system loaded, vector store ready, agent operational"
            })

            docs_file = self.config["noveum_docs_file"]

            # Check if we need to scrape or if data already exists
            if force_scrape or not os.path.exists(docs_file):
                print("📥 Scraping Noveum website...")

                # Scrape the website
                scraped_data = self.scraper.scrape_website()

                if not scraped_data:
                    print("❌ Failed to scrape website data")
                    return False

                # Save scraped data
                self.scraper.save_to_json(docs_file)

                total_content_length = sum(page["content_length"] for page in scraped_data)
                init_span.add_event("website_scraped", {
                    "input_query": f"Scrape website: {self.config['noveum_base_url']}",
                    "output_response": f"Website scraping completed: {len(scraped_data)} pages scraped, {total_content_length} total characters extracted for RAG system",
                    "pages_scraped": len(scraped_data),
                    "total_content_length": total_content_length
                })
            else:
                print("📁 Using existing scraped data...")

            # Load documents and create/load vector store
            documents = self.rag_system.load_documents_from_json(docs_file)

            if not documents:
                print("❌ Failed to load documents")
                return False

            # Try to load existing vector store, create if doesn't exist
            if not self.rag_system.load_vectorstore():
                print("🔄 Creating new vector store...")
                self.rag_system.create_vectorstore(documents)

            self.is_initialized = True
            print("✅ Noveum AI Agent initialized successfully!")

            init_span.set_attributes({
                "system.initialized": True,
                "system.documents_loaded": len(documents),
                "system.vectorstore_ready": self.rag_system.vectorstore is not None
            })

            return True

    def process_query(self, query: str) -> Dict[str, Any]:
        """Process a user query and return response.

        Returns:
            The router's response dict with ``processing_time`` and
            ``timestamp`` added. If the agent is uninitialized or routing
            raises, a dict with ``mode == "Error"`` and an ``error`` message
            is returned instead of raising.
        """
        if not self.is_initialized:
            print("❌ System not initialized. Please run initialize_system() first.")
            return {
                "answer": "System not initialized. Please run initialize_system() first.",
                "mode": "Error",
                "sources": [],
                "error": "System not initialized"
            }

        print(f"\n🎯 Processing query: '{query}'")

        with trace_operation("tool-orchestator") as process_span:
            process_span.set_attributes({
                "input_query": query,
                "query.length": len(query)
            })

            start_time = time.time()

            try:
                # Route query and get response
                mode, response = self.query_router.route_query(query)

                # Add processing metrics
                end_time = time.time()
                processing_time = end_time - start_time

                response.update({
                    "processing_time": processing_time,
                    "timestamp": end_time
                })

                # A system may report "answer": None / "sources": None; treat
                # those as empty so metric recording cannot turn a valid
                # response into an error.
                answer = response.get("answer") or ""
                sources = response.get("sources") or []
                answer_preview = f"{answer[:200]}{'...' if len(answer) > 200 else ''}"

                # Add metrics to span
                process_span.set_attributes({
                    "processing.mode": mode,
                    "processing.time_seconds": processing_time,
                    "processing.response_length": len(answer),
                    "processing.sources_count": len(sources),
                    "output_response": f"Final Answer: {answer_preview}",
                    "final_answer_mode": mode,
                    "query_processed.input_query": query,
                    "query_processed.output_response": f"Successfully processed query using {mode}, generated {len(answer)} character response",
                    "query_processed.mode": mode,
                    "query_processed.processing_time": processing_time,
                    "query_processed.response_length": len(answer)
                })

                print(f"✅ Query processed in {processing_time:.2f}s using {mode}")
                return response

            except Exception as e:
                # Top-level boundary: report the failure to the caller as data.
                error_msg = f"Error processing query: {str(e)}"
                print(f"❌ {error_msg}")

                process_span.add_event("query_processing_error", {
                    "error": str(e),
                    "input_query": query,
                    "output_response": f"I encountered an error while processing your query: {str(e)}"
                })

                failed_at = time.time()
                return {
                    "answer": f"I encountered an error while processing your query: {str(e)}",
                    "mode": "Error",
                    "sources": [],
                    "error": str(e),
                    "processing_time": failed_at - start_time,
                    "timestamp": failed_at
                }

    def display_response(self, response: Dict[str, Any]) -> None:
        """Display the response in a formatted way.

        Missing keys fall back to defaults; at most the first three sources
        are listed, followed by a count of the remainder.
        """
        sources = response.get('sources') or []
        shown_at = time.localtime(response.get('timestamp', time.time()))

        print("\n" + "="*80)
        print("🤖 NOVEUM AI AGENT RESPONSE")
        print("="*80)
        print(f"📊 Mode: {response.get('mode', 'Unknown')}")
        print(f"⏱️ Processing Time: {response.get('processing_time', 0):.2f}s")
        print(f"📅 Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S', shown_at)}")

        if sources:
            print(f"📚 Sources ({len(sources)}):")
            for i, source in enumerate(sources[:3], 1):  # Show first 3 sources
                print(f"   {i}. {source}")
            if len(sources) > 3:
                print(f"   ... and {len(sources) - 3} more")

        print("\n💬 Answer:")
        print("-" * 40)
        # "or" (not a .get default) so an explicit None does not print "None".
        print(response.get('answer') or 'No answer provided')
        print("="*80)
# Cell 8: Usage Examples and Demo
def demo_noveum_agent(queries=None, delay_seconds: float = 1.0):
    """Demo function showing how to use the Noveum AI Agent.

    Initializes the notebook-global ``noveum_agent`` (without forcing a
    re-scrape), then processes and displays each query in turn.

    Args:
        queries: Optional iterable of query strings. Defaults to the built-in
            set of 20 questions (10 Noveum-specific, 10 external/recent).
        delay_seconds: Pause between consecutive queries; no pause follows
            the last one. Use 0 to disable.
    """
    print("🎬 NOVEUM AI AGENT DEMO")
    print("="*50)

    # Step 1: Initialize the system
    print("\n1️⃣ Initializing the system...")
    success = noveum_agent.initialize_system(force_scrape=False)  # Set to True to force re-scraping

    if not success:
        print("❌ Failed to initialize system")
        return

    # Step 2: Demo queries - 20 comprehensive test questions
    if queries is None:
        demo_queries = [
            # RAG Queries (Noveum-specific)
            "What is Noveum and what does it do?",  # Basic product info
            "How do I integrate Noveum Trace in my application?",  # Technical integration
            "What are Noveum's pricing plans?",  # Pricing information
            "What features does Noveum Trace offer?",  # Feature overview
            "How do I set up observability with Noveum?",  # Setup guidance
            "What APIs are available in Noveum platform?",  # API documentation
            "How does Noveum handle agent tracing?",  # Technical details
            "What monitoring capabilities does Noveum provide?",  # Capabilities
            "How do I configure Noveum for my system?",  # Configuration
            "What are the benefits of using Noveum Trace?",  # Value proposition

            # Web Search Queries (External/Recent information)
            "What are the latest AI news today?",  # Recent news
            "What's the weather like today?",  # Current weather
            "Tell me about recent developments in machine learning",  # Recent developments
            "What are the current trends in observability tools?",  # Industry trends
            "What happened in tech news this week?",  # Weekly tech news
            "What are the latest updates in Python programming?",  # Recent updates
            "What's the current status of cryptocurrency markets?",  # Market information
            "What are the newest features in cloud computing?",  # Recent features
            "What's happening in the software development world today?",  # Current events
            "What are the latest breakthroughs in artificial intelligence?"  # Recent breakthroughs
        ]
    else:
        demo_queries = list(queries)

    print(f"\n2️⃣ Running {len(demo_queries)} demo queries...")

    last_index = len(demo_queries)
    for i, query in enumerate(demo_queries, 1):
        print(f"\n--- Demo Query {i} ---")
        response = noveum_agent.process_query(query)
        noveum_agent.display_response(response)

        # Small delay between queries (nothing to wait for after the last one)
        if delay_seconds > 0 and i < last_index:
            time.sleep(delay_seconds)

    print("\n🎉 Demo completed! Check Noveum Trace dashboard for detailed observability data.")
    print("💡 You can now use noveum_agent.process_query('your question') for your own queries!")
# Interactive query function
def ask_question(question: str):
    """Ask the notebook-global agent one question and print the answer.

    The agent is initialized on demand. Returns the response dict from
    ``process_query``, or ``None`` when on-demand initialization fails.
    """
    agent = noveum_agent

    if not agent.is_initialized:
        print("⚠️ System not initialized. Initializing now...")
        initialized = agent.initialize_system()
        if not initialized:
            print("❌ Failed to initialize system")
            return None

    answer = agent.process_query(question)
    agent.display_response(answer)
    return answer
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9fe279c", + "metadata": {}, + "outputs": [], + "source": [ + "!python NovaEval/noveum_customer_support_bt/traces/combine_spans_api_compat.py" + ] + }, + { + "cell_type": "markdown", + "id": "0bddd552", + "metadata": {}, + "source": [ + "## Data Filtration and mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b68d30e", + "metadata": {}, + "outputs": [], + "source": [ + "!python preprocess_map.py ./traces/dataset_filtered.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e242af4", + "metadata": {}, + "outputs": [], + "source": [ + "!python preprocess_map.py NovaEval/noveum_customer_support_bt/traces/traces/dataset_filtered.json\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "008fd0e1", + "metadata": {}, + "source": [ + "## Running eval on the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "770777e3", + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Setup\n", + "!source .venv/bin/activate\n", + "!cd noveum_customer_support_bt\n", + "\n", + "# 2. Create Dataset\n", + "!python create_dataset.py --dataset-type agent --description \"Customer Support Agent Evaluation Dataset\" --pretty\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9358c590", + "metadata": {}, + "outputs": [], + "source": [ + "# 3. 
# --- Cell: Getting scores ---------------------------------------------------
import os

from demo_utils import run_complete_agent_evaluation

# Process all JSON files in split_datasets directory.
# sorted(): os.listdir() returns entries in arbitrary order; sorting keeps
# the evaluation order reproducible between runs.
for file_name in sorted(os.listdir('split_datasets')):
    if not file_name.endswith('.json'):
        continue
    print(f'Processing {file_name}...')
    run_complete_agent_evaluation(
        f'split_datasets/{file_name}',
        sample_size=25,
        # splitext drops only the trailing extension, not interior ".json" text
        evaluation_name=os.path.splitext(file_name)[0],
        output_dir='./demo_results'
    )
    print(f'Completed {file_name}\n')


# --- Cell: Build api_data.json from the evaluation results -------------------
import json

import pandas as pd

# Read the CSV file
df = pd.read_csv('demo_results/agent.query_routing_dataset/agent_evaluation_results.csv')

# Create API data structure with all task_ids.
# The column is read directly (not via iterrows) so ids keep their own dtype,
# and enumerate() numbers the items 1..N regardless of the DataFrame index.
api_data = {
    'items': [
        {
            'item_key': item_key,
            'item_id': f'item_{position}'  # Generate unique item IDs
        }
        for position, item_key in enumerate(df['task_id'].astype(str), start=1)
    ]
}

# Save to JSON
with open('api_data.json', 'w', encoding='utf-8') as f:
    json.dump(api_data, f, indent=2)

print('Created api_data.json with', len(api_data['items']), 'items')
print('Sample items:')
for item in api_data['items'][:3]:
    print(f'  {item}')
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "529a4eed", + "metadata": {}, + "outputs": [], + "source": [ + "!python upload_scores.py demo_results/agent.query_routing_dataset/agent_evaluation_results.csv --item-key-col task_id --score-col context_relevancy --reasoning-col context_relevancy_reasoning --api-data api_data.json --scorer-id context_relevancy_scorer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c67e5df7", + "metadata": {}, + "outputs": [], + "source": [ + "!python upload_scores.py demo_results/agent.query_routing_dataset/agent_evaluation_results.csv --item-key-col task_id --score-col role_adherence --reasoning-col role_adherence_reasoning --api-data api_data.json --scorer-id role_adherence_scorer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d84c0615", + "metadata": {}, + "outputs": [], + "source": [ + "!python upload_scores.py demo_results/agent.query_routing_dataset/agent_evaluation_results.csv --item-key-col task_id --score-col parameter_correctness --reasoning-col parameter_correctness_reasoning --api-data api_data.json --scorer-id parameter_correctness_scorer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83466df7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/noveum_customer_support_bt/fetch_dataset_items.py b/noveum_customer_support_bt/fetch_dataset_items.py new file mode 100644 index 0000000..4a5c758 --- /dev/null +++ b/noveum_customer_support_bt/fetch_dataset_items.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" 
"""Fetch dataset items from the Noveum API and write them to api_data.json.

The resulting file is needed by the upload_scores.py script.
"""

import json
import os
import sys
from typing import Any, Dict, Optional

import requests
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get API credentials from environment
api_key = os.getenv('NOVEUM_API_KEY')
org_slug = os.getenv('NOVEUM_ORG_SLUG')
dataset_slug = os.getenv('NOVEUM_DATASET_SLUG')
latest_version = os.getenv('LATEST_VERSION')
beta_env = os.getenv('BETA', 'false').strip().lower() == 'true'

# File written by main() and read by upload_scores.py
OUTPUT_FILE = 'api_data.json'


def fetch_dataset_items() -> Optional[Dict[str, Any]]:
    """Fetch dataset items from Noveum API.

    The endpoint shape depends on the BETA environment variable. Query
    parameters are passed through ``params`` so requests encodes them.

    Returns:
        The decoded JSON object, or ``None`` when the request fails, the body
        is not valid JSON, or the payload is not a JSON object.
    """
    # Construct API URL based on BETA environment variable
    if beta_env:
        api_url = f"https://noveum.ai/api/v1/datasets/{dataset_slug}/items"
        params = {'organizationSlug': org_slug, 'version': latest_version}
    else:
        api_url = f"https://noveum.ai/api/v1/organizations/{org_slug}/datasets/{dataset_slug}/items"
        params = {'version': latest_version}

    # Prepare headers
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Cookie': f'apiKeyCookie={api_key}'
    }

    print(f"Fetching dataset items from: {api_url}")
    print(f"Organization: {org_slug}")
    print(f"Dataset: {dataset_slug}")
    print(f"Version: {latest_version}")

    try:
        response = requests.get(api_url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching dataset items: {e}")
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f"Response status: {error_response.status_code}")
            print(f"Response text: {error_response.text}")
        return None
    except ValueError as e:
        # Body was not valid JSON (raised as a plain ValueError subclass by
        # some requests versions).
        print(f"Error decoding dataset items response: {e}")
        return None

    if not isinstance(data, dict):
        # Callers index the result with .get('items'); anything else is unusable.
        print(f"Error: unexpected response payload of type {type(data).__name__}")
        return None

    print(f"Successfully fetched {len(data.get('items', []))} items")
    print(f"Response status: {response.status_code}")

    return data


def main() -> int:
    """Validate the environment, fetch the items and save them to disk.

    Returns:
        Process exit code: 0 on success, 1 on any failure.
    """
    # Validate environment variables
    required_vars = {
        'NOVEUM_API_KEY': api_key,
        'NOVEUM_ORG_SLUG': org_slug,
        'NOVEUM_DATASET_SLUG': dataset_slug,
        'LATEST_VERSION': latest_version
    }

    missing_vars = [var for var, value in required_vars.items() if not value]

    if missing_vars:
        print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
        return 1

    # Fetch dataset items
    data = fetch_dataset_items()

    if data is None:
        return 1

    # Save response to api_data.json
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
    except OSError as e:
        print(f"Error saving dataset items: {e}")
        return 1

    print(f"\nDataset items saved to: {OUTPUT_FILE}")
    print(f"Total items: {len(data.get('items', []))}")
    return 0


if __name__ == "__main__":
    # sys.exit rather than the interactive-only exit() helper from `site`.
    sys.exit(main())
**Analyze results** and export data\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scorers Used\n", + "\n", + "**context_relevancy_scorer** - Evaluates whether the agent response is appropriate and relevant given the agent's task and role.\n", + "\n", + "**role_adherence_scorer** - Scores whether the agent's tool calls and response adhere to its assigned role and task.\n", + "\n", + "**task_progression_scorer** - Measures whether the agent has made meaningful progress on the assigned task.\n", + "\n", + "**tool_relevancy_scorer** - Assesses how relevant and appropriate the tool call is given the available tools and the agent's context.\n", + "\n", + "**tool_correctness_scorer** - Compares actual tool calls against expected tool calls to evaluate correctness of tool usage and parameters.\n", + "\n", + "**parameter_correctness_scorer** - Validates whether correct parameters were passed to tool calls by analyzing the tool results." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Dependencies and Utility Functions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import our custom utility functions\n", + "from demo_utils import (\n", + " list_dataset_files,\n", + " load_and_analyze_dataset,\n", + " convert_spans_to_agent_dataset,\n", + " analyze_dataset_statistics,\n", + " setup_gemini_model,\n", + " setup_agent_evaluator,\n", + " run_evaluation,\n", + " analyze_agent_behavior_patterns,\n", + " export_processed_dataset,\n", + " setup_logging,\n", + " validate_environment,\n", + " print_demo_summary\n", + ")\n", + "\n", + "print(\"βœ… All utility functions imported successfully!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "!python preprocess_filter.py ./traces/traces/dataset.json\n", + "!python preprocess_map.py ./traces/dataset_filtered.json\n", + "!python preprocess_split_data.py ./traces/dataset_filtered_mapped.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Force reload the demo_utils module to get the latest changes\n", + "import importlib\n", + "import sys\n", + "\n", + "# Remove the module from cache if it exists\n", + "if 'demo_utils' in sys.modules:\n", + " del sys.modules['demo_utils']\n", + "\n", + "# Import the updated module\n", + "from demo_utils import run_complete_agent_evaluation\n", + "\n", + "print(\"βœ… Module reloaded successfully!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from demo_utils import run_complete_agent_evaluation\n", + "run_complete_agent_evaluation('./split_datasets/agent.rag_evaluation_metrics_dataset.json',\n", + "evaluation_name = \"agent.rag_evaluation_metrics_dataset\", output_dir = \"./demo_results\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of poor scores in comment generation agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "comment_gen = pd.read_csv(\"demo_results/agent.rag_evaluation_metrics_dataset/agent_evaluation_results.csv\")\n", + "\n", + "split_size = 3\n", + "\n", + "task_progression = comment_gen.sort_values(by = 'task_progression', ascending= True).iloc[:split_size][['task_progression', 'task_progression_reasoning']]\n", + "\n", + "print(\"Task Progression:\")\n", + "print()\n", + "for idx, row in task_progression.iterrows():\n", + " print(f\"Score = {row['task_progression']}\")\n", + " print(f\"Reasoning = {row['task_progression_reasoning']}\")\n", + " print() # blank line" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Context Relevancy Analysis\n", + "context_relevancy = comment_gen.sort_values(by='context_relevancy', ascending=True).iloc[:3][['context_relevancy', 'context_relevancy_reasoning']]\n", + "\n", + "print(\"Context Relevancy Analysis:\")\n", + "print(\"=\" * 50)\n", + "for idx, row in context_relevancy.iterrows():\n", + " print(f\"Score = {row['context_relevancy']}\")\n", + " print(f\"Reasoning = {row['context_relevancy_reasoning']}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Role Adherence Analysis\n", + "role_adherence = comment_gen.sort_values(by='role_adherence', ascending=True).iloc[:3][['role_adherence', 'role_adherence_reasoning']]\n", + "\n", + "print(\"Role Adherence Analysis:\")\n", + "print(\"=\" * 50)\n", + "for idx, row in role_adherence.iterrows():\n", + " print(f\"Score = {row['role_adherence']}\")\n", + " print(f\"Reasoning = {row['role_adherence_reasoning']}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/noveum_customer_support_bt/fix_api_data_v2.py b/noveum_customer_support_bt/fix_api_data_v2.py new file mode 100644 index 0000000..cfc4f5b --- /dev/null +++ b/noveum_customer_support_bt/fix_api_data_v2.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +""" +Script to fix api_data.json by adding item_key field from turn_id found anywhere in the item. 
+""" + +import json + +def fix_api_data(): + """Fix api_data.json by adding item_key field""" + + # Load the api_data.json file + with open('api_data.json', 'r') as f: + data = json.load(f) + + # Process each item to add item_key + items = data.get('items', []) + fixed_items = [] + + for item in items: + # Create a copy of the item + fixed_item = item.copy() + + # Look for turn_id in any field + turn_id = None + + # Check if turn_id is directly in the item + if 'turn_id' in item: + turn_id = item['turn_id'] + else: + # Search through all string values for turn_id pattern + for key, value in item.items(): + if isinstance(value, str) and 'turn_id' in value: + try: + # Try to parse as JSON and extract turn_id + parsed = json.loads(value) + if isinstance(parsed, dict) and 'turn_id' in parsed: + turn_id = parsed['turn_id'] + break + except: + pass + + # Add item_key field + fixed_item['item_key'] = turn_id or '' + fixed_items.append(fixed_item) + + # Update the data + data['items'] = fixed_items + + # Save the fixed file + with open('api_data.json', 'w') as f: + json.dump(data, f, indent=2) + + print(f"Fixed api_data.json with {len(fixed_items)} items") + print(f"Added item_key field for each item using turn_id found in the data") + + # Show a sample of the fixed data + if fixed_items: + print(f"\nSample item_key: {fixed_items[0].get('item_key', 'NOT_FOUND')}") + +if __name__ == "__main__": + fix_api_data() diff --git a/noveum_customer_support_bt/novapilot_utils.py b/noveum_customer_support_bt/novapilot_utils.py new file mode 100644 index 0000000..f1d3d3a --- /dev/null +++ b/noveum_customer_support_bt/novapilot_utils.py @@ -0,0 +1,561 @@ +""" +NovaPilot Utilities - Agent Analysis and Evaluation Tools + +This module provides utilities for analyzing agent performance using Gemini AI +and generating comprehensive reports from evaluation data. 
+""" + +import os +import pandas as pd +import google.generativeai as genai +from dotenv import load_dotenv +from datetime import datetime +import json +from pathlib import Path +from typing import Dict, List, Optional, Tuple + + +# Prompt templates +SCORER_ANALYSIS_PROMPT = """ A scorer named {scorer_name} is run on different runs of the same agent/llm/tool, +the scorer gives a score and a reasoning, you will be given 25 such samples. Now the scorer may be giving different +reasonings and all of them are in natural language. I want you to highlight the key reasons for your response. +These key reasons will further be used for analysis, and improvement of the agent. Do not suggest any fixes. +Just focus on not missing out on any of the information regarding why the agent is failing. +You have to focus on the low scores only, as we have to improve them. +Some rows might show - "Missing required fields" it is a code issue, on the developer's side, so do not include it in the reasoning. + +Just to clarify, your job is not to analyze the scorers, but to analyze the agent. You are basically the representative of the scorers. + +Give the reasoning, and start with the scorer name. + +In the format - +Scorer Name: Task Progression +Reasoning: + +""" + +AGENTWISE_SUMMARY_PROMPT = """ +Different scorers are run on different runs of the same agent/llm/tool, you will be given the reasoning +for each scorer, as to why it gave poor scores. You job is to summarize the information from different +scorers into a single analysis. All the scores are of one specific part of the entire agentic workflow, +so please remove the redundancies that you get. Do not try to suggest fixes, only focus on removing +the redundant information, and keeping the important information. + +Just to clarify, your job is not to analyze the summaries, but to analyze the agent. You are basically the representative of the scorers. 
+ +In the format - +Agent Name: query_generation +Reasoning: + +""" + +FINAL_ANALYSIS_PROMPT = """ +An agent is run, and then different scorers are run on specific parts of the agentic workflow. +So if an agent has 5 different parts (llm/tool/agent), and there are 3 scorers, then there will be a total of 15 scores, and respective reasonings. I have condensed these reasonings, in a part wise manner. +You will be given the part wise analysis, and you will also be given the entire agentic workflow, explaining how the agent is set up. + +You have to figure out why the agent is failing, you are given a bird's eye view, as in agents, a failure at step 1, may surface at step 3 in the analysis, so you will have to be aware of that. + +You have to suggest fixes to the developer in bullet points, in the format -> + +Suggested Fixes: + - fix_1: + - fix_2: + +""" + + +class NovaPilotAnalyzer: + """ + A class for analyzing agent performance using Gemini AI and generating comprehensive reports. + """ + + def __init__(self, api_key: Optional[str] = None, model_name: str = 'gemini-2.5-pro'): + """ + Initialize the NovaPilot Analyzer. + + Args: + api_key: Gemini API key. If None, will try to load from environment. + model_name: Name of the Gemini model to use. + """ + self._setup_gemini(api_key, model_name) + self.log_file: Optional[Path] = None + self.reddit_agent_doc: Optional[str] = None + + def _setup_gemini(self, api_key: Optional[str], model_name: str) -> None: + """Setup Gemini API configuration.""" + if api_key is None: + load_dotenv() + api_key = os.getenv('GEMINI_API_KEY') + + if not api_key: + raise ValueError("GEMINI_API_KEY not found in environment variables") + + genai.configure(api_key=api_key) + self.model = genai.GenerativeModel(model_name) + + def setup_logging(self, log_dir: str = 'log') -> Path: + """ + Setup logging directory and create timestamped log file. + + Args: + log_dir: Directory to store log files. + + Returns: + Path to the created log file. 
+ """ + log_path = Path(log_dir) + log_path.mkdir(exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self.log_file = log_path / f"analysis_log_{timestamp}.txt" + + return self.log_file + + def load_agent_documentation(self, doc_path: str) -> str: + """ + Load agent documentation from a markdown file. + + Args: + doc_path: Path to the agent documentation file. + + Returns: + Content of the documentation file. + """ + with open(doc_path, 'r', encoding='utf-8') as f: + self.reddit_agent_doc = f.read() + return self.reddit_agent_doc + + def log_response(self, response: str, description: str) -> None: + """ + Log a response to the log file. + + Args: + response: The response text to log. + description: Description of what the response contains. + """ + if not self.log_file: + raise ValueError("Logging not setup. Call setup_logging() first.") + + with open(self.log_file, 'a', encoding='utf-8') as f: + f.write(f"\n{'='*50}\n") + f.write(f"TIMESTAMP: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write(f"DESCRIPTION: {description}\n") + f.write(f"{'='*50}\n") + f.write(f"{response}\n") + f.write(f"{'='*50}\n\n") + + def process_csv_file(self, csv_path: str, max_rows: int = 25) -> Dict[str, str]: + """ + Process CSV file and extract scorer data for specified number of rows. + + Args: + csv_path: Path to the CSV file. + max_rows: Maximum number of rows to process. + + Returns: + Dictionary with scorer data strings. 
+ """ + df = pd.read_csv(csv_path) + + # Get first max_rows rows (or all if less than max_rows) + rows_to_process = min(max_rows, len(df)) + df_subset = df.head(rows_to_process) + + # Get column names + columns = df.columns.tolist() + + # Skip first 4 columns (IDs) + # The remaining columns are split into score columns and reasoning columns + remaining_columns = columns[4:] + n_scorers = len(remaining_columns) // 2 # Each scorer has score + reasoning column + + score_columns = remaining_columns[:n_scorers] # First half are score columns + reasoning_columns = remaining_columns[n_scorers:] # Second half are reasoning columns + + scorer_data = {} + + # For each scorer + for i, scorer_name in enumerate(score_columns): + scorer_strings = [] + reasoning_col = reasoning_columns[i] + + for _, row in df_subset.iterrows(): + score = row[scorer_name] + reasoning = row[reasoning_col] + scorer_string = f"{scorer_name} score = {score} reasoning = {reasoning}" + scorer_strings.append(scorer_string) + + scorer_data[scorer_name] = "\n".join(scorer_strings) + + return scorer_data + + def analyze_scorer(self, scorer_name: str, scorer_data: str, dataset_name: str) -> str: + """ + Analyze a single scorer's data using Gemini AI. + + Args: + scorer_name: Name of the scorer. + scorer_data: Data string for the scorer. + dataset_name: Name of the dataset being analyzed. + + Returns: + Analysis response from Gemini AI. 
+ """ + prompt = f"""{SCORER_ANALYSIS_PROMPT} + + Scorer Data: + Scorer Name: {scorer_name} + {scorer_data} + """ + + try: + response = self.model.generate_content(prompt) + response_text = response.text + response_text = "Analysis for scorer: " + scorer_name + "\n" + response_text + + # Log the response + self.log_response(response_text, f"{dataset_name} - {scorer_name} Analysis") + + return f"Scorer: {scorer_name}\n{response_text}" + + except Exception as e: + error_msg = f"Error processing {scorer_name}: {str(e)}" + self.log_response(error_msg, f"{dataset_name} - {scorer_name} Error") + raise e + + def create_dataset_summary(self, dataset_name: str, scorer_analyses: List[str]) -> str: + """ + Create a summary for a dataset based on scorer analyses. + + Args: + dataset_name: Name of the dataset. + scorer_analyses: List of scorer analysis strings. + + Returns: + Summary response from Gemini AI. + """ + combined_responses = "\n\n".join(scorer_analyses) + + summary_prompt = f"""{AGENTWISE_SUMMARY_PROMPT} + + Agent Name: {dataset_name} + Scorer Analyses: + {combined_responses} + """ + + try: + summary_response = self.model.generate_content(summary_prompt) + summary_text = summary_response.text + + # Log the summary + self.log_response(summary_text, f"{dataset_name} - Dataset Summary") + + return f"Dataset: {dataset_name}\n{summary_text}" + + except Exception as e: + error_msg = f"Error creating summary for {dataset_name}: {str(e)}" + self.log_response(error_msg, f"{dataset_name} - Summary Error") + raise e + + def process_dataset_directory(self, dataset_dir: Path, verbose: bool = True) -> Optional[str]: + """ + Process a single dataset directory. + + Args: + dataset_dir: Path to the dataset directory. + verbose: Whether to print progress messages. + + Returns: + Dataset summary string if successful, None if no CSV files found. 
+ """ + if verbose: + print(f"\nProcessing {dataset_dir.name}...") + + # Find CSV file in the directory + csv_files = list(dataset_dir.glob('*.csv')) + if not csv_files: + if verbose: + print(f" No CSV files found in {dataset_dir.name}, skipping...") + return None + + csv_file = csv_files[0] # Take the first CSV file + if verbose: + print(f" Processing CSV: {csv_file.name}") + + # Process the CSV file + scorer_data = self.process_csv_file(csv_file) + + # Store responses for this dataset + dataset_responses = [] + + # For each scorer, make a Gemini API call + for scorer_name, scorer_string in scorer_data.items(): + if verbose: + print(f" Making Gemini call for scorer: {scorer_name}") + + try: + analysis = self.analyze_scorer(scorer_name, scorer_string, dataset_dir.name) + dataset_responses.append(analysis) + except Exception as e: + if verbose: + print(f" Error processing {scorer_name}: {str(e)}") + continue + + # Make summary call for this dataset + if verbose: + print(f" Making summary call for {dataset_dir.name}") + + try: + summary = self.create_dataset_summary(dataset_dir.name, dataset_responses) + return summary + except Exception as e: + if verbose: + print(f" Error creating summary: {str(e)}") + return None + + def process_all_datasets(self, demo_results_dir: str = 'demo_results', verbose: bool = True) -> List[str]: + """ + Process all dataset directories in the demo results folder. + + Args: + demo_results_dir: Path to the demo results directory. + verbose: Whether to print progress messages. + + Returns: + List of dataset summary strings. 
+ """ + demo_results_path = Path(demo_results_dir) + all_summaries = [] + + # Get all dataset directories + dataset_dirs = [d for d in demo_results_path.iterdir() if d.is_dir()] + + if verbose: + print(f"Found {len(dataset_dirs)} dataset directories to process:") + for d in dataset_dirs: + print(f" - {d.name}") + + # Process each dataset directory + for dataset_dir in dataset_dirs: + summary = self.process_dataset_directory(dataset_dir, verbose) + if summary: + all_summaries.append(summary) + + if verbose: + print(f"\nCompleted processing {len(dataset_dirs)} datasets.") + + return all_summaries + + def create_final_analysis(self, dataset_summaries: List[str], agent_doc: Optional[str] = None) -> str: + """ + Create final comprehensive analysis from all dataset summaries. + + Args: + dataset_summaries: List of dataset summary strings. + agent_doc: Agent documentation string. If None, uses loaded documentation. + + Returns: + Final analysis response from Gemini AI. + """ + if agent_doc is None: + agent_doc = self.reddit_agent_doc + + if not agent_doc: + raise ValueError("No agent documentation provided. 
Load it first or pass as parameter.") + + # Combine all dataset summaries + combined_summaries = "\n\n".join(dataset_summaries) + + final_prompt = f"""{FINAL_ANALYSIS_PROMPT} + + Reddit Agent Documentation: + {agent_doc} + + Dataset Summaries: + {combined_summaries} + """ + + try: + final_response = self.model.generate_content(final_prompt) + final_text = final_response.text + + # Log the final analysis + self.log_response(final_text, "Final Comprehensive Analysis") + + return final_text + + except Exception as e: + error_msg = f"Error creating final analysis: {str(e)}" + self.log_response(error_msg, "Final Analysis Error") + raise e + + def run_complete_analysis(self, + demo_results_dir: str = 'demo_results', + agent_doc_path: Optional[str] = None, + log_dir: str = 'log', + verbose: bool = True) -> Tuple[str, List[str], Path]: + """ + Run the complete analysis pipeline. + + Args: + demo_results_dir: Path to the demo results directory. + agent_doc_path: Path to agent documentation file. + log_dir: Directory to store log files. + verbose: Whether to print progress messages. + + Returns: + Tuple of (final_analysis, dataset_summaries, log_file_path). 
+ """ + # Setup logging + log_file = self.setup_logging(log_dir) + + # Load agent documentation if provided + if agent_doc_path: + self.load_agent_documentation(agent_doc_path) + + # Process all datasets + dataset_summaries = self.process_all_datasets(demo_results_dir, verbose) + + # Create final analysis + if verbose: + print("\nMaking final comprehensive analysis call...") + + final_analysis = self.create_final_analysis(dataset_summaries) + + if verbose: + print("Final analysis completed and logged!") + print(f"All responses have been logged to: {log_file}") + print("\n" + "="*50) + print("ANALYSIS COMPLETE!") + print("="*50) + print(f"Log file location: {log_file}") + print(f"Total datasets processed: {len(dataset_summaries)}") + print(f"Total summaries generated: {len(dataset_summaries)}") + print("="*50) + + return final_analysis, dataset_summaries, log_file + + +# Convenience functions for backward compatibility +def setup_gemini_analyzer(api_key: Optional[str] = None, model_name: str = 'gemini-2.5-pro') -> NovaPilotAnalyzer: + """ + Create and setup a NovaPilot Analyzer instance. + + Args: + api_key: Gemini API key. If None, will try to load from environment. + model_name: Name of the Gemini model to use. + + Returns: + Configured NovaPilotAnalyzer instance. + """ + return NovaPilotAnalyzer(api_key, model_name) + + +def run_agent_analysis(demo_results_dir: str = 'demo_results', + agent_doc_path: Optional[str] = None, + log_dir: str = 'log', + api_key: Optional[str] = None, + model_name: str = 'gemini-2.5-pro', + verbose: bool = True) -> Tuple[str, List[str], Path]: + """ + Run complete agent analysis pipeline. + + Args: + demo_results_dir: Path to the demo results directory. + agent_doc_path: Path to agent documentation file. + log_dir: Directory to store log files. + api_key: Gemini API key. If None, will try to load from environment. + model_name: Name of the Gemini model to use. + verbose: Whether to print progress messages. 
+ + Returns: + Tuple of (final_analysis, dataset_summaries, log_file_path). + """ + analyzer = setup_gemini_analyzer(api_key, model_name) + return analyzer.run_complete_analysis(demo_results_dir, agent_doc_path, log_dir, verbose) + + +def recommend_improvements(demo_results_dir: str = 'demo_results', + agent_doc_path: str = 'reddit_agent.md', + log_dir: str = 'log', + api_key: Optional[str] = None, + model_name: str = 'gemini-2.5-pro', + verbose: bool = True) -> Tuple[str, List[str], Path]: + """ + Single function to run the complete agent analysis pipeline. + This is equivalent to running the entire complete_analysis_demo.ipynb notebook. + + This function: + 1. Sets up Gemini API and logging + 2. Loads agent documentation + 3. Processes all dataset directories + 4. Analyzes each scorer's data + 5. Creates dataset summaries + 6. Generates final comprehensive analysis with improvement recommendations + + Args: + demo_results_dir: Path to the demo results directory containing dataset folders. + agent_doc_path: Path to the agent documentation markdown file. + log_dir: Directory to store analysis log files. + api_key: Gemini API key. If None, will try to load from environment. + model_name: Name of the Gemini model to use. + verbose: Whether to print progress messages. + + Returns: + Tuple of (final_analysis, dataset_summaries, log_file_path). 
+ - final_analysis: Complete analysis with suggested fixes + - dataset_summaries: List of summaries for each dataset + - log_file_path: Path to the generated log file + + Example: + >>> final_analysis, summaries, log_file = recommend_improvements("demo_results") + >>> print(final_analysis) # Shows suggested fixes for the agent + """ + if verbose: + print("="*60) + print("NOVAPILOT AGENT ANALYSIS - RECOMMEND IMPROVEMENTS") + print("="*60) + print("This function runs the complete analysis pipeline equivalent to") + print("running the entire complete_analysis_demo.ipynb notebook.") + print("="*60) + + # Initialize analyzer + analyzer = NovaPilotAnalyzer(api_key, model_name) + + # Setup logging + log_file = analyzer.setup_logging(log_dir) + if verbose: + print(f"Setup complete! Log file: {log_file}") + + # Load agent documentation + if os.path.exists(agent_doc_path): + agent_doc = analyzer.load_agent_documentation(agent_doc_path) + if verbose: + print(f"Agent document loaded: {len(agent_doc)} characters") + else: + if verbose: + print(f"Warning: Agent document not found at {agent_doc_path}") + agent_doc = None + + # Process all datasets + dataset_summaries = analyzer.process_all_datasets(demo_results_dir, verbose) + + # Create final comprehensive analysis + if verbose: + print("\nMaking final comprehensive analysis call...") + + final_analysis = analyzer.create_final_analysis(dataset_summaries, agent_doc) + + if verbose: + print("Final analysis completed and logged!") + print(f"All responses have been logged to: {log_file}") + print("\n" + "="*50) + print("ANALYSIS COMPLETE!") + print("="*50) + print(f"Log file location: {log_file}") + print(f"Total datasets processed: {len(dataset_summaries)}") + print(f"Total summaries generated: {len(dataset_summaries)}") + print("="*50) + + return final_analysis, dataset_summaries, log_file diff --git a/noveum_customer_support_bt/noveum_agent_requirements.txt b/noveum_customer_support_bt/noveum_agent_requirements.txt new file mode 
100644 index 0000000..a72a6b8 --- /dev/null +++ b/noveum_customer_support_bt/noveum_agent_requirements.txt @@ -0,0 +1,33 @@ +# Noveum AI Agent with RAG + Web Search - Additional Requirements +# Core dependencies (already in main requirements.txt) +requests==2.32.3 +beautifulsoup4==4.12.3 +python-dotenv==1.0.1 + +# LangChain ecosystem +langchain==0.3.26 +langchain-community==0.3.18 +langchain-core==0.3.66 +langchain-openai==0.3.25 + +# Web scraping and text extraction +trafilatura>=1.6.4 +lxml>=5.3.0 + +# Vector search and embeddings +faiss-cpu==1.12.0 +# Alternative: faiss-gpu==1.12.0 # Use this if you have CUDA support + +# Web search +duckduckgo-search>=6.1.12 + +# Data processing +pandas>=2.2.3 +tqdm>=4.67.1 + +# Noveum Trace +noveum_trace>=0.3.5 + +# Jupyter notebook support +jupyter==1.0.0 +ipykernel==6.29.4 diff --git a/noveum_customer_support_bt/noveum_docs.json b/noveum_customer_support_bt/noveum_docs.json new file mode 100644 index 0000000..01778a8 --- /dev/null +++ b/noveum_customer_support_bt/noveum_docs.json @@ -0,0 +1,378 @@ +[ + { + "url": "https://noveum.ai", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "Monitor all your AI Agents\nimprove AI Agents today\nNoveum.ai helps you monitor, trace, and optimize your AI applications.\nNoveum.ai works with any AI framework – LangChain, CrewAI, AutoGen, custom implementations, or direct LLM calls. One dashboard shows everything.\nMonitor, Evaluate, Improve Your AI Agents\nThe control plane for AI agents.\nMonitor Everything, Miss Nothing\nOur lightweight SDKs capture every trace and span across your AI agent ecosystemβ€”from simple LLM calls to complex multi-agent workflows. Get complete visibility without performance overhead.\nStart MonitoringEvaluate with 30+ Advanced Metrics\nNovaEval automatically scores every agent interaction using our comprehensive evaluation framework. 
Track accuracy, semantic similarity, safety, bias, and custom business metrics in real-time.\nView EvaluationsImprove Automatically with NovaPilot\nOur AI engineer analyzes performance data and automatically generates fixes for failing agents. Get detailed reports on model changes, prompt optimizations, and tool improvements—all without human intervention.\nTry Auto-ImprovementEnterprise Ready\nNoveum.ai is built for enterprise-scale AI applications, with support for multi-tenant, multi-region deployments and advanced security features.\nContact Saleswith the world's favorite AI Observability Platform\nEverything You Need to Master AI Agent Operations\nNoveum.ai helps you monitor, trace, and optimize your AI applications with comprehensive observability tools designed for modern LLM workflows.\nSee Every Agent, Every Interaction, Every Decision\n30+ Metrics That Actually Matter for Business\nYour AI Engineer That Never Sleeps\n100% visibility on every AI agent\nReduce AI Incidents by 85%\nGet comprehensive AI monitoring with automated incident prevention, faster debugging, and built-in compliance tools.\nInstead of spending days investigating AI agent failures, your team gets instant insights into what went wrong and how to fix it. 
Detailed traces and automated analysis eliminate guesswork.\n0+\nAI FrameworksWith the world's favorite AI observability platform\nEasy integration with your AI stack\nNoveum.ai integrates seamlessly with all popular AI frameworks and providers, giving you comprehensive observability across your entire AI pipeline.\nWorks great with: LangChain, OpenAI, Anthropic, AWS Bedrock, Azure OpenAI, Google Cloud (Vertex AI), CrewAI, LangGraph, LlamaIndex, AutoGen, custom SDKs, and more\nwith the world's favorite AI observability platform\nTrusted AI monitoring tools by thousands of developers\n0+\nAI Eval Metrics0.0%\nuptime SLA0M+\ntraces processed", + "content_length": 2591, + "internal_links": [ + "https://noveum.ai/en", + "https://noveum.ai/en", + "https://noveum.ai/en/blog", + "https://noveum.ai/en/docs", + "https://noveum.ai/en/careers", + "https://noveum.ai/en/contact", + "https://noveum.ai/auth/login", + "https://noveum.ai/en", + "https://noveum.ai/en", + "https://noveum.ai/en/blog", + "https://noveum.ai/en/docs", + "https://noveum.ai/en/contact", + "https://noveum.ai/auth/login", + "https://noveum.ai/auth/login", + "https://noveum.ai/docs", + "https://noveum.ai/auth/login", + "https://noveum.ai/auth/login", + "https://noveum.ai/auth/login", + "https://noveum.ai/en", + "https://noveum.ai/en/blog", + "https://noveum.ai/en/docs", + "https://noveum.ai/en/contact", + "https://noveum.ai/en/pricing", + "https://noveum.ai/en/docs/getting-started/sdk-integration", + "https://noveum.ai/en/docs/getting-started/overview", + "https://noveum.ai/en/auth/login" + ], + "scraped_at": 1759935886.929589 + }, + { + "url": "https://noveum.ai/en", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "Monitor all your AI Agents\nimprove AI Agents today\nNoveum.ai helps you monitor, trace, and optimize your AI applications.\nNoveum.ai works with any AI framework – LangChain, CrewAI, AutoGen, custom implementations, or direct LLM calls. 
One dashboard shows everything.\nMonitor, Evaluate, Improve Your AI Agents\nThe control plane for AI agents.\nMonitor Everything, Miss Nothing\nOur lightweight SDKs capture every trace and span across your AI agent ecosystemβ€”from simple LLM calls to complex multi-agent workflows. Get complete visibility without performance overhead.\nStart MonitoringEvaluate with 30+ Advanced Metrics\nNovaEval automatically scores every agent interaction using our comprehensive evaluation framework. Track accuracy, semantic similarity, safety, bias, and custom business metrics in real-time.\nView EvaluationsImprove Automatically with NovaPilot\nOur AI engineer analyzes performance data and automatically generates fixes for failing agents. Get detailed reports on model changes, prompt optimizations, and tool improvementsβ€”all without human intervention.\nTry Auto-ImprovementEnterprise Ready\nNoveum.ai is built for enterprise-scale AI applications, with support for multi-tenant, multi-region deployments and advanced security features.\nContact Saleswith the world's favorite AI Observability Platform\nEverything You Need to Master AI Agent Operations\nNoveum.ai helps you monitor, trace, and optimize your AI applications with comprehensive observability tools designed for modern LLM workflows.\nSee Every Agent, Every Interaction, Every Decision\n30+ Metrics That Actually Matter for Business\nYour AI Engineer That Never Sleeps\n100% visibility on every AI agent\nReduce AI Incidents by 85%\nGet comprehensive AI monitoring with automated incident prevention, faster debugging, and built-in compliance tools.\nInstead of spending days investigating AI agent failures, your team gets instant insights into what went wrong and how to fix it. 
Detailed traces and automated analysis eliminate guesswork.\n0+\nAI FrameworksWith the world's favorite AI observability platform\nEasy integration with your AI stack\nNoveum.ai integrates seamlessly with all popular AI frameworks and providers, giving you comprehensive observability across your entire AI pipeline.\nWorks great with: LangChain, OpenAI, Anthropic, AWS Bedrock, Azure OpenAI, Google Cloud (Vertex AI), CrewAI, LangGraph, LlamaIndex, AutoGen, custom SDKs, and more\nwith the world's favorite AI observability platform\nTrusted AI monitoring tools by thousands of developers\n0+\nAI Eval Metrics0.0%\nuptime SLA0M+\ntraces processed", + "content_length": 2591, + "internal_links": [], + "scraped_at": 1759935887.816968 + }, + { + "url": "https://noveum.ai/en/blog", + "title": "Noveum.ai Blog | Noveum.ai", + "content": "Noveum.ai Blog\nRead the latest news & articles from Noveum.ai (prev MagicAPI Inc).\nLearn what evals for AI agents are, why they are essential for production AI, and how Noveum.ai makes running evaluations practical without slowing down your development roadmap.\nAditi Upaddhyay\n9/25/2025\nMMLU benchmark comparison of GPT-OSS (thinking modes), GPT-5, O3, and GPT-4o-mini focusing on accuracy, runtime efficiency, and practical model selection.\nShivam Gupta\n8/13/2025\nWe compared Azure o1-mini vs gpt-4o-mini on 1,000 MMLU math samples using NovaEval. 
Here’s how we tested, what worked, what didn’t, and when the 15Γ— cost premium makes sense.\nShashank Agarwal\n8/12/2025\nDiscover how Noveum.ai provides comprehensive tracing and observability for AI applications, from development debugging to production optimization.\nShashank Agarwal\n3/3/2025\nDiscover how Noveum.ai provides comprehensive tracing and observability for LLM applications, RAG systems, and multi-agent workflows with our powerful Python and TypeScript SDKs.\nShashank Agarwal\n3/2/2025", + "content_length": 1046, + "internal_links": [ + "https://noveum.ai/en/changelog", + "https://noveum.ai/en/blog/evals-for-ai-agents", + "https://noveum.ai/en/blog/evals-for-ai-agents", + "https://noveum.ai/en/blog/gpt-oss-vs-gpt-5-vs-gpt-4o-mini-mmlu-evaluation-report", + "https://noveum.ai/en/blog/gpt-oss-vs-gpt-5-vs-gpt-4o-mini-mmlu-evaluation-report", + "https://noveum.ai/en/blog/comprehensive-mmlu-evaluation-analysis-report", + "https://noveum.ai/en/blog/comprehensive-mmlu-evaluation-analysis-report", + "https://noveum.ai/en/blog/from-logs-to-intelligent-choices-inside-noveum-ais-evaluation-process", + "https://noveum.ai/en/blog/from-logs-to-intelligent-choices-inside-noveum-ais-evaluation-process", + "https://noveum.ai/en/blog/noveum-ai-your-one-stop-ai-evaluation-platform", + "https://noveum.ai/en/blog/noveum-ai-your-one-stop-ai-evaluation-platform", + "https://noveum.ai/en/legal/privacy-policy", + "https://noveum.ai/en/legal/terms" + ], + "scraped_at": 1759935888.8273768 + }, + { + "url": "https://noveum.ai/en/docs", + "title": "Welcome to Noveum.ai Docs | Documentation | Noveum.ai", + "content": "Welcome to Noveum.ai Docs\nComprehensive AI tracing and observability for LLM applications, RAG systems, and AI agents\nWelcome to the Noveum.ai documentation! 
Here you'll find everything you need to integrate, configure, and optimize your AI applications using our comprehensive tracing and observability platform.\nWe provide powerful SDKs for Python and TypeScript that enable you to trace LLM calls, RAG pipelines, and multi-agent workflows with minimal code changes. Our documentation is organized to help you get started quickly while diving into advanced observability patterns and best practices.\nπŸš€ What is Noveum.ai?\nNoveum.ai is a comprehensive AI tracing and observability platform designed specifically for modern AI applications. Unlike traditional monitoring tools, Noveum understands the unique challenges of LLM applications, RAG systems, and AI agents.\nCore Components\n-\n🐍 Python SDK (\nnoveum-trace\n)- Decorator-based tracing for LLM calls, agents, and RAG pipelines\n- Automatic instrumentation for popular AI frameworks\n- Context propagation across async operations\n-\nπŸ“˜ TypeScript SDK (\n@noveum/trace\n)- Framework integrations for Next.js, Express.js, and Hono\n- TypeScript-first design with full type safety\n- Universal compatibility (Node.js, Edge Runtime, browsers)\n-\nπŸ“Š Noveum Platform\n- Real-time dashboard for analyzing traces and performance\n- Advanced filtering and search capabilities\n- Cost analysis and optimization insights\n- Team collaboration and project management\nπŸƒ Quick Start\n1. Choose Your SDK\n2. 
View Your Data\nWithin minutes, you'll see comprehensive traces in the Noveum dashboard:\n- πŸ” Request/Response details and timing\n- πŸ’° Cost tracking across providers\n- πŸš€ Performance metrics (latency, throughput)\n- πŸ› Error analysis and debugging context\nπŸ“š Documentation Structure\nGetting Started\n-\nOverview Introduction to Noveum.ai's tracing and observability capabilities.\n-\nSDK Integration Step-by-step guide to integrate Python or TypeScript SDKs into your application.\n-\nTracing Concepts Understanding traces, spans, and observability best practices for AI applications.\n-\nFramework Integrations Specific guides for Next.js, Express.js, FastAPI, and other popular frameworks.\nAdvanced Usage\n-\nMulti-Agent Tracing Observe complex agent workflows and inter-agent communications.\n-\nRAG Pipeline Observability Monitor retrieval, generation, and context handling in RAG systems.\n-\nCustom Instrumentation Add custom spans and attributes for domain-specific observability.\n-\nPerformance Optimization Use tracing data to identify bottlenecks and optimize AI application performance.\nPlatform Features\n-\nDashboard Overview Navigate the Noveum platform and understand key metrics.\n-\nProjects & Environments Organize your applications and manage different deployment environments.\n-\nTeam Collaboration Share insights and collaborate on AI application observability.\n-\nAPI Reference Direct API access for custom integrations and advanced use cases.\n🎯 Use Cases\nLLM Application Monitoring\nTrack every LLM call across your application with automatic cost calculation, latency measurement, and error tracking.\nRAG System Observability\nMonitor the entire RAG pipeline from query understanding to document retrieval to answer generation.\nMulti-Agent Workflows\nObserve complex agent interactions, tool usage, and decision-making processes across distributed AI systems.\nPerformance Optimization\nIdentify slow operations, expensive API calls, and opportunities 
for caching or model optimization.\nπŸ”— SDK Resources\nPython SDK\n- πŸ“¦ PyPI Package: noveum-trace\n- πŸ™ GitHub Repository: Noveum/noveum-trace\n- πŸ“– API Documentation: Python SDK Docs\nTypeScript SDK\n- πŸ“¦ NPM Package: @noveum/trace\n- πŸ™ GitHub Repository: Noveum/noveum-trace-ts\n- πŸ“– API Documentation: TypeScript SDK Docs\nπŸ’‘ Key Benefits\n- πŸ”§ Minimal Setup: Start tracing with just a few lines of code\n- 🎯 AI-Native: Purpose-built for LLM, RAG, and agent observability\n- πŸš€ Production Ready: Battle-tested at scale with intelligent sampling\n- πŸ”’ Secure: End-to-end encryption with configurable data retention\n- 🌍 Universal: Works across frameworks, clouds, and deployment models\n🀝 Community & Support\n- πŸ’¬ Discord Community: Join our Discord\n- πŸ“§ Email Support: [email protected]\n- πŸ› Bug Reports: GitHub Issues\n- πŸ“– Knowledge Base: Help Center\nReady to get started? Head to our SDK Integration Guide to begin tracing your AI applications in under 5 minutes!\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 4652, + "internal_links": [ + "https://noveum.ai/en/docs/getting-started/tracing-concepts", + "https://noveum.ai/en/docs/getting-started/framework-integrations", + "https://noveum.ai/en/docs/advanced/multi-agent-tracing", + "https://noveum.ai/en/docs/advanced/rag-observability", + "https://noveum.ai/en/docs/advanced/custom-instrumentation", + "https://noveum.ai/en/docs/advanced/performance-optimization", + "https://noveum.ai/docs/getting-started/overview", + "https://noveum.ai/docs/getting-started/sdk-integration", + "https://noveum.ai/docs/getting-started/tracing-concepts", + "https://noveum.ai/docs/getting-started/framework-integrations", + "https://noveum.ai/docs/advanced/multi-agent-tracing", + "https://noveum.ai/docs/advanced/rag-observability", + "https://noveum.ai/docs/advanced/custom-instrumentation", + "https://noveum.ai/docs/advanced/performance-optimization", + "https://noveum.ai/docs/platform/dashboard", + "https://noveum.ai/docs/platform/projects", + "https://noveum.ai/docs/platform/teams", + "https://noveum.ai/docs/platform/api", + "https://noveum.ai/docs/getting-started/sdk-integration" + ], + "scraped_at": 1759935889.759006 + }, + { + "url": "https://noveum.ai/en/careers", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "Why Work at Noveum.ai?\nJoin a team that's building the future of AI while maintaining a culture of innovation, transparency, and mutual respect.\nAI Agents Economy Pioneer\nBe at the forefront of the revolutionary AI Agents Economy and shape the future of autonomous AI systems.\nCreator of API.market\nJoin the team behind API.market, already loved by thousands of developers worldwide.\n100% Remote\nWork from anywhere in the world. 
We believe in flexibility and work-life balance.\nIndustry Leading Compensation\nCompetitive salaries and equity packages that reflect your value and contribution.\nAI-First Culture\nWe encourage using AI tools like Cursor IDE and ChatGPT. We'll train you to use them effectively.\nNo Politics, No Toxicity\nA healthy work environment focused on building great products and supporting each other.\nOpen Positions\nJoin our growing team and help build the next generation of AI agents and developer tools.\nMonitor, evaluate, and improve AI agents in production. Build eval pipelines, debug agent workflows, and deploy/tune models alongside observability to drive reliability and quality.\nKey Requirements:\nOur Culture & Values\nWe're building more than just technologyβ€”we're creating a workplace where innovation thrives and everyone can do their best work.\nDo No Harm\nWe build technology that benefits humanity and take responsibility for our impact on the world.\nBe the Good You Want to See\nWe lead by example and create positive change in our industry and communities.\nOwnership\nWe take ownership of our work, decisions, and outcomes. Everyone is empowered to make a difference.\nTransparency\nWe believe in open communication, honest feedback, and sharing knowledge across the team.\nReady to Join Our Mission?\nWe're looking for passionate individuals who want to shape the future of AI agents and build technology that makes a positive impact.", + "content_length": 1864, + "internal_links": [ + "https://noveum.ai/en/careers/apply/senior-ai-engineer", + "https://noveum.ai/en/careers/apply/fullstack-developer-ai" + ], + "scraped_at": 1759935890.6604009 + }, + { + "url": "https://noveum.ai/en/contact", + "title": "Contact us | Noveum.ai", + "content": "Blog\nChangelog\nContact\nGitHub\nCareers\nDocs\nLogin\nContact us\nWe are here to help you. 
Please use the form below to get in touch with us.\nName\nEmail\nMessage\nSend message", + "content_length": 167, + "internal_links": [], + "scraped_at": 1759935891.625962 + }, + { + "url": "https://noveum.ai/auth/login", + "title": "Welcome back | Noveum.ai", + "content": "Welcome back\nPlease enter your credentials to sign in.\nPassword\nOTP\nEmail\nPassword\nForgot password?\nSign in\nOr continue with\nGoogle\nGoogle\nGithub\nGithub\nLogin with passkey\nDon't have an account yet?\nCreate an account", + "content_length": 216, + "internal_links": [ + "https://noveum.ai/", + "https://noveum.ai/auth/forgot-password", + "https://noveum.ai/auth/signup" + ], + "scraped_at": 1759935892.526221 + }, + { + "url": "https://noveum.ai/docs", + "title": "Welcome to Noveum.ai Docs | Documentation | Noveum.ai", + "content": "Welcome to Noveum.ai Docs\nComprehensive AI tracing and observability for LLM applications, RAG systems, and AI agents\nWelcome to the Noveum.ai documentation! Here you'll find everything you need to integrate, configure, and optimize your AI applications using our comprehensive tracing and observability platform.\nWe provide powerful SDKs for Python and TypeScript that enable you to trace LLM calls, RAG pipelines, and multi-agent workflows with minimal code changes. Our documentation is organized to help you get started quickly while diving into advanced observability patterns and best practices.\nπŸš€ What is Noveum.ai?\nNoveum.ai is a comprehensive AI tracing and observability platform designed specifically for modern AI applications. 
Unlike traditional monitoring tools, Noveum understands the unique challenges of LLM applications, RAG systems, and AI agents.\nCore Components\n-\n🐍 Python SDK (\nnoveum-trace\n)- Decorator-based tracing for LLM calls, agents, and RAG pipelines\n- Automatic instrumentation for popular AI frameworks\n- Context propagation across async operations\n-\nπŸ“˜ TypeScript SDK (\n@noveum/trace\n)- Framework integrations for Next.js, Express.js, and Hono\n- TypeScript-first design with full type safety\n- Universal compatibility (Node.js, Edge Runtime, browsers)\n-\nπŸ“Š Noveum Platform\n- Real-time dashboard for analyzing traces and performance\n- Advanced filtering and search capabilities\n- Cost analysis and optimization insights\n- Team collaboration and project management\nπŸƒ Quick Start\n1. Choose Your SDK\n2. View Your Data\nWithin minutes, you'll see comprehensive traces in the Noveum dashboard:\n- πŸ” Request/Response details and timing\n- πŸ’° Cost tracking across providers\n- πŸš€ Performance metrics (latency, throughput)\n- πŸ› Error analysis and debugging context\nπŸ“š Documentation Structure\nGetting Started\n-\nOverview Introduction to Noveum.ai's tracing and observability capabilities.\n-\nSDK Integration Step-by-step guide to integrate Python or TypeScript SDKs into your application.\n-\nTracing Concepts Understanding traces, spans, and observability best practices for AI applications.\n-\nFramework Integrations Specific guides for Next.js, Express.js, FastAPI, and other popular frameworks.\nAdvanced Usage\n-\nMulti-Agent Tracing Observe complex agent workflows and inter-agent communications.\n-\nRAG Pipeline Observability Monitor retrieval, generation, and context handling in RAG systems.\n-\nCustom Instrumentation Add custom spans and attributes for domain-specific observability.\n-\nPerformance Optimization Use tracing data to identify bottlenecks and optimize AI application performance.\nPlatform Features\n-\nDashboard Overview Navigate the Noveum 
platform and understand key metrics.\n-\nProjects & Environments Organize your applications and manage different deployment environments.\n-\nTeam Collaboration Share insights and collaborate on AI application observability.\n-\nAPI Reference Direct API access for custom integrations and advanced use cases.\n🎯 Use Cases\nLLM Application Monitoring\nTrack every LLM call across your application with automatic cost calculation, latency measurement, and error tracking.\nRAG System Observability\nMonitor the entire RAG pipeline from query understanding to document retrieval to answer generation.\nMulti-Agent Workflows\nObserve complex agent interactions, tool usage, and decision-making processes across distributed AI systems.\nPerformance Optimization\nIdentify slow operations, expensive API calls, and opportunities for caching or model optimization.\nπŸ”— SDK Resources\nPython SDK\n- πŸ“¦ PyPI Package: noveum-trace\n- πŸ™ GitHub Repository: Noveum/noveum-trace\n- πŸ“– API Documentation: Python SDK Docs\nTypeScript SDK\n- πŸ“¦ NPM Package: @noveum/trace\n- πŸ™ GitHub Repository: Noveum/noveum-trace-ts\n- πŸ“– API Documentation: TypeScript SDK Docs\nπŸ’‘ Key Benefits\n- πŸ”§ Minimal Setup: Start tracing with just a few lines of code\n- 🎯 AI-Native: Purpose-built for LLM, RAG, and agent observability\n- πŸš€ Production Ready: Battle-tested at scale with intelligent sampling\n- πŸ”’ Secure: End-to-end encryption with configurable data retention\n- 🌍 Universal: Works across frameworks, clouds, and deployment models\n🀝 Community & Support\n- πŸ’¬ Discord Community: Join our Discord\n- πŸ“§ Email Support: [email protected]\n- πŸ› Bug Reports: GitHub Issues\n- πŸ“– Knowledge Base: Help Center\nReady to get started? Head to our SDK Integration Guide to begin tracing your AI applications in under 5 minutes!\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 4652, + "internal_links": [], + "scraped_at": 1759935893.7828538 + }, + { + "url": "https://noveum.ai/en/docs/getting-started/sdk-integration", + "title": "SDK Integration Guide | Documentation | Noveum.ai", + "content": "SDK Integration Guide\nIntegrate Noveum.ai tracing into your AI applications with Python or TypeScript SDKs\nThe Noveum.ai SDKs provide comprehensive tracing and observability for your AI applications with minimal code changes. Whether you're building LLM applications, RAG systems, or multi-agent workflows, our SDKs automatically capture essential metrics and traces.\nπŸš€ Quick Start\n1. Create Your Account & Get API Key\n- Sign up at noveum.ai\n- Create a project in your dashboard\n- Generate an API key from the integration page\n- Choose your SDK based on your application language\n2. Install the SDK\nRequirements: Python 3.8+\n3. 
Initialize the Client\nEnvironment Variables:\n🎯 Basic Usage\nTrace LLM Calls\nAlternative - Context Manager:\nTrace RAG Pipelines\nπŸ”§ Framework Integrations\nNext.js Integration\nExpress.js Integration\nFastAPI Integration (Python)\nπŸ“Š Advanced Features\nCustom Attributes & Events\nSampling Configuration\nπŸ”— What's Captured Automatically\n- πŸ“Š Performance Metrics: Latency, throughput, error rates\n- πŸ’° Cost Tracking: Token usage, API costs across providers\n- πŸ” Request/Response: Configurable capture of inputs/outputs\n- 🏷️ Metadata: Model names, parameters, user context\n- 🌊 Context Flow: Trace relationships across services\n- πŸ› Error Details: Stack traces, error classification\nπŸ“ˆ View Your Data\nOnce integrated, visit your Noveum Dashboard to:\n- πŸ” Search & Filter traces by any attribute\n- πŸ“Š Analyze Performance trends and bottlenecks\n- πŸ’° Monitor Costs across different models and providers\n- πŸ› Debug Issues with detailed trace timelines\n- πŸ‘₯ Collaborate with your team on insights\nπŸ”’ Security & Privacy\n- πŸ” Encryption: All data encrypted in transit and at rest\n- πŸŽ›οΈ Configurable Capture: Control what data is collected\n- 🏠 Data Residency: Choose your data storage region\n- ⏰ Retention Control: Set custom data retention policies\nNext Steps\n- Tracing Concepts - Learn about traces, spans, and observability best practices\n- Framework Integrations - Deep dive into specific framework setups\n- Multi-Agent Tracing - Observe complex agent workflows\n- Dashboard Guide - Master the Noveum platform interface\nExclusive Early Access\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2378, + "internal_links": [], + "scraped_at": 1759935895.715991 + }, + { + "url": "https://noveum.ai/en/docs/getting-started/overview", + "title": "Noveum.ai Overview | Documentation | Noveum.ai", + "content": "Noveum.ai Overview\nComprehensive AI tracing and observability platform for LLM applications, RAG systems, and AI agents\nWelcome to Noveum.aiβ€”the comprehensive tracing and observability platform built specifically for AI applications. Whether you're building LLM-powered chatbots, RAG systems, multi-agent workflows, or any AI-driven application, Noveum provides the insights you need to understand, debug, and optimize your systems.\n🎯 Why AI Applications Need Specialized Observability\nTraditional monitoring tools fall short when it comes to AI applications because they don't understand:\n- πŸ“Š AI-Specific Metrics: Token usage, model costs, prompt effectiveness\n- πŸ”€ Complex Workflows: Multi-step RAG pipelines, agent interactions, tool usage\n- 🧠 Context Flow: How data moves through embeddings, retrievals, and generations\n- πŸ’° Cost Attribution: Which operations drive your AI spending\n- 🎯 Quality Metrics: Beyond latency - understanding output quality and relevance\nNoveum.ai bridges this gap with purpose-built observability for the AI era.\nπŸš€ Core Platform Components\n1. 🐍 Python SDK (noveum-trace\n)\n- Decorator-based tracing for seamless integration\n- Automatic instrumentation for LangChain, LlamaIndex, and OpenAI\n- Async-aware context propagation\n- Production-ready with intelligent sampling and batching\n2. πŸ“˜ TypeScript SDK (@noveum/trace\n)\n- Framework integrations for Next.js, Express.js, Hono\n- TypeScript-first with full type safety\n- Universal compatibility (Node.js, Edge Runtime, browsers)\n- Zero-config automatic instrumentation\n3. 
πŸ“Š Noveum Platform\n- Real-time dashboard with AI-specific visualizations\n- Advanced search & filtering across traces and spans\n- Cost analysis and optimization recommendations\n- Team collaboration with shared insights and alerts\nπŸ” What Noveum Traces\nLLM Operations\n- Model calls across all providers (OpenAI, Anthropic, Google, etc.)\n- Token usage and cost calculation\n- Prompt engineering effectiveness\n- Response quality metrics\nRAG Pipelines\n- Document retrieval performance and relevance\n- Embedding generation costs and latency\n- Context assembly and prompt construction\n- Answer generation with source attribution\nMulti-Agent Systems\n- Agent interactions and communication patterns\n- Tool usage and external API calls\n- Decision trees and reasoning chains\n- Workflow orchestration across agents\nCustom Operations\n- Business logic specific to your domain\n- External integrations and API calls\n- Data processing pipelines\n- User interactions and session flows\n🎯 Key Benefits\nπŸ”§ Developer Experience\n- 5-minute setup with minimal code changes\n- Intelligent defaults that work out-of-the-box\n- Rich SDKs with comprehensive documentation\n- Local development support with optional cloud sync\nπŸ“Š Production Insights\n- Real-time monitoring of AI application health\n- Performance optimization with bottleneck identification\n- Cost management with detailed spend analysis\n- Quality assurance through automated alerting\nπŸ”’ Enterprise Ready\n- Security first with end-to-end encryption\n- Compliance support for regulated industries\n- Scalable architecture handling millions of traces\n- Data sovereignty with region-specific storage\nπŸ‘₯ Team Collaboration\n- Shared dashboards for cross-functional teams\n- Incident management with trace-based debugging\n- Performance baselines and regression detection\n- Knowledge sharing through trace annotations\nπŸ“ˆ Common Use Cases\nπŸ€– LLM Application Monitoring\nTrack every aspect of your LLM-powered 
application:\n- Monitor response quality and user satisfaction\n- Optimize prompt engineering for better results\n- Control costs across different models and providers\n- Debug edge cases and improve error handling\nπŸ” RAG System Optimization\nUnderstand and improve your RAG pipeline:\n- Measure retrieval accuracy and relevance\n- Optimize embedding models and vector search\n- Track context utilization and prompt effectiveness\n- Debug hallucinations and improve grounding\n🀝 Multi-Agent Coordination\nObserve complex agent interactions:\n- Visualize agent communication patterns\n- Track tool usage and external dependencies\n- Optimize workflow efficiency and resource usage\n- Debug coordination failures and deadlocks\nπŸš€ Performance Engineering\nOptimize your AI application performance:\n- Identify slow operations and bottlenecks\n- Right-size models for your workload\n- Implement intelligent caching strategies\n- Scale services based on actual usage patterns\n🎨 Platform Features\nπŸ” Trace Explorer\n- Hierarchical visualization of complex AI workflows\n- Timeline view showing operation sequences\n- Detailed span inspection with all attributes and events\n- Cross-trace correlation for distributed operations\nπŸ’° Cost Analytics\n- Real-time cost tracking across all AI providers\n- Cost attribution by user, feature, or operation\n- Budget alerts and spending forecasts\n- Optimization recommendations for cost reduction\nπŸ“Š Performance Dashboard\n- Latency percentiles and throughput metrics\n- Error rates and failure analysis\n- Model comparison across providers and versions\n- Custom metrics and business KPIs\n🚨 Alerting & Monitoring\n- Intelligent alerts based on AI-specific thresholds\n- Anomaly detection for unusual patterns\n- Escalation policies for critical issues\n- Integration with Slack, PagerDuty, and more\nπŸ› οΈ Integration Patterns\nIncremental Adoption\nStart small and expand coverage:\n- Single endpoint tracing for immediate value\n- Critical 
path instrumentation for core workflows\n- Full application coverage for comprehensive insights\n- Advanced features like custom metrics and alerts\nFramework Integration\nNative support for popular frameworks:\n- Next.js with App Router and API routes\n- Express.js and other Node.js frameworks\n- FastAPI and Flask for Python applications\n- Custom integrations for any framework\nCI/CD Integration\nEmbed observability in your development process:\n- Performance regression detection in CI\n- Trace-based testing for quality assurance\n- Deployment monitoring with rollback triggers\n- Feature flag integration for safe releases\n🌟 Getting Started\nReady to transform your AI application observability? Here's your path:\n- Quick Start - Integrate your first SDK in 5 minutes\n- Tracing Concepts - Learn the fundamentals\n- Framework Guides - Deep dive into your stack\n- Advanced Features - Unlock the full platform potential\nBuilt by developers, for developers. Noveum.ai understands that AI applications are different, and we've designed our platform from the ground up to meet their unique observability needs.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 6733, + "internal_links": [ + "https://noveum.ai/docs/advanced" + ], + "scraped_at": 1759935896.711361 + }, + { + "url": "https://noveum.ai/en/changelog", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "Changelog\nStay up to date with the latest changes in our product.\n2 months ago\n- 🎯 Major Platform Evolution: NovaEval Framework - Launched comprehensive AI model evaluation framework with 20+ built-in scorers including accuracy, RAG metrics, conversational scoring, and specialized agent evaluation tools. Features production-grade deployment on Docker and Kubernetes. 
(GitHub | PyPI Package)\n- πŸš€ Next-Generation Traces UI - Complete redesign featuring three-pane layout with directory tree navigation, advanced filtering, real-time search, mobile responsiveness, and comprehensive keyboard shortcuts. Includes connection status monitoring and performance optimizations.\n- πŸ—„οΈ ClickHouse Integration & BYOD Support - Full ClickHouse telemetry backend with connection pooling, retry mechanisms, advanced monitoring, and bring-your-own-database capabilities. Includes secure configuration management and performance optimizations.\n- πŸ“¦ Noveum Trace SDK - Production-ready Python and TypeScript SDKs for comprehensive AI application tracing. Features automatic instrumentation, multi-agent support, and seamless integration with popular frameworks. (GitHub | PyPI Package)\n- πŸ” Enhanced Security: Encrypted API key storage with support for OpenAI, AWS, Anthropic, and other major providers\n- πŸ“Š Advanced Analytics: Real-time metrics dashboard with cost tracking, error rate monitoring, and performance insights\n- πŸ”§ Developer Experience: Comprehensive integration guides, documentation updates, and SDK examples\n- πŸ—οΈ Infrastructure: Kubernetes deployments, Docker optimizations, and scalable architecture improvements\n3 months ago\n- 🎨 Landing Page Redesign: Modern, responsive design with improved user experience and conversion optimization\n- πŸ“ˆ Enhanced Dashboard Analytics: Real-time request tracking, latency monitoring, and cost analysis\n- πŸ” Improved Logs Interface: Better search functionality, detailed trace views, and enhanced debugging capabilities\n- πŸ‘₯ Team Management: Advanced member management with role-based access control and invitation system\n- πŸ›‘οΈ Security Improvements: Enhanced API key management and secure credential storage\n4 months ago\n- πŸ“Š Advanced Metrics Collection: Comprehensive telemetry system with custom metrics support\n- ⚑ Performance Optimizations: Improved database queries and 
response times across the platform\n- 🧩 Model Comparison Tools: Side-by-side evaluation capabilities for AI model performance\n- 🌐 Cross-Platform Compatibility: Enhanced browser support and mobile responsiveness\n- πŸ“ Documentation Updates: Comprehensive guides for integration and best practices\n5 months ago\n- πŸŽ‰ Platform Foundation: Initial release of Noveum.ai AI observability platform\n- πŸš€ Core Features: AI Gateway integration, basic evaluation metrics, and monitoring capabilities\n- πŸ“ˆ Evaluation Jobs: Automated model evaluation with configurable metrics and reporting\n- πŸ”Œ Provider Integrations: Support for major AI providers and custom model deployments", + "content_length": 2960, + "internal_links": [], + "scraped_at": 1759935898.441791 + }, + { + "url": "https://noveum.ai/en/blog/evals-for-ai-agents", + "title": "Evals for AI Agents: What They Are, Why They Matter, and How Noveum.ai Makes Them Practical | Noveum.ai", + "content": "Evals for AI Agents: What They Are, Why They Matter, and How Noveum.ai Makes Them Practical\nAditi Upaddhyay\n9/25/2025\nIf you’ve ever shipped an AI agent, you’ve probably felt this: it works great in your dev sandbox, but the moment it meets real users, things get messy. Suddenly, your β€œreliable” agent is skipping steps, hallucinating facts, or taking 20 seconds to do something simple.\nThat’s where evals come in. They’re not about passing or failing in the old-school software senseβ€”they’re about continuously measuring whether your agent is doing the job you hired it for.\nIn this blog, we’ll cover:\n- What evals for AI agents actually mean.\n- Why they’re a must-have (not a β€œnice-to-have”).\n- How Noveum.ai helps you run evals without slowing down your roadmap.\nLet’s dive in.\n1. What are Evals for AI Agents?\nThink of evals as health check-ups for your AI agents. 
Just like you wouldn’t assume you’re healthy without getting your vitals checked, you shouldn’t assume your agent is β€œgood enough” without running evals.\nIn plain terms\nEvals are structured tests that tell you:\n- Did the agent do what it was supposed to?\n- How well did it do it?\n- Was it safe, accurate, and efficient?\nUnlike traditional QA, which looks for bugs, evals look for quality signalsβ€”correctness, completeness, tone, safety, cost, and speed.\nTypes of evals you’ll hear about\n- Offline evals - Run before release, on a fixed dataset. Great for testing prompts or comparing models.\n- Online evals - Run in production on real traffic. Crucial for catching drift, edge cases, and \"silent failures.\"\n- End-to-end evals - Test an entire workflow, not just one answer.\n- Span-level evals - Check each step in the traceβ€”retrieval, reasoning, tool calls.\n- Safety evals - Spot harmful, biased, or policy-violating answers.\nScoring methods\n- Exact matches for structured answers.\n- Rubrics for tone and reasoning.\n- Panel of LLMs (multiple judges instead of one) to cut down bias.\n- Cost + latency tracking side by side with accuracy.\nπŸ‘‰ In short: evals give you visibility. Without them, you’re just trusting your gut.\n2. Why are Evals Necessary for AI Agents?\nLet’s be blunt: AI agents don’t crash with an error message. They fail quietly.\n- A travel booking agent might β€œforget” a step in the workflow.\n- A fintech agent might miscalculate fees.\n- A support agent might confidently tell a customer something false.\nThese failures don’t show up in logsβ€”they show up in churn, complaints, and unexpected cloud bills.\nReal-world challenges evals solve\n- Data drift: Users change how they ask questions. 
Models can degrade over time.\n- Messy inputs: People paste screenshots, slang, typosβ€”stuff you never tested for.\n- Hidden costs: A prompt tweak that looks harmless can double your token spend.\n- Compliance needs: For sensitive domains, you need proof your system is safe.\nThe balance every team faces\nYou want your agent to be:\n- Accurate (so users trust it),\n- Fast (so they don’t drop off),\n- Affordable (so margins stay healthy).\nWithout evals, you’re flying blind on all three.\nThat’s why leading teams now treat evals like CI/CD for AI. Every prompt, model, or tool change gets tested against baselines before going live.\n3. How Noveum.ai Helps You Run Evals (Without the Headache)\nYou could build an eval system yourself. Many teams try. Most underestimate the complexity: trace capture, scoring frameworks, dashboards, alerts, model-judge orchestration, drift detection…the list goes on.\nNoveum.ai was built to solve this problem end-to-end.\nSee exactly what your agent is doing (NovaTrace SDK)\n- Instrument once, capture everything: inputs, outputs, tokens, latency, tool calls.\n- Visualize every trace as a flow chartβ€”see where it struggled or wasted time.\n- Span-level clarity: you don’t just see that it failed, you see why.\nScore with fairness and depth (NovaEval)\n- Panel-of-LLMs (patent pending): multiple judges reduce bias.\n- Pre-built scorers: ExactMatch, F1, RAG relevance, Safety.\n- Custom rubrics: Define β€œgood” in your domain (finance β‰  healthcare).\n- Mix human review where it matters most.\nTest like you deploy\n- Offline evals for prompt/model experiments.\n- Shadow or canary live traffic to catch issues before full rollout.\n- A/B test prompts, retrieval configs, or model swaps with dashboards that track accuracy + cost + latency.\nOperate with confidence\n- Set thresholds (e.g., accuracy β‰₯85%, p95 latency <2s, cost single-judge bias.\n- Production-first - Canary + shadow testing keep evals alive after launch.\n- Actionable insights 
- Not just chartsβ€”concrete next steps.\n- Scales with your roadmap - From one agent to many, from single prompt to multi-agent orchestration.\nFinal Thoughts\nAI agents are powerful, but they’re unpredictable without the right checks in place. Evals are how you turn that unpredictability into measurable, improvable performance.\nAt Noveum.ai, we’re not just giving you dashboards. We’re giving you a system that helps your agents get better week after weekβ€”automatically.\nIf you’re serious about shipping reliable AI agents, it’s time to put evals at the center of your workflow.\nπŸ‘‰ Contact our cofounder at [email protected] or book a call here: https://calendly.com/aditi-noveum\nGet Early Access to Noveum.ai Platform\nJoin the select group of AI teams optimizing their models with our data-driven platform. We're onboarding users in limited batches to ensure a premium experience.", + "content_length": 6415, + "internal_links": [], + "scraped_at": 1759935899.3275049 + }, + { + "url": "https://noveum.ai/en/blog/gpt-oss-vs-gpt-5-vs-gpt-4o-mini-mmlu-evaluation-report", + "title": "GPT-OSS vs GPT-5 vs GPT-4o-mini β€” MMLU Benchmark Comparison (Accuracy, Runtime, Thinking Modes) | Noveum.ai", + "content": "GPT-OSS vs GPT-5 vs GPT-4o-mini β€” MMLU Benchmark Comparison (Accuracy, Runtime, Thinking Modes)\nShivam Gupta\n8/13/2025\nGenerated on: 2025-08-13\nThis report is a plain-English summary of how seven model setups perform on the MMLU benchmark across 10 subjects (about 500 questions each). 
We compare:\n- accuracy (who gets the most answers right)\n- speed/runtime (how fast the models finish)\n- the impact of GPT-OSS β€œthinking modes” (low, medium, high, unspecified) on results and efficiency\nUse this to quickly choose the model or mode that best fits your needsβ€”fastest, most accurate, or most consistent.\nTL;DR\n- Most accurate: GPT-5 (91.38%)\n- Best speed/efficiency: GPT-OSS (low thinking mode)\n- Best balance (accuracy + runtime): O3 or GPT-OSS (medium thinking mode)\n1. Overall Model Performance\nThe evaluation tested 7 model configurations across different thinking modes, revealing significant performance variations and the impact of reasoning strategies:\n| Model | Accuracy | Questions | Performance Rank | Thinking Mode |\n|---|---|---|---|---|\n| GPT-5 (OpenAI) | 91.38% | 500 | 1st | unspecified |\n| O3 (OpenAI) | 88.60% | 500 | 2nd | unspecified |\n| GPT-OSS (unspecified) | 88.40% | 500 | 3rd | unspecified |\n| GPT-OSS (medium) | 87.20% | 500 | 4th | medium |\n| GPT-OSS (low) | 84.77% | 499 | 5th | low |\n| GPT-OSS (high) | 83.00% | 500 | 6th | high |\n| GPT-4o-mini (OpenAI) | 74.20% | 500 | 7th | unspecified |\nPerformance Insights:\n- Clear leader: GPT-5 tops accuracy (91.38%)\n- Strong contenders: O3 (88.60%) and GPT-OSS (unspecified: 88.40%)\n- Long tail: GPT-4o-mini trails at 74.20%\n- Gap: 17.18pp between first and last\n2. 
Subject-Wise Performance Analysis\nThe evaluation covered 10 academic subjects, revealing subject-specific strengths and weaknesses across all models:\nTop Performing Subjects\n- Elementary Mathematics: 95.71% average accuracy\n- College Physics: 94.29% average accuracy\n- Conceptual Physics: 90.57% average accuracy\nMost Challenging Subjects\n- College Chemistry: 63.63% average accuracy\n- College Mathematics: 81.43% average accuracy\n- High School Mathematics: 83.69% average accuracy\nSubject Difficulty Analysis:\n- Hardest: College Chemistry (36.37% difficulty)\n- Easiest: Elementary Mathematics (4.29% difficulty)\n- Pattern: Physics stays strong; advanced math varies by model\nSubject-Specific Performance Patterns:\nAbstract Algebra: GPT-OSS models dominate the top 2 positions, with both high and medium thinking modes achieving 92% accuracy. This suggests that GPT-OSS excels at mathematical reasoning tasks when given appropriate thinking parameters.\nCollege Chemistry: All models struggle, with GPT-5 performing best at 71.43%. The subject shows the highest variability, indicating fundamental challenges in chemical reasoning that persist across all thinking strategies.\nCollege Mathematics: O3 leads with 94% accuracy, followed closely by GPT-5. GPT-OSS models show consistent performance around 88%, suggesting good mathematical capabilities regardless of thinking mode.\nPhysics Subjects: Both college and conceptual physics show strong performance across models, with GPT-OSS low thinking mode achieving perfect scores in college physics, indicating efficient reasoning for physics problems.\nElementary Mathematics: All models perform exceptionally well (94%+), with GPT-5 leading at 98%. This suggests that basic mathematical reasoning is well-handled by all models.\n3. 
Thinking Mode Effects Analysis\nThe GPT-OSS model was tested across four thinking modes using an A100 40GB GPU, revealing critical insights into the relationship between reasoning depth and performance:\nThinking Mode Performance Comparison\n| Mode | Accuracy | Avg Thinking Tokens | GPU Runtime | Efficiency |\n|---|---|---|---|---|\n| Unspecified | 88.40% | 384.60 Β± 532.24 | 135.57s | Best Balance |\n| Medium | 87.20% | 390.57 Β± 570.66 | 138.42s | Most Efficient |\n| Low | 84.77% | 76.55 Β± 84.97 | 57.49s | Most Runtime-Efficient |\n| High | 83.00% | 1061.48 Β± 1311.63 | 279.54s | Least Efficient |\nKey Insights:\n- Low mode = fastest good-enough results (best runtime/accuracy tradeoff)\n- Medium mode = balanced choice for most workloads\n- Unspecified = highest accuracy within GPT-OSS, with extra runtime\n- High mode = slowest and often worse; use only for niche deep-reasoning cases\nRuntime-Performance Trade-offs:\n- Low mode: 0.68 seconds per percentage point of accuracy\n- Medium mode: 1.59 seconds per percentage point of accuracy\n- Unspecified mode: 1.53 seconds per percentage point of accuracy\n- High mode: 3.37 seconds per percentage point of accuracy\n4. Token Usage and Efficiency\nEfficiency Curves\n- Low mode: Best at 2–59 tokens (~90%); noticeable drop beyond 170 tokens\n- Medium mode: Broad sweet spot 15–446 tokens (~92.65%); balanced overall\n- High mode: Diminishing returns; accuracy declines past ~2500 tokens\n- Unspecified: Reliable 17–428 tokens (~93.77%); steady decline with complexity\n5. 
Runtime vs Accuracy\nRuntime-Effectiveness Rankings\n- Low Thinking Mode: 57.49s for 84.77% accuracy\n- Runtime per percentage point: 0.68s\n- Best value proposition for time-sensitive applications\n- Medium Thinking Mode: 138.42s for 87.20% accuracy\n- Runtime per percentage point: 1.59s\n- Good balance of performance and runtime\n- Unspecified Mode: 135.57s for 88.40% accuracy\n- Runtime per percentage point: 1.53s\n- Premium performance at higher runtime cost\n- High Thinking Mode: 279.54s for 83.00% accuracy\n- Runtime per percentage point: 3.37s\n- Least efficient option\nRuntime-Performance Insights:\n- Best ROI: Low mode\n- Best balance: Medium mode\n- Premium accuracy: Unspecified\n- Avoid: High mode unless required\nHow to choose (quick guide)\n- Max accuracy: Choose GPT-5\n- Speed/cost-sensitive: Choose GPT-OSS with thinking mode = low\n- Balanced: Choose O3 or GPT-OSS with thinking mode = medium\n- Chemistry-heavy workloads: Validate on your data; consider domain-tuned models\n6. Consistency (Subject Variance)\nConsistency Rankings (Lower CV = More Consistent)\n| Model | Mean Accuracy | CV | Consistency Rank |\n|---|---|---|---|\n| GPT-5 (OpenAI) | 91.34% | 8.58% | Most Consistent |\n| GPT-OSS Medium | 87.20% | 10.16% | 2nd |\n| GPT-OSS Unspecified | 88.40% | 10.21% | 3rd |\n| O3 (OpenAI) | 88.60% | 10.49% | 4th |\n| GPT-OSS Low | 84.78% | 11.60% | 5th |\n| GPT-OSS High | 83.00% | 11.82% | 6th |\n| GPT-4o-mini | 74.20% | 19.49% | Least Consistent |\nConsistency Insights:\n- GPT-5 shows remarkable consistency across subjects despite high performance\n- GPT-4o-mini exhibits high variability, suggesting subject-specific weaknesses\n- GPT-OSS models show moderate consistency with thinking mode effects\n- O3 model shows competitive performance (88.60%) with moderate consistency, positioning it as a strong alternative to GPT-5\n7. Thinking Tokens vs Accuracy\nThinking Token vs. 
Accuracy Correlations\nAll GPT-OSS models show negative correlations between thinking tokens and accuracy:\n- High Mode: -0.56 (strongest negative correlation)\n- Unspecified Mode: -0.38 (moderate negative correlation)\n- Medium Mode: -0.27 (moderate negative correlation)\n- Low Mode: Insufficient data for correlation (most responses use minimal tokens)\nInterpretation:\n- More thinking tokens correlate with lower accuracy\n- This suggests that simpler, more direct reasoning lead to better performance\n- Complex reasoning may introduce errors or overthinking\n- The relationship is consistent across all thinking modes for GPT-OSS 20B\n- High thinking mode shows the strongest negative correlation, indicating the most significant performance degradation with increased complexity\n8. Outliers (College Chemistry)\nSubject Outlier: College Chemistry\n- Z-score: -2.5354 (statistically significant outlier)\n- Mean Accuracy: 63.63%\n- Outlier Type: Low performer\nAnalysis:\n- College Chemistry shows significantly lower performance than expected\n- All models struggle with this subject, suggesting fundamental challenges\n- GPT-5 performs best at 71.43%, but even this is not good\n- The subject may require specialized knowledge or reasoning patterns not well-represented in the training data\n- Thinking mode variations show minimal impact, indicating the challenge is fundamental rather than reasoning-strategy dependent\n9. 
Recommendations\nFor Production Use\n- Runtime-Sensitive Applications: Use GPT-OSS with low thinking mode for best speed-accuracy ratio\n- Performance-Critical Applications: Use GPT-5 for maximum accuracy and consistency\n- Balanced Applications: Use O3 for high performance with moderate runtime, or GPT-OSS with medium thinking mode\n- Enterprise Applications: Consider O3 as a cost-effective alternative to GPT-5 when 88.6% accuracy is sufficient\nFor Research and Development\n- Consistency Studies: Focus on GPT-5 model for stable performance across subjects\n- Efficiency Optimization: Study low thinking mode patterns for runtime reduction\n- Subject-Specific Tuning: Develop specialized models for challenging subjects like Chemistry\n- Thinking Mode Research: Investigate why high thinking modes show performance degradation\nFor Model Selection\n- Academic Applications: Prioritize GPT-5 for comprehensive coverage, with O3 as a strong alternative\n- Resource-Constrained Environments: Choose GPT-OSS low thinking mode\n- Real-Time Applications: Consider medium thinking mode for speed-accuracy balance\n- Enterprise Deployments: O3 offers an excellent balance of performance (88.6%) and runtime efficiency\n10. 
Limitations and Future Work\nCurrent Limitations\n- All models have consistent sample sizes (approximately 500 questions each)\n- Single evaluation run per model configuration\n- Focus on GPT-OSS thinking modes only\n- Limited subject coverage (10 out of 57 MMLU subjects)\n- Runtime measurements based on A100 40GB GPU performance\nFuture Research Directions\n- Extended Subject Coverage: Evaluate all 57 MMLU subjects\n- Multiple Runs: Assess model consistency across multiple evaluations\n- Thinking Mode Optimization: Develop adaptive thinking mode selection based on problem complexity\n- Runtime Analysis: Include latency and throughput metrics across different GPU configurations\n- Cross-Model Comparison: Evaluate thinking modes across different model architectures\n- Performance Degradation Study: Investigate why high thinking modes show worse performance\nConclusion\nThis comprehensive MMLU evaluation provides critical insights into model performance, efficiency, and runtime characteristics across different thinking strategies. 
Key takeaways include:\n- GPT-5 remains the performance leader with excellent consistency (91.38% accuracy, 8.58% CV)\n- O3 emerges as a strong competitor with 88.6% accuracy, offering enterprise-grade performance\n- GPT-OSS offers excellent runtime-performance ratios across different thinking modes\n- Low thinking mode provides the best value for GPU runtime (57.49s for 84.77% accuracy)\n- Subject difficulty varies significantly, with Chemistry being most challenging (36.37% difficulty)\n- Model consistency varies widely, with GPT-5 showing remarkable stability\n- Thinking token efficiency shows that simpler reasoning often leads to better performance\n- High thinking modes show performance degradation, challenging the assumption that more complex reasoning improves results\nKey takeaways:\n- GPT-5 leads in accuracy and consistency (91.38%, CV 8.58%)\n- O3 is a strong, fast alternative (88.6%)\n- GPT-OSS (low mode) delivers the best speed/accuracy tradeoff\n- Chemistry is hardest; validate domain-specific workloads\n- More tokens β‰  better: excessive thinking often reduces accuracy\nPractical implication: prefer simpler, focused reasoning by default; scale up thinking depth only when problems demand it.\nReport generated from MMLU evaluation data covering 7 model configurations, 4 thinking modes, and 10 academic subjects across approximately 500 total samples per model. GPU runtime measurements based on A100 40GB GPU performance. Analysis generated on 2025-08-13.\nGet Early Access to Noveum.ai Platform\nJoin the select group of AI teams optimizing their models with our data-driven platform. 
We're onboarding users in limited batches to ensure a premium experience.", + "content_length": 12117, + "internal_links": [], + "scraped_at": 1759935900.21145 + }, + { + "url": "https://noveum.ai/en/blog/comprehensive-mmlu-evaluation-analysis-report", + "title": "o1-mini vs gpt-4o-mini β€” What We Learned from 1,000 MMLU Samples | Noveum.ai", + "content": "o1-mini vs gpt-4o-mini β€” What We Learned from 1,000 MMLU Samples\nShashank Agarwal\n8/12/2025\no1-mini vs gpt-4o-mini: MMLU math comparison\nWe compared Azure o1-mini and gpt-4o-mini on 1,000 MMLU math questions across 10 subjects using NovaEval. The goal was simple: learn when paying more for o1‑mini actually makes sense.\nKey takeaways\n- o1‑mini: 73.3% vs gpt‑4o‑mini: 57.2% (+16.1pp, p < 0.001)\n- Similar throughput and reliability for both models\n- o1‑mini costs ~15Γ— more per request; payoff depends on the cost of a wrong answer\nHow we tested\n- Dataset: 1,000 MMLU math questions; 809 processed per model after filtering/timeouts\n- Setup: Azure OpenAI deployments, identical prompts, seed 42, 10 workers\n- Framework: NovaEval with retries, checkpointing, and consistent answer extraction\nDetailed Results\nOverall Performance Metrics\n| Metric | gpt-4o-mini | o1-mini | Difference |\n|---|---|---|---|\n| Accuracy | 57.2% | 73.3% | +16.1% |\n| Correct Answers | 463/809 | 593/809 | +130 |\n| Processing Speed | 2.10 samples/sec | 2.27 samples/sec | +0.17 |\n| Total Time | 385.8 seconds | 357.0 seconds | -28.8s |\n| Error Rate | 0.2% (2 errors) | 0.1% (1 error) | -0.1% |\n| Status | Partial errors | Partial errors | - |\nStatistical Significance Analysis\n- Z-score: 5.24 (highly significant)\n- P-value: < 0.001 (extremely significant)\n- Effect Size (Cohen's h): 0.334 (Medium effect)\n- 95% Confidence Interval: [10.0%, 22.2%]\n- Statistical Power: > 99% (highly powered study)\nThe results demonstrate statistically significant and practically meaningful performance differences between the models.\nSubject-wise 
Performance Analysis\nPerformance by Mathematical Domain\n| Subject | gpt-4o-mini | o1-mini | Difference | Significant* |\n|---|---|---|---|---|\n| Abstract Algebra | 36.4% | 58.2% | +21.8% | βœ“ |\n| College Mathematics | 37.3% | 67.8% | +30.5% | βœ“ |\n| College Physics | 49.2% | 71.2% | +22.0% | βœ“ |\n| College Chemistry | 52.5% | 74.6% | +22.1% | βœ“ |\n| Conceptual Physics | 80.0% | 81.0% | +1.0% | βœ— |\n| Elementary Mathematics | 80.0% | 89.0% | +9.0% | βœ— |\n| High School Chemistry | 55.9% | 76.3% | +20.4% | βœ“ |\n| High School Mathematics | 40.7% | 79.7% | +39.0% | βœ“ |\n| High School Physics | 54.2% | 74.6% | +20.4% | βœ“ |\n| High School Statistics | 55.9% | 61.0% | +5.1% | βœ— |\n- Statistical significance at p < 0.05 level\nKey Subject Insights\n- Largest Performance Gap: High School Mathematics (39.0% difference)\n- Most Consistent Performance: Elementary Mathematics (both models > 80%)\n- Significant Improvements: 7 out of 10 subjects show statistically significant gains\n- College-level Advantage: o1-mini shows particularly strong performance in college-level subjects\nConfidence and Quality Analysis\nAnswer Extraction Confidence\n| Model | Overall Confidence | Correct Answers | Incorrect Answers |\n|---|---|---|---|\n| gpt-4o-mini | 0.859 | 0.859 | 0.859 |\n| o1-mini | 0.859 | 0.859 | 0.859 |\nBoth models demonstrate identical confidence patterns, suggesting consistent answer extraction methodology across different model architectures.\nResponse Quality Metrics\n- Average Response Length:\n- gpt-4o-mini: ~800 characters\n- o1-mini: ~850 characters\n- Extraction Success Rate: 99.9% for both models\n- Processing Reliability: > 99.8% success rate\nTechnical Implementation Details\nNovaEval Integration\nThe evaluation utilized a custom NovaEval implementation with the following components:\n- Azure OpenAI Client: Custom integration supporting both models\n- Answer Extraction: Multi-pattern regex-based extraction with confidence scoring\n- 
Concurrent Processing: 10-worker parallel processing for efficiency\n- Error Handling: Robust retry logic and checkpoint saving\nAnswer Extraction Methodology\n# Primary extraction patterns (in order of confidence)\npatterns = [\nr'^([A-D])', # Letter at start (confidence: 1.0)\nr'\\b([A-D])\\b', # Single letter (confidence: 0.9)\nr'answer\\s*is\\s*([A-D])', # \"answer is X\" (confidence: 0.8)\nr'([A-D])\\.', # Letter with period (confidence: 0.7)\nr'\\(([A-D])\\)', # Letter in parentheses (confidence: 0.6)\n# ... additional patterns with decreasing confidence\n]\nQuality Assurance\n- Reproducibility: Fixed random seed (42) ensures consistent results\n- Validation: Cross-validation with manual spot-checks\n- Error Monitoring: Real-time error tracking and logging\n- Checkpoint System: Regular saves prevent data loss\nCost Analysis\n- o1-mini costs 15x more than gpt-4o-mini ($0.001720 vs $0.000115 per request)\n- Break-even point: ~$0.01 per incorrect answer makes o1-mini cost-effective\n- ROI threshold: Applications where errors cost β‰₯$0.01 show positive ROI\n- Performance premium: 28% relative accuracy improvement for ~1,396% cost increase\nCost Breakdown Analysis\nTotal Evaluation Costs\n| Model | Total Cost | Cost per Request | Cost per Correct Answer | Accuracy |\n|---|---|---|---|---|\n| gpt-4o-mini | $0.0928 | $0.000115 | $0.000201 | 57.2% |\n| o1-mini | $1.3912 | $0.001720 | $0.002346 | 73.3% |\n| Difference | +$1.2984 | +$0.001605 | +$0.002145 | +16.1% |\nToken Usage Economics\ngpt-4o-mini Token Analysis:\n- Total Input Tokens: 154,036 (80.3% of total)\n- Total Output Tokens: 37,782 (19.7% of total)\n- Total Tokens: 191,818\n- Input Cost Share: 37.2% ($0.0345)\n- Output Cost Share: 62.8% ($0.0583)\no1-mini Token Analysis:\nToken breakdown omitted pending verified usage data. 
We will update this section with non-estimated counts to avoid artifacts.\nNote: Token estimation based on 1 token β‰ˆ 4 characters approximation\nBreak-Even Analysis\nCritical Break-Even Thresholds\nPrimary Break-Even Point: ~$0.01 per incorrect answer\n- With a per-request cost delta of $0.001605 and an accuracy delta of 0.161, the break-even error cost is: break-even β‰ˆ 0.001605 / 0.161 β‰ˆ $0.00997 per error (β‰ˆ $0.01). Example (10,000 requests): +1,610 additional correct answers at an added cost of ~$16.05.\nROI and Net Benefit (10K requests)\n- Net Benefit(10K, error_cost = C) β‰ˆ 1,610 Γ— C - $16.05\n- Examples:\n- C = $0.01 β†’ Net β‰ˆ $0.05 (β‰ˆ break-even)\n- C = $1.00 β†’ Net β‰ˆ $1,593.95\n- C = $10.00 β†’ Net β‰ˆ $16,083.95\nVolume-Based Break-Even Analysis\nScaling Economics (Cost Difference):\n| Request Volume | gpt-4o-mini Total Cost | o1-mini Total Cost | Cost Difference | Additional Correct |\n|---|---|---|---|---|\n| 100 | $0.01 | $0.17 | +$0.16 | 16 |\n| 1,000 | $0.12 | $1.72 | +$1.60 | 161 |\n| 10,000 | $1.15 | $17.20 | +$16.05 | 1,610 |\n| 100,000 | $11.50 | $172.00 | +$160.50 | 16,100 |\n| 1,000,000 | $115.00 | $1,720.00 | +$1,605.00 | 161,000 |\nUse Case Recommendations\nWhen o1-mini is Cost-Effective\nHigh-Value Applications (ROI > 100%)\n- Financial Trading Systems: Error cost $100-$10,000 per mistake\n- Medical Diagnosis Support: Error cost $1,000-$100,000 per mistake\n- Legal Document Analysis: Error cost $500-$50,000 per mistake\n- Quality Control Systems: Error cost $10-$1,000 per mistake\n- Critical Decision Support: Error cost $100-$10,000 per mistake\nMedium-Value Applications (ROI 25-100%)\n- Academic Research: Error cost $10-$100 per mistake\n- Business Intelligence: Error cost $50-$500 per mistake\n- Content Moderation: Error cost $1-$50 per mistake\n- Automated Grading: Error cost $5-$100 per mistake\nWhen gpt-4o-mini is Preferred\nCost-Sensitive Applications\n- Bulk Content Processing: High volume, low error cost\n- 
Development and Testing: Non-production environments\n- Exploratory Analysis: Initial data exploration\n- Non-Critical Evaluations: Low-stakes decision making\n- Budget-Constrained Projects: Limited financial resources\nSubject-Wise Cost Analysis\nCost Efficiency by Mathematical Domain\nTop Performing Subjects (o1-mini Cost Efficiency):\n| Subject | Accuracy | Cost per Request | Cost per Correct | Efficiency Score* |\n|---|---|---|---|---|\n| Elementary Mathematics | 89.0% | $0.00172 | $0.00193 | 517 |\n| Conceptual Physics | 81.0% | $0.00172 | $0.00212 | 471 |\n| High School Mathematics | 79.7% | $0.00172 | $0.00216 | 463 |\n| High School Physics | 74.6% | $0.00172 | $0.00231 | 434 |\n| College Chemistry | 74.6% | $0.00172 | $0.00231 | 434 |\n- Efficiency Score = (Accuracy / Cost per Request) Γ— 1000\nHighest Cost Premium Subjects:\n- High School Mathematics: 39.0% accuracy improvement, highest ROI\n- College Mathematics: 30.5% accuracy improvement, strong value\n- Abstract Algebra: 21.8% accuracy improvement, solid gains\nSubject-Specific Break-Even Analysis\nPremium Justified Subjects (Error cost threshold < $1.00):\n- High School Mathematics: $0.41 per error\n- College Mathematics: $0.53 per error\n- Abstract Algebra: $0.74 per error\nPremium Questionable Subjects (Error cost threshold > $2.00):\n- Conceptual Physics: $2.15 per error\n- Elementary Mathematics: $1.89 per error\n- High School Statistics: $3.21 per error\nStatistical Validation\nHypothesis Testing\nNull Hypothesis (Hβ‚€): No difference in accuracy between gpt-4o-mini and o1-mini Alternative Hypothesis (H₁): o1-mini has higher accuracy than gpt-4o-mini\nTest Results:\n- Z-statistic: 5.24\n- Critical value: 1.96 (Ξ± = 0.05)\n- Decision: Reject Hβ‚€ (5.24 > 1.96)\n- Conclusion: Strong evidence for superior o1-mini performance\nEffect Size Interpretation\nCohen's h = 0.334 indicates a Medium Effect Size:\n- Small effect: h < 0.2\n- Medium effect: 0.2 ≀ h < 0.5\n- Large effect: h β‰₯ 0.5\nConfidence 
Intervals\nThe 95% confidence interval [10.0%, 22.2%] for the accuracy difference indicates:\n- Lower bound: o1-mini is at least 10.0% better\n- Upper bound: o1-mini could be up to 22.2% better\n- Point estimate: 16.1% improvement is most likely\nBusiness and Practical Implications\nModel Selection Guidance\nWhen to Choose o1-mini:\n- Mathematical reasoning tasks (73.3% accuracy advantage)\n- College-level problem solving (strong performance across all college subjects)\n- High-stakes applications where accuracy is paramount\n- Complex analytical tasks requiring step-by-step reasoning\nWhen to Consider gpt-4o-mini:\n- Cost-sensitive applications (if pricing differs significantly)\n- Simple mathematical tasks where 57.2% accuracy is sufficient\n- High-throughput scenarios where speed matters more than accuracy\n- General-purpose applications beyond mathematical reasoning\nPerformance-Cost Analysis\nBased on the evaluation results:\n- Performance Gain: 28% relative improvement (73.3% vs 57.2%)\n- Speed Difference: Minimal (2.27 vs 2.10 samples/sec)\n- Reliability: Comparable error rates (< 0.2% for both)\n- ROI Calculation: Depends on specific use case and pricing structure\nRisk Assessment\nLow Risk Factors:\n- Consistent performance across multiple mathematical domains\n- Statistical significance with large sample size (809 samples per model)\n- Reproducible results with proper methodology\n- Robust evaluation framework with error handling\nConsiderations:\n- Domain specificity: Results specific to mathematical reasoning\n- Sample representation: Limited to MMLU mathematical subjects\n- Model versions: Results tied to specific model deployments\nRecommendations\nImmediate Actions\n- Deploy o1-mini for mathematical applications with confidence\n- Implement A/B testing for specific use cases to validate results\n- Monitor performance in production environments\n- Establish quality metrics based on confidence scoring\nStrategic Considerations\n- Expand evaluation to 
other MMLU domains (science, humanities, etc.)\n- Conduct cost-benefit analysis based on actual pricing\n- Develop hybrid approaches leveraging strengths of both models\n- Create performance benchmarks for ongoing model evaluation\nQuality Assurance Framework\n- Confidence Thresholding: Flag responses with confidence < 0.7\n- Subject-specific Monitoring: Track performance by mathematical domain\n- Error Pattern Analysis: Identify and address systematic failures\n- Continuous Evaluation: Regular re-assessment with new model versions\nPrimary Findings\nThis comprehensive evaluation provides definitive evidence that o1-mini significantly outperforms gpt-4o-mini on mathematical reasoning tasks:\n- Substantial Accuracy Improvement: 16.1 percentage point gain (28% relative improvement)\n- Statistical Significance: p < 0.001 with large effect size\n- Broad Applicability: Consistent improvements across 7/10 mathematical subjects\n- Production Readiness: Reliable performance with minimal error rates\nScientific Rigor\nThe evaluation meets high standards for scientific rigor:\n- Large sample size (1,000 samples, 809 processed per model)\n- Proper statistical testing with appropriate methods\n- Reproducible methodology with documented procedures\n- Comprehensive analysis including effect sizes and confidence intervals\nBusiness Impact\nFor organizations requiring mathematical reasoning capabilities:\n- Clear model choice: o1-mini provides superior performance\n- Quantified benefits: 28% relative improvement in accuracy\n- Risk mitigation: Statistically validated results reduce deployment risk\n- Strategic advantage: Early adoption of superior reasoning capabilities\nFuture Directions\nThis evaluation establishes a gold standard methodology for model comparison and provides a foundation for future research into reasoning model capabilities. 
The results strongly support the adoption of o1-mini for mathematical reasoning applications while highlighting the importance of rigorous evaluation in model selection decisions\nGet Early Access to Noveum.ai Platform\nJoin the select group of AI teams optimizing their models with our data-driven platform. We're onboarding users in limited batches to ensure a premium experience.", + "content_length": 13352, + "internal_links": [], + "scraped_at": 1759935901.217552 + }, + { + "url": "https://noveum.ai/en/blog/from-logs-to-intelligent-choices-inside-noveum-ais-evaluation-process", + "title": "From Development to Production - Inside Noveum.ai's AI Observability Platform | Noveum.ai", + "content": "From Development to Production - Inside Noveum.ai's AI Observability Platform\nShashank Agarwal\n3/3/2025\nIntroduction\nEvery AI application tells a storyβ€”but most developers never get to hear it. When your RAG pipeline returns irrelevant results, when your multi-agent system gets stuck in loops, or when your LLM costs suddenly spike, you're left guessing what went wrong and where.\nThat's exactly the challenge Noveum.ai solves. Rather than flying blind, our platform provides comprehensive tracing and observability specifically designed for AI applications. Whether you're building LLM-powered chatbots, RAG systems, or complex multi-agent workflows, Noveum.ai gives you the insights you need to understand, debug, and optimize your AI applications.\nThe AI Observability Challenge\nWhy Traditional Monitoring Falls Short\nTraditional application monitoring tools weren't built for AI applications. 
They can tell you if your API is responding, but they can't answer the questions that matter most for AI systems:\n- Why did my RAG pipeline retrieve irrelevant documents?\n- Which LLM calls are driving my costs?\n- How are my agents communicating with each other?\n- What's causing hallucinations in my responses?\n- Why is my embedding generation so slow?\nWhat Makes AI Applications Different\nAI applications have unique characteristics that require specialized observability:\n- 🧠 Context Flow: Data flows through embeddings, retrievals, and generations\n- πŸ’° Variable Costs: Token usage creates unpredictable expenses\n- πŸ”€ Complex Workflows: Multi-step pipelines with branching logic\n- πŸ€– Agent Interactions: Multiple AI entities coordinating tasks\n- πŸ“Š Quality Metrics: Success isn't just about uptimeβ€”it's about output quality\nStep 1: SDK Integration\nEffortless Instrumentation\nAt the heart of Noveum.ai are our Python and TypeScript SDKs that integrate seamlessly into your existing codebase. 
With just a few lines of code, you can start capturing comprehensive traces of your AI operations.\nimport noveum_trace\n# Initialize once at startup\nnoveum_trace.init(\napi_key=\"your-api-key\",\nproject=\"customer-support-bot\",\nenvironment=\"production\"\n)\n# Trace LLM calls automatically\n@noveum_trace.trace_llm\ndef generate_response(user_question: str) -> str:\nreturn openai.chat.completions.create(\nmodel=\"gpt-4\",\nmessages=[{\"role\": \"user\", \"content\": user_question}]\n).choices[0].message.content\nWhat Gets Captured Automatically\nOnce integrated, Noveum.ai automatically captures:\n- πŸ” Request/Response Data: Inputs, outputs, and transformations\n- ⏱️ Performance Metrics: Latency, throughput, and bottlenecks\n- πŸ’° Cost Tracking: Token usage and API costs across providers\n- 🏷️ Rich Metadata: Model parameters, user context, and custom attributes\n- 🌊 Context Flow: How data moves through your AI pipeline\n- πŸ› Error Details: Stack traces and failure analysis\nStep 2: Understanding Your AI Workflows\nRAG Pipeline Visibility\nRAG (Retrieval-Augmented Generation) systems involve multiple complex steps. 
Noveum.ai traces each phase, giving you complete visibility:\n@noveum_trace.trace(\"rag-pipeline\")\ndef answer_customer_question(question: str) -> str:\n# Phase 1: Query understanding\nwith noveum_trace.trace_step(\"query-analysis\") as step:\nintent = analyze_query_intent(question)\nstep.set_attribute(\"query.intent\", intent)\nstep.set_attribute(\"query.complexity\", get_complexity_score(question))\n# Phase 2: Document retrieval\nwith noveum_trace.trace_step(\"document-retrieval\") as step:\nembeddings = generate_embeddings(question)\ndocuments = vector_search(embeddings, k=5)\nstep.set_attribute(\"documents.retrieved\", len(documents))\nstep.set_attribute(\"documents.avg_similarity\", avg_similarity(documents))\nstep.set_attribute(\"retrieval.model\", \"text-embedding-ada-002\")\n# Phase 3: Answer generation\nwith noveum_trace.trace_step(\"answer-generation\") as step:\ncontext = build_context(documents)\nanswer = generate_answer_with_context(question, context)\nstep.set_attribute(\"context.length\", len(context))\nstep.set_attribute(\"answer.confidence\", calculate_confidence(answer))\nstep.set_attribute(\"generation.model\", \"gpt-4\")\nreturn answer\nMulti-Agent Coordination\nWhen multiple AI agents work together, Noveum.ai tracks their interactions and coordination:\nconst multiAgentTask = trace('customer-inquiry-resolution', async (inquiry: string) => {\n// Agent 1: Classification\nconst category = await span('classify-inquiry', async (spanInstance) => {\nspanInstance.setAttribute('agent.name', 'classifier');\nspanInstance.setAttribute('inquiry.length', inquiry.length);\nreturn await classificationAgent.categorize(inquiry);\n});\n// Agent 2: Research (if needed)\nlet context = null;\nif (category.needsResearch) {\ncontext = await span('research-context', async (spanInstance) => {\nspanInstance.setAttribute('agent.name', 'researcher');\nspanInstance.setAttribute('research.category', category.type);\nreturn await 
researchAgent.gatherContext(inquiry);\n});\n}\n// Agent 3: Response generation\nconst response = await span('generate-response', async (spanInstance) => {\nspanInstance.setAttribute('agent.name', 'responder');\nspanInstance.setAttribute('response.has_context', !!context);\nreturn await responseAgent.generate(inquiry, context);\n});\nreturn response;\n});\nStep 3: Real-Time Debugging and Optimization\nPerformance Bottleneck Identification\nNoveum.ai's dashboard automatically identifies performance issues:\n- 🐌 Slow Operations: Which LLM calls or embeddings are taking too long?\n- πŸ”„ Redundant Processing: Are you generating the same embeddings multiple times?\n- πŸ“Š Resource Usage: Which operations consume the most tokens or memory?\n- 🚨 Error Patterns: What types of failures occur most frequently?\nCost Optimization Insights\nWith detailed cost tracking, you can optimize your AI spending:\n- Provider Comparison: See actual costs across OpenAI, Anthropic, Google, etc.\n- Model Analysis: Compare performance vs. cost for different models\n- Usage Patterns: Identify expensive operations and optimize them\n- Budget Alerts: Get notified when costs exceed thresholds\nQuality Assurance\nBeyond performance, Noveum.ai helps ensure output quality:\n- Response Analysis: Track confidence scores and quality metrics\n- A/B Testing: Compare different models or prompts\n- User Feedback: Correlate user satisfaction with trace data\n- Drift Detection: Identify when model performance degrades\nReal-World Example: Customer Support Bot\nLet's walk through a real example. You've built a customer support bot using RAG that helps users with product questions. 
Here's how Noveum.ai provides insights:\nDevelopment Phase\nDuring development, you discover through tracing that:\n- Embedding generation takes 200ms on average\n- Vector search finds relevant documents 85% of the time\n- Answer generation costs $0.02 per query with GPT-4\nProduction Deployment\nIn production, Noveum.ai reveals:\n- Peak usage occurs during business hours, causing latency spikes\n- Certain question types consistently retrieve irrelevant documents\n- Token usage is 30% higher than expected due to verbose context\nOptimization Cycle\nBased on these insights, you:\n- Cache embeddings for common questions (reduces latency by 60%)\n- Improve vector search by fine-tuning similarity thresholds\n- Switch to GPT-3.5 for simple questions (reduces costs by 40%)\n- Implement streaming for better user experience\nContinuous Improvement\nAs your bot evolves:\n- New conversation patterns are automatically captured\n- Quality metrics help identify areas for improvement\n- Cost trends inform capacity planning\n- Error analysis guides bug fixes and feature development\nAdvanced Observability Patterns\nCustom Metrics and Attributes\nNoveum.ai allows you to add domain-specific insights:\n@noveum_trace.trace(\"content-moderation\")\ndef moderate_content(text: str, user_id: str):\n# Add business context\nnoveum_trace.set_attribute(\"user.trust_level\", get_user_trust_level(user_id))\nnoveum_trace.set_attribute(\"content.category\", classify_content_type(text))\nnoveum_trace.set_attribute(\"moderation.policy_version\", \"v2.1\")\n# Perform moderation\nresult = run_content_moderation(text)\n# Add results\nnoveum_trace.set_attribute(\"moderation.risk_score\", result.risk_score)\nnoveum_trace.set_attribute(\"moderation.action_taken\", result.action)\nreturn result\nError Tracking and Alerting\nComprehensive error handling with actionable insights:\nconst processDocument = trace('document-processing', async (documentId: string) => {\ntry {\nconst result = await 
span('extract-text', async () => {\nreturn await extractTextFromDocument(documentId);\n});\nreturn await span('analyze-content', async (spanInstance) => {\nspanInstance.setAttribute('document.word_count', result.wordCount);\nspanInstance.setAttribute('document.language', result.language);\nreturn await analyzeContent(result.text);\n});\n} catch (error) {\n// Rich error context for debugging\nconst currentSpan = getCurrentSpan();\ncurrentSpan.setAttribute('error.type', error.constructor.name);\ncurrentSpan.setAttribute('error.message', error.message);\ncurrentSpan.setAttribute('error.recoverable', isRecoverableError(error));\ncurrentSpan.setStatus('ERROR', error.message);\nthrow error;\n}\n});\nThe Future of AI Observability\nWhat's Coming Next\nAs AI applications become more sophisticated, observability needs to evolve:\n- πŸ€– Agent Ecosystems: Observing complex multi-agent societies\n- 🧠 Reasoning Chains: Tracing LLM thought processes\n- πŸ”„ Feedback Loops: Connecting user outcomes back to traces\n- πŸ“Š Quality Metrics: Advanced measures of AI output quality\n- πŸ›‘οΈ Safety Monitoring: Detecting harmful or biased outputs\nBuilding Observability-First AI Applications\nThe future belongs to teams who build observability into their AI applications from day one:\n- Faster Debugging: Find and fix issues before they impact users\n- Data-Driven Optimization: Make decisions based on real usage patterns\n- Proactive Monitoring: Catch problems before they become incidents\n- Continuous Improvement: Use traces to guide development priorities\nConclusion\nThe power of Noveum.ai lies in its comprehensive observability approach:\n- πŸš€ Easy Integration: Start tracing with minimal code changes\n- πŸ” Deep Insights: Understand every aspect of your AI workflows\n- πŸ“Š Actionable Analytics: Make data-driven optimization decisions\n- πŸ› οΈ Developer-Friendly: Built by AI engineers, for AI engineers\nEvery API call, every embedding generation, every agent interaction 
becomes part of a bigger picture that helps you build better AI applications. Instead of guessing why something went wrong, you have concrete data and detailed traces to guide your decisions.\nWhether you're debugging a complex RAG pipeline, optimizing multi-agent coordination, or simply trying to reduce your AI costs, Noveum.ai provides the visibility you need to succeed.\nReady to see your AI applications like never before? Start tracing today or talk to our team about your specific observability needs. We're here to help you build AI applications that are transparent, optimized, and reliableβ€”one trace at a time.\nGet Early Access to Noveum.ai Platform\nJoin the select group of AI teams optimizing their models with our data-driven platform. We're onboarding users in limited batches to ensure a premium experience.", + "content_length": 11174, + "internal_links": [], + "scraped_at": 1759935902.2391648 + }, + { + "url": "https://noveum.ai/en/blog/noveum-ai-your-one-stop-ai-evaluation-platform", + "title": "Noveum.ai - Comprehensive AI Tracing and Observability Platform | Noveum.ai", + "content": "Noveum.ai - Comprehensive AI Tracing and Observability Platform\nShashank Agarwal\n3/2/2025\nIntroduction\nArtificial Intelligence applications are becoming increasingly complex, with multi-step workflows, RAG pipelines, and sophisticated agent systems. But how do you debug when things go wrong? How do you optimize performance? How do you understand what's happening inside your AI applications? That's where Noveum.ai comes in.\nIn this post, I'll walk you through what Noveum.ai is, how it works, and why I believe it's essential for teams building production AI applications. My name is Shashank Agarwalβ€”founder of Noveum.ai, AI enthusiast, and someone who's been building and scaling large AI/ML platforms for over a decade. 
Let's dive in.\nWhy Noveum.ai?\nBuilding production AI applications involves complex workflows with multiple components: LLM calls, vector searches, data retrieval, agent reasoning, and more. Without proper observability, debugging becomes a nightmare, optimization is guesswork, and understanding user interactions is nearly impossible.\nNoveum.ai solves this by providing comprehensive tracing and observability specifically designed for AI applications.\nKey benefits of using Noveum.ai:\n- Complete Visibility: Trace every step of your LLM calls, RAG pipelines, and agent workflows\n- Easy Integration: Simple SDKs for Python and TypeScript with minimal code changes\n- Multi-Agent Support: Built-in support for complex multi-agent systems and workflows\n- Performance Insights: Detailed metrics on latency, token usage, costs, and success rates\n- Error Tracking: Automatic error capture and analysis for faster debugging\n- Framework Agnostic: Works with any LLM provider, framework, or architecture\nHow It Works\nStep 1: Install and Initialize the SDK\nNoveum.ai provides native SDKs for both Python and TypeScript applications. 
Getting started takes just minutes:\nPython SDK\npip install noveum-trace\nimport noveum_trace\n# Initialize the SDK\nnoveum_trace.init(\napi_key=\"your-noveum-api-key\",\nproject=\"my-ai-application\",\nenvironment=\"production\"\n)\nTypeScript SDK\nnpm install @noveum/trace\nimport { initializeClient } from '@noveum/trace';\nconst client = initializeClient({\napiKey: \"your-noveum-api-key\",\nproject: \"my-ai-application\",\nenvironment: \"production\",\n});\nStep 2: Add Tracing to Your Code\nWith Noveum.ai, adding comprehensive tracing to your AI applications is as simple as adding decorators or function calls:\nPython Examples\nBasic LLM Tracing:\n@noveum_trace.trace_llm\ndef call_openai(prompt: str) -> str:\nclient = openai.OpenAI()\nresponse = client.chat.completions.create(\nmodel=\"gpt-4\",\nmessages=[{\"role\": \"user\", \"content\": prompt}]\n)\nreturn response.choices[0].message.content\nMulti-Agent Workflows:\n@noveum_trace.trace_agent(agent_id=\"orchestrator\")\ndef orchestrate_workflow(task: str) -> dict:\n# Coordinate multiple agents\nresearch_result = research_agent(task)\nanalysis_result = analysis_agent(research_result)\nreturn synthesis_agent(research_result, analysis_result)\n@noveum_trace.trace_agent(agent_id=\"researcher\")\ndef research_agent(task: str) -> dict:\n# Research implementation with automatic tracing\nreturn {\"data\": \"...\", \"sources\": [...]}\nRAG Pipeline Tracing:\n@noveum_trace.trace_retrieval\ndef retrieve_documents(query: str) -> list:\n# Vector search implementation\nreturn vector_db.search(query)\n@noveum_trace.trace\ndef rag_pipeline(user_query: str) -> str:\ndocuments = retrieve_documents(user_query)\ncontext = prepare_context(documents)\nreturn generate_response(user_query, context)\nTypeScript Examples\nBasic Tracing:\nconst result = await trace('user-query-processing', async (traceInstance) => {\ntraceInstance.setAttribute('user.id', userId);\ntraceInstance.setAttribute('query.type', 'search');\n// Create spans for 
sub-operations\nconst embeddings = await span('generate-embeddings', async () => {\nreturn await openai.embeddings.create({\nmodel: 'text-embedding-ada-002',\ninput: userQuery,\n});\n});\nconst searchResults = await span('vector-search', async () => {\nreturn await vectorDB.search(embeddings.data[0].embedding);\n});\nreturn searchResults;\n});\nNext.js Integration:\n// app/api/chat/route.ts\nimport { withNoveumTracing } from '@noveum/trace/integrations/nextjs';\nexport const POST = withNoveumTracing(\nasync (request: NextRequest) => {\nconst { message } = await request.json();\nconst response = await processMessage(message);\nreturn NextResponse.json(response);\n},\n{\nclient,\nspanName: 'chat-completion',\ncaptureRequest: true,\n}\n);\nStep 3: Automatic Data Collection\nOnce integrated, Noveum.ai automatically captures:\n- Request/Response Data: Complete LLM prompts, responses, and parameters\n- Performance Metrics: Latency, token usage, throughput, and costs\n- Error Information: Stack traces, error messages, and failure patterns\n- Agent Interactions: Multi-agent communication and coordination patterns\n- Custom Attributes: Any additional context you want to track\nRich Metadata Example\n{\n\"trace_id\": \"trace_abc123\",\n\"span_id\": \"span_def456\",\n\"operation\": \"llm_completion\",\n\"model\": \"gpt-4\",\n\"provider\": \"openai\",\n\"duration_ms\": 1250,\n\"token_usage\": {\n\"input_tokens\": 150,\n\"output_tokens\": 75,\n\"total_tokens\": 225\n},\n\"cost\": {\n\"input_cost\": 0.0045,\n\"output_cost\": 0.0075,\n\"total_cost\": 0.012\n},\n\"attributes\": {\n\"user_id\": \"user_123\",\n\"session_id\": \"session_456\",\n\"query_type\": \"search\"\n},\n\"status\": \"success\"\n}\nStep 4: Analyze and Debug with the Dashboard\nNoveum.ai provides powerful dashboards to help you:\n- Trace Visualization: See complete request flows through your AI pipelines\n- Performance Analytics: Identify bottlenecks and optimization opportunities\n- Cost Analysis: Track spending 
across models, users, and features\n- Error Investigation: Quickly identify and debug issues in production\n- Agent Behavior: Understand how your multi-agent systems are performing\nStep 5: Continuous Optimization\nWith comprehensive tracing in place, you can:\n- A/B Test Models: Compare different LLMs on real production traffic\n- Optimize Prompts: See which prompts perform best for different use cases\n- Reduce Costs: Identify expensive operations and optimize them\n- Improve Quality: Monitor output quality and user satisfaction\n- Scale Confidently: Understand system behavior under load\nReal-World Example: RAG-Powered Customer Support\nImagine you're building an AI-powered customer support system with a RAG pipeline:\n- User Query: Customer asks about pricing\n- Document Retrieval: System searches knowledge base\n- Context Preparation: Relevant documents are processed\n- LLM Generation: GPT-4 generates response with context\n- Response Delivery: Answer is sent to customer\nWithout Noveum.ai: When the system gives wrong answers, you have no visibility into what went wrong. Was it poor retrieval? Bad context preparation? LLM hallucination?\nWith Noveum.ai: Every step is traced:\n@noveum_trace.trace\ndef handle_support_query(user_query: str, user_id: str) -> str:\n# Each step is automatically traced\n# Step 1: Query analysis\nquery_intent = analyze_query_intent(user_query)\n# Step 2: Document retrieval\nrelevant_docs = retrieve_documents(user_query, query_intent)\n# Step 3: Context preparation\ncontext = prepare_context(relevant_docs)\n# Step 4: LLM generation\nresponse = generate_response(user_query, context)\n# Step 5: Response validation\nvalidated_response = validate_response(response, user_query)\nreturn validated_response\nNow you can see:\n- Which queries are taking too long (retrieval vs. 
generation)\n- When retrieval is returning irrelevant documents\n- How much each interaction costs\n- Which responses users rate poorly\n- Complete audit trail for compliance\nAdvanced Features\nMulti-Agent System Observability\n@noveum_trace.trace_agent(agent_id=\"coordinator\")\ndef coordinate_research_project(topic: str) -> dict:\n# Assign tasks to specialist agents\nliterature_review = literature_agent(topic)\ndata_analysis = data_agent(topic)\nsynthesis = synthesis_agent(literature_review, data_analysis)\nreturn {\n\"literature\": literature_review,\n\"analysis\": data_analysis,\n\"synthesis\": synthesis\n}\nCustom Sampling and Filtering\nnoveum_trace.init(\napi_key=\"your-key\",\nproject=\"my-project\",\ntransport_config={\n\"sample_rate\": 0.1, # Sample 10% of traces\n\"capture_errors\": True, # Always capture errors\n\"capture_stack_traces\": False # Skip stack traces for performance\n}\n)\nContext Propagation\n// Automatic context propagation across async operations\nconst contextManager = getGlobalContextManager();\nawait contextManager.withSpan(span, async () => {\n// This function runs with span in context\nawait someNestedOperation();\n});\nWhy Choose Noveum.ai?\n- AI-Native Design: Built specifically for LLM applications, not generic APM tools\n- Easy Integration: Minutes to get started, not days of configuration\n- Framework Agnostic: Works with any LLM provider, vector database, or framework\n- Production Ready: Built for scale with intelligent sampling and batching\n- Privacy Focused: You control what data is captured and how it's stored\n- Open Source SDKs: Transparent, auditable, and extensible\nGetting Started\nReady to add comprehensive observability to your AI applications?\n- Sign up at noveum.ai\n- Create a project and get your API key\n- Install the SDK for your preferred language\n- Add tracing to your AI workflows\n- Explore insights in the Noveum.ai dashboard\nCheck out our integration guide for detailed setup instructions and 
examples.\nWrapping Up\nNoveum.ai brings the observability your AI applications deserve. No more black box debugging or guessing why your RAG pipeline failed. With comprehensive tracing, you get complete visibility into every LLM call, agent interaction, and data flow.\nThe AI landscape is complex and moving fast. With Noveum.ai, you'll have the insights you need to build, debug, and optimize AI applications with confidence.\nReady to see what's really happening inside your AI applications? Try Noveum.ai today and transform how you build and monitor AI systems.\nLet's build more reliable, observable AIβ€”together.\nGet Early Access to Noveum.ai Platform\nJoin the select group of AI teams optimizing their models with our data-driven platform. We're onboarding users in limited batches to ensure a premium experience.", + "content_length": 10177, + "internal_links": [ + "https://noveum.ai/integration" + ], + "scraped_at": 1759935903.28646 + }, + { + "url": "https://noveum.ai/en/legal/privacy-policy", + "title": "Privacy Policy | Noveum.ai", + "content": "Privacy Policy\nThis is the placeholder page for your privacy policy. Edit the content/legal/privacy-policy.md\nfile to add your own content here.\nThis is the placeholder page for your privacy policy. Edit the content/legal/privacy-policy.md\nfile to add your own content here.", + "content_length": 274, + "internal_links": [], + "scraped_at": 1759935904.1824288 + }, + { + "url": "https://noveum.ai/en/legal/terms", + "title": "Terms and conditions | Noveum.ai", + "content": "Terms and conditions\nThis is the placeholder page for your terms and conditions. Edit the content/legal/terms.md\nfile to add your own content here.\nThis is the placeholder page for your terms and conditions. 
Edit the content/legal/terms.md\nfile to add your own content here.", + "content_length": 274, + "internal_links": [], + "scraped_at": 1759935905.058074 + }, + { + "url": "https://noveum.ai/en/docs/getting-started/tracing-concepts", + "title": "Tracing Concepts for AI Applications | Documentation | Noveum.ai", + "content": "Tracing Concepts for AI Applications\nUnderstanding traces, spans, and observability fundamentals for LLM applications, RAG systems, and AI agents\nUnderstanding the fundamentals of tracing is essential for getting the most out of Noveum.ai. This guide explains key concepts specifically in the context of AI applications, helping you design effective observability strategies for your LLM applications, RAG systems, and AI agents.\n🎯 What is Tracing?\nTracing is the practice of tracking requests as they flow through your system, creating a detailed map of what happened, when, and how long each operation took. For AI applications, tracing provides crucial insights into:\n- πŸ” Request Flow: How user queries move through your AI pipeline\n- ⏱️ Performance: Where time is spent in your AI operations\n- πŸ’° Costs: Which operations drive your AI spending\n- πŸ› Debugging: What went wrong when errors occur\n- πŸ“Š Quality: How well your AI system is performing\n🌟 Core Concepts\n1. Traces\nA trace represents a single journey through your systemβ€”like a user asking a question and getting an answer. Think of it as the complete story of one request.\nTrace Characteristics:\n- πŸ†” Unique ID: Every trace has a unique identifier\n- ⏰ Timeline: Start and end timestamps\n- 🌐 Distributed: Can span multiple services\n- πŸ“Š Hierarchical: Contains multiple related spans\n2. Spans\nA span represents a single operation within a trace. 
Each span has a clear start and end time and represents work being done.\nSpan Characteristics:\n- πŸ“› Name: Descriptive name of the operation\n- ⏱️ Duration: How long the operation took\n- πŸ‘₯ Parent-Child: Spans can contain other spans\n- 🏷️ Attributes: Key-value metadata about the operation\n- πŸ“ Events: Point-in-time occurrences during the span\n3. Attributes\nAttributes are key-value pairs that provide context about what happened during a span. They're crucial for understanding and filtering your traces.\nCommon AI Attribute Categories:\n- πŸ€– LLM Attributes:\nllm.model\n,llm.provider\n,llm.temperature\n- πŸ’° Cost Attributes:\nllm.tokens.input\n,llm.tokens.output\n,llm.cost\n- πŸ‘€ User Attributes:\nuser.id\n,user.plan\n,user.location\n- πŸ“„ Content Attributes:\nprompt.length\n,response.length\n,content.type\n- πŸ” Quality Attributes:\nrelevance.score\n,confidence.level\n,accuracy.rating\n4. Events\nEvents represent things that happened at a specific point in time during a span. They're perfect for capturing important moments or milestones.\n🧠 AI-Specific Tracing Patterns\nRAG Pipeline Tracing\nRAG (Retrieval-Augmented Generation) systems have distinct phases that should be traced separately:\nMulti-Agent Tracing\nWhen dealing with multiple AI agents, trace their interactions and coordination:\nπŸ“Š Observability Best Practices\n1. Meaningful Span Names\nUse descriptive, consistent naming conventions:\n2. Rich Attributes\nInclude context that helps with debugging and analysis:\n3. Error Handling\nAlways capture error details:\n4. Performance Context\nInclude performance-relevant attributes:\nπŸ” Using Traces for Debugging\nCommon Debugging Scenarios\n1. Slow Response Times\n2. High Costs\n3. Quality Issues\n4. 
Error Patterns\n🎯 Next Steps\nNow that you understand tracing concepts, you're ready to:\n- Implement SDK Integration - Add tracing to your application\n- Explore Framework Integrations - Framework-specific guidance\n- Learn Advanced Patterns - Custom instrumentation techniques\n- Master the Dashboard - Analyze your traces effectively\nRemember: Good observability is not about collecting all possible data, but about collecting the right data that helps you understand, debug, and optimize your AI applications.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 3822, + "internal_links": [], + "scraped_at": 1759935906.108872 + }, + { + "url": "https://noveum.ai/en/docs/getting-started/framework-integrations", + "title": "Framework Integrations | Documentation | Noveum.ai", + "content": "Framework Integrations\nDeep-dive integration guides for Next.js, Express.js, FastAPI, Flask and other popular frameworks\nNoveum.ai provides native integrations for popular web frameworks, making it easy to add comprehensive tracing to your AI applications. This guide covers framework-specific setup, best practices, and advanced patterns.\nπŸš€ Quick Framework Overview\n| Framework | Language | Integration Type | Difficulty |\n|---|---|---|---|\n| Next.js | TypeScript | Middleware + Wrappers | ⭐ Easy |\n| Express.js | TypeScript | Middleware | ⭐ Easy |\n| Hono | TypeScript | Middleware + Decorators | ⭐ Easy |\n| FastAPI | Python | Middleware + Decorators | ⭐⭐ Moderate |\n| Flask | Python | Extensions + Decorators | ⭐⭐ Moderate |\n| Django | Python | Middleware + Decorators | ⭐⭐⭐ Advanced |\nπŸ“˜ TypeScript Frameworks\nNext.js Integration\nNext.js is one of the most popular frameworks for AI applications. 
Noveum provides seamless integration for both App Router and Pages Router.\nApp Router Setup\n1. Initialize Noveum (Root Layout)\n2. API Route Tracing\n3. Advanced API Route with Custom Tracing\nServer Actions Tracing\nExpress.js Integration\nExpress.js integration provides automatic tracing for all routes and middleware.\n1. Setup Middleware\n2. Manual Route Tracing\n3. Middleware with Custom Logic\nHono Integration\nHono is a lightweight framework perfect for edge computing and AI applications.\n🐍 Python Frameworks\nFastAPI Integration\nFastAPI is excellent for building high-performance AI APIs with automatic documentation.\n1. Setup with Middleware\n2. Traced Endpoints\n3. RAG Endpoint with Detailed Tracing\nFlask Integration\nFlask integration provides flexibility for existing applications.\n1. Setup with Extensions\n2. Traced Routes\n3. Background Task Tracing\nπŸ”§ Advanced Patterns\nEnvironment-Specific Configuration\nCustom Middleware\nError Boundary Integration\n🎯 Next Steps\nChoose your framework and dive deeper:\n- Implement Basic Integration - Start with the basics\n- Learn Tracing Concepts - Understand the fundamentals\n- Explore Advanced Patterns - Custom instrumentation\n- Master the Dashboard - Analyze your traces\nFramework not listed? Check our Custom Integration Guide or contact our team for specific framework support.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2462, + "internal_links": [], + "scraped_at": 1759935907.308911 + }, + { + "url": "https://noveum.ai/en/docs/advanced/multi-agent-tracing", + "title": "Multi-Agent Tracing | Documentation | Noveum.ai", + "content": "Multi-Agent Tracing\nObserve complex agent workflows and inter-agent communications with comprehensive tracing\nMulti-agent systems represent some of the most complex AI applications, involving multiple agents that coordinate, communicate, and collaborate to achieve shared goals. Noveum.ai provides specialized tracing capabilities to help you understand and optimize these intricate workflows.\n🎯 Why Multi-Agent Tracing Matters\nMulti-agent systems introduce unique observability challenges:\n- Complex Dependencies: Agents depend on each other's outputs and decisions\n- Asynchronous Operations: Agents may operate concurrently or in parallel\n- Communication Patterns: Understanding how agents share information\n- Resource Coordination: Managing shared resources and preventing conflicts\n- Error Propagation: How failures in one agent affect the entire system\nπŸ—οΈ Agent System Architecture\nAgent Types and Roles\nNoveum.ai can trace various agent patterns:\nTypeScript Multi-Agent Example\nπŸ“Š Tracing Multi-Agent Workflows\nCoordination Patterns\nSequential Agent Execution\nParallel Agent Execution\nHierarchical Agent Systems\nπŸ”— Inter-Agent Communication Tracing\nMessage Passing\nShared State Management\nπŸ“ˆ Multi-Agent Performance Analysis\nAgent Performance Metrics\nTrack key metrics for each agent:\nSystem-Wide Coordination Metrics\nπŸ”§ Best Practices for Multi-Agent Tracing\n1. Agent Identification\nAlways clearly identify agents in your traces:\n2. Communication Tracing\nTrace all inter-agent communications:\n3. Error Propagation Tracking\nMonitor how errors propagate through agent systems:\n4. 
Resource Coordination\nTrack shared resource usage:\n🎯 Advanced Multi-Agent Patterns\nSelf-Organizing Agent Systems\nAdaptive Agent Workflows\nπŸ“Š Monitoring and Alerts\nSet up monitoring for multi-agent systems:\nMulti-agent tracing with Noveum.ai provides the visibility needed to understand, optimize, and scale complex agent systems. By implementing comprehensive tracing across all agent interactions, communications, and coordination patterns, you can build more reliable and efficient multi-agent AI applications.\nπŸ”— Next Steps\n- RAG Pipeline Observability - Monitor retrieval and generation systems\n- Custom Instrumentation - Add domain-specific tracing\n- Performance Optimization - Optimize based on tracing insights\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2537, + "internal_links": [], + "scraped_at": 1759935908.445167 + }, + { + "url": "https://noveum.ai/en/docs/advanced/rag-observability", + "title": "RAG Pipeline Observability | Documentation | Noveum.ai", + "content": "RAG Pipeline Observability\nMonitor retrieval, generation, and context handling in RAG systems with comprehensive tracing\nRetrieval-Augmented Generation (RAG) systems combine the power of information retrieval with large language models to provide accurate, contextual responses. 
Monitoring these complex pipelines requires specialized observability to understand retrieval quality, context relevance, and generation effectiveness.\n🎯 Why RAG Observability Matters\nRAG systems introduce unique challenges that traditional monitoring can't address:\n- Retrieval Quality: Are you finding the most relevant documents?\n- Context Utilization: How effectively is retrieved context being used?\n- Generation Fidelity: Is the LLM accurately using the provided context?\n- Pipeline Performance: Where are the bottlenecks in your RAG pipeline?\n- Cost Optimization: Which components consume the most resources?\nπŸ—οΈ RAG Pipeline Architecture\nCore RAG Components\nNoveum.ai can trace each stage of your RAG pipeline:\nTypeScript RAG Implementation\nπŸ“Š Tracing Retrieval Components\nVector Database Operations\nDocument Ranking and Reranking\nContext Window Management\nπŸ€– Tracing LLM Generation\nContext-Aware Generation\nResponse Evaluation and Feedback\nπŸ“ˆ RAG Pipeline Performance Analysis\nEnd-to-End Pipeline Metrics\nCost and Resource Tracking\nπŸ”§ Best Practices for RAG Observability\n1. Comprehensive Pipeline Tracing\n2. Quality Monitoring\n3. A/B Testing for RAG Components\n🎯 Advanced RAG Patterns\nMulti-Modal RAG\nConversational RAG\nRAG observability with Noveum.ai provides the deep insights needed to build, optimize, and scale retrieval-augmented generation systems. By implementing comprehensive tracing across retrieval, context preparation, and generation stages, you can ensure your RAG pipeline delivers accurate, relevant, and cost-effective responses.\nπŸ”— Next Steps\n- Custom Instrumentation - Add domain-specific tracing\n- Multi-Agent Tracing - Observe agent workflows\n- Performance Optimization - Optimize based on tracing insights\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2253, + "internal_links": [], + "scraped_at": 1759935909.55895 + }, + { + "url": "https://noveum.ai/en/docs/advanced/custom-instrumentation", + "title": "Custom Instrumentation | Documentation | Noveum.ai", + "content": "Custom Instrumentation\nAdd custom spans and attributes for domain-specific observability and advanced tracing patterns\nWhile Noveum.ai's automatic instrumentation covers common AI operations, custom instrumentation allows you to add domain-specific observability, track business metrics, and create detailed traces for unique workflows. This guide covers advanced techniques for implementing custom tracing patterns.\n🎯 Why Custom Instrumentation?\nCustom instrumentation enables you to:\n- Track Business Metrics: Monitor domain-specific KPIs alongside technical metrics\n- Trace Complex Workflows: Create detailed observability for unique business logic\n- Add Context: Enrich traces with application-specific attributes\n- Monitor Custom Components: Instrument proprietary algorithms and processes\n- Optimize Performance: Track specific bottlenecks in your application\nπŸ› οΈ Custom Span Creation\nBasic Custom Spans\nTypeScript Custom Instrumentation\nπŸ“Š Custom Metrics and Attributes\nBusiness Metrics Integration\nPerformance Profiling Integration\nπŸ”§ Custom Context Propagation\nThread-Safe Context Management\nAsync Context Propagation\nπŸ“ˆ Advanced Custom Patterns\nEvent-Driven Instrumentation\nCustom Sampling Strategies\nπŸ”— Integration with External Systems\nDatabase Operation Tracing\nCustom instrumentation with Noveum.ai provides the flexibility to create detailed, domain-specific observability that goes beyond standard LLM and AI operation tracing. 
By implementing custom spans, attributes, context propagation, and advanced patterns, you can build comprehensive monitoring tailored to your specific application needs.\nπŸ”— Next Steps\n- Performance Optimization - Use tracing insights to optimize performance\n- Multi-Agent Tracing - Observe agent workflows\n- RAG Pipeline Observability - Monitor retrieval and generation systems\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2056, + "internal_links": [], + "scraped_at": 1759935910.8084621 + }, + { + "url": "https://noveum.ai/en/docs/advanced/performance-optimization", + "title": "Performance Optimization | Documentation | Noveum.ai", + "content": "Performance Optimization\nUse tracing data to identify bottlenecks and optimize AI application performance\nPerformance optimization for AI applications requires understanding the unique characteristics of LLM calls, vector operations, and complex workflows. 
Noveum.ai's tracing data provides detailed insights to identify bottlenecks, optimize resource usage, and improve overall system performance.\n🎯 Why AI Performance Optimization Matters\nAI applications have unique performance characteristics:\n- Token-Based Costs: LLM usage is measured in tokens, making efficiency crucial\n- Variable Latency: AI operations can have unpredictable response times\n- Context Dependencies: Performance varies with input size and complexity\n- Resource Intensive: Vector operations and embeddings require significant compute\n- Cascading Effects: Slow AI components impact entire application workflows\nπŸ“Š Performance Analysis with Tracing Data\nIdentifying Performance Bottlenecks\nOptimization Implementation Strategies\nπŸš€ Advanced Optimization Techniques\nModel Selection Optimization\nResource Usage Optimization\n🎯 Performance Optimization Best Practices\n1. Establish Performance Baselines\n2. Implement Gradual Optimization\n3. Monitor Optimization Impact\nPerformance optimization for AI applications requires a systematic approach combining detailed tracing insights, strategic implementation, and continuous monitoring. By leveraging Noveum.ai's comprehensive tracing data, you can identify bottlenecks, implement targeted optimizations, and achieve significant improvements in latency, cost, and resource efficiency.\nπŸ”— Next Steps\n- Multi-Agent Tracing - Observe agent workflows\n- RAG Pipeline Observability - Monitor retrieval and generation systems\n- Custom Instrumentation - Add domain-specific tracing\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2025, + "internal_links": [], + "scraped_at": 1759935912.062154 + }, + { + "url": "https://noveum.ai/docs/getting-started/overview", + "title": "Noveum.ai Overview | Documentation | Noveum.ai", + "content": "Noveum.ai Overview\nComprehensive AI tracing and observability platform for LLM applications, RAG systems, and AI agents\nWelcome to Noveum.aiβ€”the comprehensive tracing and observability platform built specifically for AI applications. Whether you're building LLM-powered chatbots, RAG systems, multi-agent workflows, or any AI-driven application, Noveum provides the insights you need to understand, debug, and optimize your systems.\n🎯 Why AI Applications Need Specialized Observability\nTraditional monitoring tools fall short when it comes to AI applications because they don't understand:\n- πŸ“Š AI-Specific Metrics: Token usage, model costs, prompt effectiveness\n- πŸ”€ Complex Workflows: Multi-step RAG pipelines, agent interactions, tool usage\n- 🧠 Context Flow: How data moves through embeddings, retrievals, and generations\n- πŸ’° Cost Attribution: Which operations drive your AI spending\n- 🎯 Quality Metrics: Beyond latency - understanding output quality and relevance\nNoveum.ai bridges this gap with purpose-built observability for the AI era.\nπŸš€ Core Platform Components\n1. 🐍 Python SDK (noveum-trace\n)\n- Decorator-based tracing for seamless integration\n- Automatic instrumentation for LangChain, LlamaIndex, and OpenAI\n- Async-aware context propagation\n- Production-ready with intelligent sampling and batching\n2. πŸ“˜ TypeScript SDK (@noveum/trace\n)\n- Framework integrations for Next.js, Express.js, Hono\n- TypeScript-first with full type safety\n- Universal compatibility (Node.js, Edge Runtime, browsers)\n- Zero-config automatic instrumentation\n3. 
πŸ“Š Noveum Platform\n- Real-time dashboard with AI-specific visualizations\n- Advanced search & filtering across traces and spans\n- Cost analysis and optimization recommendations\n- Team collaboration with shared insights and alerts\nπŸ” What Noveum Traces\nLLM Operations\n- Model calls across all providers (OpenAI, Anthropic, Google, etc.)\n- Token usage and cost calculation\n- Prompt engineering effectiveness\n- Response quality metrics\nRAG Pipelines\n- Document retrieval performance and relevance\n- Embedding generation costs and latency\n- Context assembly and prompt construction\n- Answer generation with source attribution\nMulti-Agent Systems\n- Agent interactions and communication patterns\n- Tool usage and external API calls\n- Decision trees and reasoning chains\n- Workflow orchestration across agents\nCustom Operations\n- Business logic specific to your domain\n- External integrations and API calls\n- Data processing pipelines\n- User interactions and session flows\n🎯 Key Benefits\nπŸ”§ Developer Experience\n- 5-minute setup with minimal code changes\n- Intelligent defaults that work out-of-the-box\n- Rich SDKs with comprehensive documentation\n- Local development support with optional cloud sync\nπŸ“Š Production Insights\n- Real-time monitoring of AI application health\n- Performance optimization with bottleneck identification\n- Cost management with detailed spend analysis\n- Quality assurance through automated alerting\nπŸ”’ Enterprise Ready\n- Security first with end-to-end encryption\n- Compliance support for regulated industries\n- Scalable architecture handling millions of traces\n- Data sovereignty with region-specific storage\nπŸ‘₯ Team Collaboration\n- Shared dashboards for cross-functional teams\n- Incident management with trace-based debugging\n- Performance baselines and regression detection\n- Knowledge sharing through trace annotations\nπŸ“ˆ Common Use Cases\nπŸ€– LLM Application Monitoring\nTrack every aspect of your LLM-powered 
application:\n- Monitor response quality and user satisfaction\n- Optimize prompt engineering for better results\n- Control costs across different models and providers\n- Debug edge cases and improve error handling\nπŸ” RAG System Optimization\nUnderstand and improve your RAG pipeline:\n- Measure retrieval accuracy and relevance\n- Optimize embedding models and vector search\n- Track context utilization and prompt effectiveness\n- Debug hallucinations and improve grounding\n🀝 Multi-Agent Coordination\nObserve complex agent interactions:\n- Visualize agent communication patterns\n- Track tool usage and external dependencies\n- Optimize workflow efficiency and resource usage\n- Debug coordination failures and deadlocks\nπŸš€ Performance Engineering\nOptimize your AI application performance:\n- Identify slow operations and bottlenecks\n- Right-size models for your workload\n- Implement intelligent caching strategies\n- Scale services based on actual usage patterns\n🎨 Platform Features\nπŸ” Trace Explorer\n- Hierarchical visualization of complex AI workflows\n- Timeline view showing operation sequences\n- Detailed span inspection with all attributes and events\n- Cross-trace correlation for distributed operations\nπŸ’° Cost Analytics\n- Real-time cost tracking across all AI providers\n- Cost attribution by user, feature, or operation\n- Budget alerts and spending forecasts\n- Optimization recommendations for cost reduction\nπŸ“Š Performance Dashboard\n- Latency percentiles and throughput metrics\n- Error rates and failure analysis\n- Model comparison across providers and versions\n- Custom metrics and business KPIs\n🚨 Alerting & Monitoring\n- Intelligent alerts based on AI-specific thresholds\n- Anomaly detection for unusual patterns\n- Escalation policies for critical issues\n- Integration with Slack, PagerDuty, and more\nπŸ› οΈ Integration Patterns\nIncremental Adoption\nStart small and expand coverage:\n- Single endpoint tracing for immediate value\n- Critical 
path instrumentation for core workflows\n- Full application coverage for comprehensive insights\n- Advanced features like custom metrics and alerts\nFramework Integration\nNative support for popular frameworks:\n- Next.js with App Router and API routes\n- Express.js and other Node.js frameworks\n- FastAPI and Flask for Python applications\n- Custom integrations for any framework\nCI/CD Integration\nEmbed observability in your development process:\n- Performance regression detection in CI\n- Trace-based testing for quality assurance\n- Deployment monitoring with rollback triggers\n- Feature flag integration for safe releases\n🌟 Getting Started\nReady to transform your AI application observability? Here's your path:\n- Quick Start - Integrate your first SDK in 5 minutes\n- Tracing Concepts - Learn the fundamentals\n- Framework Guides - Deep dive into your stack\n- Advanced Features - Unlock the full platform potential\nBuilt by developers, for developers. Noveum.ai understands that AI applications are different, and we've designed our platform from the ground up to meet their unique observability needs.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 6733, + "internal_links": [], + "scraped_at": 1759935913.302529 + }, + { + "url": "https://noveum.ai/docs/getting-started/sdk-integration", + "title": "SDK Integration Guide | Documentation | Noveum.ai", + "content": "SDK Integration Guide\nIntegrate Noveum.ai tracing into your AI applications with Python or TypeScript SDKs\nThe Noveum.ai SDKs provide comprehensive tracing and observability for your AI applications with minimal code changes. Whether you're building LLM applications, RAG systems, or multi-agent workflows, our SDKs automatically capture essential metrics and traces.\nπŸš€ Quick Start\n1. 
Create Your Account & Get API Key\n- Sign up at noveum.ai\n- Create a project in your dashboard\n- Generate an API key from the integration page\n- Choose your SDK based on your application language\n2. Install the SDK\nRequirements: Python 3.8+\n3. Initialize the Client\nEnvironment Variables:\n🎯 Basic Usage\nTrace LLM Calls\nAlternative - Context Manager:\nTrace RAG Pipelines\nπŸ”§ Framework Integrations\nNext.js Integration\nExpress.js Integration\nFastAPI Integration (Python)\nπŸ“Š Advanced Features\nCustom Attributes & Events\nSampling Configuration\nπŸ”— What's Captured Automatically\n- πŸ“Š Performance Metrics: Latency, throughput, error rates\n- πŸ’° Cost Tracking: Token usage, API costs across providers\n- πŸ” Request/Response: Configurable capture of inputs/outputs\n- 🏷️ Metadata: Model names, parameters, user context\n- 🌊 Context Flow: Trace relationships across services\n- πŸ› Error Details: Stack traces, error classification\nπŸ“ˆ View Your Data\nOnce integrated, visit your Noveum Dashboard to:\n- πŸ” Search & Filter traces by any attribute\n- πŸ“Š Analyze Performance trends and bottlenecks\n- πŸ’° Monitor Costs across different models and providers\n- πŸ› Debug Issues with detailed trace timelines\n- πŸ‘₯ Collaborate with your team on insights\nπŸ”’ Security & Privacy\n- πŸ” Encryption: All data encrypted in transit and at rest\n- πŸŽ›οΈ Configurable Capture: Control what data is collected\n- 🏠 Data Residency: Choose your data storage region\n- ⏰ Retention Control: Set custom data retention policies\nNext Steps\n- Tracing Concepts - Learn about traces, spans, and observability best practices\n- Framework Integrations - Deep dive into specific framework setups\n- Multi-Agent Tracing - Observe complex agent workflows\n- Dashboard Guide - Master the Noveum platform interface\nExclusive Early Access\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2378, + "internal_links": [], + "scraped_at": 1759935914.586812 + }, + { + "url": "https://noveum.ai/docs/getting-started/tracing-concepts", + "title": "Tracing Concepts for AI Applications | Documentation | Noveum.ai", + "content": "Tracing Concepts for AI Applications\nUnderstanding traces, spans, and observability fundamentals for LLM applications, RAG systems, and AI agents\nUnderstanding the fundamentals of tracing is essential for getting the most out of Noveum.ai. This guide explains key concepts specifically in the context of AI applications, helping you design effective observability strategies for your LLM applications, RAG systems, and AI agents.\n🎯 What is Tracing?\nTracing is the practice of tracking requests as they flow through your system, creating a detailed map of what happened, when, and how long each operation took. For AI applications, tracing provides crucial insights into:\n- πŸ” Request Flow: How user queries move through your AI pipeline\n- ⏱️ Performance: Where time is spent in your AI operations\n- πŸ’° Costs: Which operations drive your AI spending\n- πŸ› Debugging: What went wrong when errors occur\n- πŸ“Š Quality: How well your AI system is performing\n🌟 Core Concepts\n1. Traces\nA trace represents a single journey through your systemβ€”like a user asking a question and getting an answer. Think of it as the complete story of one request.\nTrace Characteristics:\n- πŸ†” Unique ID: Every trace has a unique identifier\n- ⏰ Timeline: Start and end timestamps\n- 🌐 Distributed: Can span multiple services\n- πŸ“Š Hierarchical: Contains multiple related spans\n2. Spans\nA span represents a single operation within a trace. 
Each span has a clear start and end time and represents work being done.\nSpan Characteristics:\n- πŸ“› Name: Descriptive name of the operation\n- ⏱️ Duration: How long the operation took\n- πŸ‘₯ Parent-Child: Spans can contain other spans\n- 🏷️ Attributes: Key-value metadata about the operation\n- πŸ“ Events: Point-in-time occurrences during the span\n3. Attributes\nAttributes are key-value pairs that provide context about what happened during a span. They're crucial for understanding and filtering your traces.\nCommon AI Attribute Categories:\n- πŸ€– LLM Attributes:\nllm.model\n,llm.provider\n,llm.temperature\n- πŸ’° Cost Attributes:\nllm.tokens.input\n,llm.tokens.output\n,llm.cost\n- πŸ‘€ User Attributes:\nuser.id\n,user.plan\n,user.location\n- πŸ“„ Content Attributes:\nprompt.length\n,response.length\n,content.type\n- πŸ” Quality Attributes:\nrelevance.score\n,confidence.level\n,accuracy.rating\n4. Events\nEvents represent things that happened at a specific point in time during a span. They're perfect for capturing important moments or milestones.\n🧠 AI-Specific Tracing Patterns\nRAG Pipeline Tracing\nRAG (Retrieval-Augmented Generation) systems have distinct phases that should be traced separately:\nMulti-Agent Tracing\nWhen dealing with multiple AI agents, trace their interactions and coordination:\nπŸ“Š Observability Best Practices\n1. Meaningful Span Names\nUse descriptive, consistent naming conventions:\n2. Rich Attributes\nInclude context that helps with debugging and analysis:\n3. Error Handling\nAlways capture error details:\n4. Performance Context\nInclude performance-relevant attributes:\nπŸ” Using Traces for Debugging\nCommon Debugging Scenarios\n1. Slow Response Times\n2. High Costs\n3. Quality Issues\n4. 
Error Patterns\n🎯 Next Steps\nNow that you understand tracing concepts, you're ready to:\n- Implement SDK Integration - Add tracing to your application\n- Explore Framework Integrations - Framework-specific guidance\n- Learn Advanced Patterns - Custom instrumentation techniques\n- Master the Dashboard - Analyze your traces effectively\nRemember: Good observability is not about collecting all possible data, but about collecting the right data that helps you understand, debug, and optimize your AI applications.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 3822, + "internal_links": [], + "scraped_at": 1759935915.863421 + }, + { + "url": "https://noveum.ai/docs/getting-started/framework-integrations", + "title": "Framework Integrations | Documentation | Noveum.ai", + "content": "Framework Integrations\nDeep-dive integration guides for Next.js, Express.js, FastAPI, Flask and other popular frameworks\nNoveum.ai provides native integrations for popular web frameworks, making it easy to add comprehensive tracing to your AI applications. This guide covers framework-specific setup, best practices, and advanced patterns.\nπŸš€ Quick Framework Overview\n| Framework | Language | Integration Type | Difficulty |\n|---|---|---|---|\n| Next.js | TypeScript | Middleware + Wrappers | ⭐ Easy |\n| Express.js | TypeScript | Middleware | ⭐ Easy |\n| Hono | TypeScript | Middleware + Decorators | ⭐ Easy |\n| FastAPI | Python | Middleware + Decorators | ⭐⭐ Moderate |\n| Flask | Python | Extensions + Decorators | ⭐⭐ Moderate |\n| Django | Python | Middleware + Decorators | ⭐⭐⭐ Advanced |\nπŸ“˜ TypeScript Frameworks\nNext.js Integration\nNext.js is one of the most popular frameworks for AI applications. 
Noveum provides seamless integration for both App Router and Pages Router.\nApp Router Setup\n1. Initialize Noveum (Root Layout)\n2. API Route Tracing\n3. Advanced API Route with Custom Tracing\nServer Actions Tracing\nExpress.js Integration\nExpress.js integration provides automatic tracing for all routes and middleware.\n1. Setup Middleware\n2. Manual Route Tracing\n3. Middleware with Custom Logic\nHono Integration\nHono is a lightweight framework perfect for edge computing and AI applications.\n🐍 Python Frameworks\nFastAPI Integration\nFastAPI is excellent for building high-performance AI APIs with automatic documentation.\n1. Setup with Middleware\n2. Traced Endpoints\n3. RAG Endpoint with Detailed Tracing\nFlask Integration\nFlask integration provides flexibility for existing applications.\n1. Setup with Extensions\n2. Traced Routes\n3. Background Task Tracing\nπŸ”§ Advanced Patterns\nEnvironment-Specific Configuration\nCustom Middleware\nError Boundary Integration\n🎯 Next Steps\nChoose your framework and dive deeper:\n- Implement Basic Integration - Start with the basics\n- Learn Tracing Concepts - Understand the fundamentals\n- Explore Advanced Patterns - Custom instrumentation\n- Master the Dashboard - Analyze your traces\nFramework not listed? Check our Custom Integration Guide or contact our team for specific framework support.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2462, + "internal_links": [], + "scraped_at": 1759935917.23431 + }, + { + "url": "https://noveum.ai/docs/advanced/multi-agent-tracing", + "title": "Multi-Agent Tracing | Documentation | Noveum.ai", + "content": "Multi-Agent Tracing\nObserve complex agent workflows and inter-agent communications with comprehensive tracing\nMulti-agent systems represent some of the most complex AI applications, involving multiple agents that coordinate, communicate, and collaborate to achieve shared goals. Noveum.ai provides specialized tracing capabilities to help you understand and optimize these intricate workflows.\n🎯 Why Multi-Agent Tracing Matters\nMulti-agent systems introduce unique observability challenges:\n- Complex Dependencies: Agents depend on each other's outputs and decisions\n- Asynchronous Operations: Agents may operate concurrently or in parallel\n- Communication Patterns: Understanding how agents share information\n- Resource Coordination: Managing shared resources and preventing conflicts\n- Error Propagation: How failures in one agent affect the entire system\nπŸ—οΈ Agent System Architecture\nAgent Types and Roles\nNoveum.ai can trace various agent patterns:\nTypeScript Multi-Agent Example\nπŸ“Š Tracing Multi-Agent Workflows\nCoordination Patterns\nSequential Agent Execution\nParallel Agent Execution\nHierarchical Agent Systems\nπŸ”— Inter-Agent Communication Tracing\nMessage Passing\nShared State Management\nπŸ“ˆ Multi-Agent Performance Analysis\nAgent Performance Metrics\nTrack key metrics for each agent:\nSystem-Wide Coordination Metrics\nπŸ”§ Best Practices for Multi-Agent Tracing\n1. Agent Identification\nAlways clearly identify agents in your traces:\n2. Communication Tracing\nTrace all inter-agent communications:\n3. Error Propagation Tracking\nMonitor how errors propagate through agent systems:\n4. 
Resource Coordination\nTrack shared resource usage:\n🎯 Advanced Multi-Agent Patterns\nSelf-Organizing Agent Systems\nAdaptive Agent Workflows\nπŸ“Š Monitoring and Alerts\nSet up monitoring for multi-agent systems:\nMulti-agent tracing with Noveum.ai provides the visibility needed to understand, optimize, and scale complex agent systems. By implementing comprehensive tracing across all agent interactions, communications, and coordination patterns, you can build more reliable and efficient multi-agent AI applications.\nπŸ”— Next Steps\n- RAG Pipeline Observability - Monitor retrieval and generation systems\n- Custom Instrumentation - Add domain-specific tracing\n- Performance Optimization - Optimize based on tracing insights\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2537, + "internal_links": [], + "scraped_at": 1759935918.6890082 + }, + { + "url": "https://noveum.ai/docs/advanced/rag-observability", + "title": "RAG Pipeline Observability | Documentation | Noveum.ai", + "content": "RAG Pipeline Observability\nMonitor retrieval, generation, and context handling in RAG systems with comprehensive tracing\nRetrieval-Augmented Generation (RAG) systems combine the power of information retrieval with large language models to provide accurate, contextual responses. 
Monitoring these complex pipelines requires specialized observability to understand retrieval quality, context relevance, and generation effectiveness.\n🎯 Why RAG Observability Matters\nRAG systems introduce unique challenges that traditional monitoring can't address:\n- Retrieval Quality: Are you finding the most relevant documents?\n- Context Utilization: How effectively is retrieved context being used?\n- Generation Fidelity: Is the LLM accurately using the provided context?\n- Pipeline Performance: Where are the bottlenecks in your RAG pipeline?\n- Cost Optimization: Which components consume the most resources?\nπŸ—οΈ RAG Pipeline Architecture\nCore RAG Components\nNoveum.ai can trace each stage of your RAG pipeline:\nTypeScript RAG Implementation\nπŸ“Š Tracing Retrieval Components\nVector Database Operations\nDocument Ranking and Reranking\nContext Window Management\nπŸ€– Tracing LLM Generation\nContext-Aware Generation\nResponse Evaluation and Feedback\nπŸ“ˆ RAG Pipeline Performance Analysis\nEnd-to-End Pipeline Metrics\nCost and Resource Tracking\nπŸ”§ Best Practices for RAG Observability\n1. Comprehensive Pipeline Tracing\n2. Quality Monitoring\n3. A/B Testing for RAG Components\n🎯 Advanced RAG Patterns\nMulti-Modal RAG\nConversational RAG\nRAG observability with Noveum.ai provides the deep insights needed to build, optimize, and scale retrieval-augmented generation systems. By implementing comprehensive tracing across retrieval, context preparation, and generation stages, you can ensure your RAG pipeline delivers accurate, relevant, and cost-effective responses.\nπŸ”— Next Steps\n- Custom Instrumentation - Add domain-specific tracing\n- Multi-Agent Tracing - Observe agent workflows\n- Performance Optimization - Optimize based on tracing insights\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2253, + "internal_links": [], + "scraped_at": 1759935920.029203 + }, + { + "url": "https://noveum.ai/docs/advanced/custom-instrumentation", + "title": "Custom Instrumentation | Documentation | Noveum.ai", + "content": "Custom Instrumentation\nAdd custom spans and attributes for domain-specific observability and advanced tracing patterns\nWhile Noveum.ai's automatic instrumentation covers common AI operations, custom instrumentation allows you to add domain-specific observability, track business metrics, and create detailed traces for unique workflows. This guide covers advanced techniques for implementing custom tracing patterns.\n🎯 Why Custom Instrumentation?\nCustom instrumentation enables you to:\n- Track Business Metrics: Monitor domain-specific KPIs alongside technical metrics\n- Trace Complex Workflows: Create detailed observability for unique business logic\n- Add Context: Enrich traces with application-specific attributes\n- Monitor Custom Components: Instrument proprietary algorithms and processes\n- Optimize Performance: Track specific bottlenecks in your application\nπŸ› οΈ Custom Span Creation\nBasic Custom Spans\nTypeScript Custom Instrumentation\nπŸ“Š Custom Metrics and Attributes\nBusiness Metrics Integration\nPerformance Profiling Integration\nπŸ”§ Custom Context Propagation\nThread-Safe Context Management\nAsync Context Propagation\nπŸ“ˆ Advanced Custom Patterns\nEvent-Driven Instrumentation\nCustom Sampling Strategies\nπŸ”— Integration with External Systems\nDatabase Operation Tracing\nCustom instrumentation with Noveum.ai provides the flexibility to create detailed, domain-specific observability that goes beyond standard LLM and AI operation tracing. 
By implementing custom spans, attributes, context propagation, and advanced patterns, you can build comprehensive monitoring tailored to your specific application needs.\nπŸ”— Next Steps\n- Performance Optimization - Use tracing insights to optimize performance\n- Multi-Agent Tracing - Observe agent workflows\n- RAG Pipeline Observability - Monitor retrieval and generation systems\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2056, + "internal_links": [], + "scraped_at": 1759935921.483979 + }, + { + "url": "https://noveum.ai/docs/advanced/performance-optimization", + "title": "Performance Optimization | Documentation | Noveum.ai", + "content": "Performance Optimization\nUse tracing data to identify bottlenecks and optimize AI application performance\nPerformance optimization for AI applications requires understanding the unique characteristics of LLM calls, vector operations, and complex workflows. 
Noveum.ai's tracing data provides detailed insights to identify bottlenecks, optimize resource usage, and improve overall system performance.\n🎯 Why AI Performance Optimization Matters\nAI applications have unique performance characteristics:\n- Token-Based Costs: LLM usage is measured in tokens, making efficiency crucial\n- Variable Latency: AI operations can have unpredictable response times\n- Context Dependencies: Performance varies with input size and complexity\n- Resource Intensive: Vector operations and embeddings require significant compute\n- Cascading Effects: Slow AI components impact entire application workflows\nπŸ“Š Performance Analysis with Tracing Data\nIdentifying Performance Bottlenecks\nOptimization Implementation Strategies\nπŸš€ Advanced Optimization Techniques\nModel Selection Optimization\nResource Usage Optimization\n🎯 Performance Optimization Best Practices\n1. Establish Performance Baselines\n2. Implement Gradual Optimization\n3. Monitor Optimization Impact\nPerformance optimization for AI applications requires a systematic approach combining detailed tracing insights, strategic implementation, and continuous monitoring. By leveraging Noveum.ai's comprehensive tracing data, you can identify bottlenecks, implement targeted optimizations, and achieve significant improvements in latency, cost, and resource efficiency.\nπŸ”— Next Steps\n- Multi-Agent Tracing - Observe agent workflows\n- RAG Pipeline Observability - Monitor retrieval and generation systems\n- Custom Instrumentation - Add domain-specific tracing\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. 
All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 2025, + "internal_links": [], + "scraped_at": 1759935922.847022 + }, + { + "url": "https://noveum.ai/docs/platform/dashboard", + "title": "Dashboard Overview | Documentation | Noveum.ai", + "content": "Dashboard Overview\nNavigate the Noveum platform and understand key metrics for your AI applications\nDashboard Overview\nThe Noveum.ai dashboard provides comprehensive visibility into your AI application's performance, giving you real-time insights into traces, costs, and system health. Built specifically for AI workloads, it offers both high-level analytics and detailed trace inspection capabilities.\n🎯 Key Dashboard Features\nReal-Time Traces Monitoring\n- Live Trace Stream: Monitor LLM calls, RAG operations, and agent activities in real-time\n- Advanced Filtering: Filter by project, environment, status, date ranges, and custom attributes\n- Search Functionality: Quickly find specific traces using full-text search across all trace data\n- Status Indicators: Visual status badges for success, error, and pending operations\nPerformance Analytics\n- Latency Metrics: Track response times across different operations and time periods\n- Cost Analysis: Monitor spending across different LLM providers and operations\n- Throughput Monitoring: Observe request volumes and system capacity\n- Error Rate Tracking: Identify and monitor failure patterns\nInteractive Trace Inspection\n- Detailed Trace View: Expand any trace to see complete request/response data\n- Span Hierarchy: Navigate complex multi-step operations with visual span trees\n- Timing Analysis: Understand where time is spent in your AI operations\n- Context Preservation: See how data flows through embeddings, retrievals, and generations\nπŸ“Š Dashboard Components\nTraces List Interface\nThe main traces interface offers two viewing modes:\nClassic Interface\n- Tabular view 
of all traces with sortable columns\n- Quick filtering and search capabilities\n- Expandable detail panels for trace inspection\nThree-Pane Interface\n- Directory tree navigation for complex trace hierarchies\n- Split-pane view for simultaneous trace browsing and detail inspection\n- Advanced filtering with visual feedback\nFilter Controls\n- Environment Filter: Switch between development, staging, and production\n- Project Filter: Focus on specific applications or services\n- Status Filter: View only successful, failed, or pending operations\n- Date Range: Analyze performance over custom time periods\n- Clear Filters: Quick reset to view all traces\nConnection Status\n- Real-time Status: Monitor connection health to your trace storage\n- Error Reporting: Clear error messages when connectivity issues occur\n- Refresh Controls: Manual refresh capability for troubleshooting\nπŸ” Trace Detail Analysis\nComprehensive Trace Information\nEach trace provides detailed insights including:\n- Basic Metadata: Timestamp, duration, status, project, and environment\n- Request Context: User ID, session ID, and custom attributes\n- Response Data: Complete LLM responses, tool outputs, and generated content\n- Performance Metrics: Token usage, costs, and timing breakdowns\n- Error Details: Stack traces and error context when operations fail\nSpan Analysis\n- Operation Types: Automatic categorization of LLM calls, vector searches, tool usage\n- Attribute Inspection: View all custom attributes and metadata\n- Timing Visualization: Understand operation sequencing and bottlenecks\n- Parent-Child Relationships: Navigate complex workflow hierarchies\nFlow Visualization\n- Interactive Flow Charts: Visual representation of operation sequences\n- Dependency Mapping: See how different components interact\n- Error Path Analysis: Trace failure points through your system\n🎨 Interface Customization\nLayout Options\n- Responsive Design: Optimized for desktop and mobile viewing\n- Panel Sizing: 
Adjustable interface panels for different screen sizes\n- Dark/Light Themes: Switch between themes for comfortable viewing\nData Display\n- Sortable Columns: Sort traces by any metric (time, duration, cost, status)\n- Configurable Views: Customize which trace attributes are displayed\n- Export Capabilities: Download trace data for external analysis\nπŸ“ˆ Getting Started with the Dashboard\nInitial Setup\n- Connect Your Applications: Ensure your AI applications are instrumented with Noveum SDKs\n- Verify Data Flow: Check the connection status indicator for successful trace ingestion\n- Explore Filters: Use environment and project filters to focus on relevant data\nBest Practices\n- Set Up Projects: Organize your applications into logical projects for better filtering\n- Use Environments: Separate development, staging, and production traces\n- Monitor Regularly: Check dashboard daily for performance trends and issues\n- Deep Dive on Errors: Use detailed trace inspection to troubleshoot failures\nPerformance Tips\n- Filter Early: Use filters to reduce data volume for faster loading\n- Time Range Selection: Limit date ranges for better performance with large datasets\n- Regular Refresh: Enable auto-refresh for monitoring live systems\nπŸ”— Integration with Other Platform Features\nThe dashboard seamlessly integrates with other Noveum platform capabilities:\n- Projects: Filter and organize traces by project structure\n- Team Collaboration: Share trace URLs with team members for collaborative debugging\n- API Access: Export trace data programmatically using the Noveum API\n- Alert Systems: Set up notifications based on dashboard metrics\nπŸ’‘ Advanced Features\nCustom Attributes\n- Search by Attributes: Find traces using custom metadata you've added\n- Attribute Filtering: Create complex filters using custom attributes\n- Attribute Visualization: See custom data alongside standard metrics\nBulk Operations\n- Multi-Select: Select multiple traces for batch operations\n- 
Bulk Export: Download multiple traces simultaneously\n- Comparative Analysis: Compare performance across multiple traces\nReal-Time Updates\n- Live Refresh: Automatic updates as new traces arrive\n- Connection Monitoring: Real-time status of your trace ingestion pipeline\n- Performance Indicators: Live metrics for system health monitoring\nReady to dive deeper? Explore Projects & Environments to organize your AI applications, or check out Team Collaboration to share insights with your team.\nGet Early Access to Noveum.ai Platform\nBe the first one to get notified when we open Noveum Platform to more users. All users get access to Observability suite for free, early users get free eval jobs and premium support for the first year.", + "content_length": 6265, + "internal_links": [], + "scraped_at": 1759935923.99634 + }, + { + "url": "https://noveum.ai/en/careers/apply/senior-ai-engineer", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "EngineeringFull-time\nPython AI/ML Engineer\nRemote\nFull-time\nβ‚Ή12-36 LPA\n100% Remote\nJob Description\nJoin Noveum's core AI team that monitors, evaluates, and improves AI agents in production. You will design rigorous eval pipelines, build and debug agentic workflows, deploy and tune models, and close the loop with observability to drive reliability, quality, and cost/performance. 
This is a high-ownership role working directly with founders and customers to ship end-to-end fixes and new agent capabilities.\nKey Responsibilities\n- Design and implement production-grade AI agent architectures and tools\n- Build rigorous evaluation pipelines; define metrics, datasets, and pass/fail thresholds\n- Instrument, monitor, and debug agents using tracing/observability to improve reliability\n- Deploy, fine-tune, and optimize models (latency, cost, and accuracy)\n- Collaborate with founders and customers to scope, build, and ship new agent capabilities\n- Mentor and raise the bar on engineering quality and operational excellence\nRequirements\n- 5+ years in AI/ML engineering with hands-on model development and deployment\n- Expertise in Python with PyTorch and/or TensorFlow\n- Production experience with LLMs/GenAI (OpenAI, Anthropic, etc.)\n- Proven experience building agentic systems or complex ML pipelines\n- Strong MLOps foundations: packaging, CI/CD, containers, cloud\n- Bias for ownership: able to self-unblock, deliver end-to-end, and operate independently\nPreferred Qualifications\n- Hands-on experience designing/running evals for LLMs/agents\n- Experience building new agents and tools for real customer workflows\n- Open-source contributions or public work in AI/ML\n- Next.js familiarity is a plus but not required", + "content_length": 1715, + "internal_links": [], + "scraped_at": 1759935928.3825111 + }, + { + "url": "https://noveum.ai/en/careers/apply/fullstack-developer-ai", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "EngineeringFull-time\nFullStack Next.js Engineer\nRemote\nFull-time\nβ‚Ή12-36 LPA\n100% Remote\nJob Description\nBuild end-to-end product features using Next.js App Router, TypeScript, Tailwind CSS, and Node.js.\nKey Responsibilities\n- Own features across UI and backend with Next.js App Router\n- Implement accessible, responsive UI using Tailwind CSS and shadcn/ui\n- Design and integrate 
Node.js APIs with Prisma/Postgres\n- Collaborate with design/product to ship high-quality experiences\n- Write tests and ensure performance, security, and reliability\nRequirements\n- 3-6+ years building web apps with React/Next.js\n- Strong TypeScript and modern React patterns\n- Next.js (App Router), server components, SSR/ISR\n- Tailwind CSS, shadcn/ui, Radix UI\n- Node.js APIs, Prisma, Postgres\n- Git, CI, and basic Docker knowledge\nPreferred Qualifications\n- Experience with AI/LLM integrations\n- Performance optimization and accessibility mindset\n- Experience in monorepos and pnpm", + "content_length": 961, + "internal_links": [], + "scraped_at": 1759935929.383203 + }, + { + "url": "https://noveum.ai/", + "title": "AI Observability, LLM Evals & Agent Monitoring | Noveum.ai", + "content": "Monitor all your AI Agents\nimprove AI Agents today\nNoveum.ai helps you monitor, trace, and optimize your AI applications.\nNoveum.ai works with any AI framework – LangChain, CrewAI, AutoGen, custom implementations, or direct LLM calls. One dashboard shows everything.\nMonitor, Evaluate, Improve Your AI Agents\nThe control plane for AI agents.\nMonitor Everything, Miss Nothing\nOur lightweight SDKs capture every trace and span across your AI agent ecosystemβ€”from simple LLM calls to complex multi-agent workflows. Get complete visibility without performance overhead.\nStart MonitoringEvaluate with 30+ Advanced Metrics\nNovaEval automatically scores every agent interaction using our comprehensive evaluation framework. Track accuracy, semantic similarity, safety, bias, and custom business metrics in real-time.\nView EvaluationsImprove Automatically with NovaPilot\nOur AI engineer analyzes performance data and automatically generates fixes for failing agents. 
Get detailed reports on model changes, prompt optimizations, and tool improvementsβ€”all without human intervention.\nTry Auto-ImprovementEnterprise Ready\nNoveum.ai is built for enterprise-scale AI applications, with support for multi-tenant, multi-region deployments and advanced security features.\nContact Saleswith the world's favorite AI Observability Platform\nEverything You Need to Master AI Agent Operations\nNoveum.ai helps you monitor, trace, and optimize your AI applications with comprehensive observability tools designed for modern LLM workflows.\nSee Every Agent, Every Interaction, Every Decision\n30+ Metrics That Actually Matter for Business\nYour AI Engineer That Never Sleeps\n100% visibility on every AI agent\nReduce AI Incidents by 85%\nGet comprehensive AI monitoring with automated incident prevention, faster debugging, and built-in compliance tools.\nInstead of spending days investigating AI agent failures, your team gets instant insights into what went wrong and how to fix it. Detailed traces and automated analysis eliminate guesswork.\n0+\nAI FrameworksWith the world's favorite AI observability platform\nEasy integration with your AI stack\nNoveum.ai integrates seamlessly with all popular AI frameworks and providers, giving you comprehensive observability across your entire AI pipeline.\nWorks great with: LangChain, OpenAI, Anthropic, AWS Bedrock, Azure OpenAI, Google Cloud (Vertex AI), CrewAI, LangGraph, LlamaIndex, AutoGen, custom SDKs, and more\nwith the world's favorite AI observability platform\nTrusted AI monitoring tools by thousands of developers\n0+\nAI Eval Metrics0.0%\nuptime SLA0M+\ntraces processed", + "content_length": 2591, + "internal_links": [], + "scraped_at": 1759935930.515405 + }, + { + "url": "https://noveum.ai/auth/forgot-password", + "title": "Forgot your password? 
def convert_tool_output_to_string(span: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten a list-valued ``tool.output.output`` attribute into one string.

    When ``span['attributes']['tool.output.output']`` is a non-empty list, every
    item is rendered with ``str()`` (dict items therefore take Python's repr
    form, e.g. ``{'url': '...', 'content': '...'}``) and the pieces are joined
    with single spaces. Any other value (missing, empty list, already a string)
    is left untouched.

    Args:
        span: Span record; it is modified in place.

    Returns:
        The same ``span`` object that was passed in.
    """
    attributes = span.get('attributes')
    if attributes is None:
        # A missing or explicit-None attributes entry has nothing to convert.
        attributes = {}

    tool_output = attributes.get('tool.output.output')

    if isinstance(tool_output, list) and tool_output:
        # str() already yields the single-quoted repr for dicts, so no extra
        # quote rewriting is needed; items are separated by one space.
        attributes['tool.output.output'] = ' '.join(str(item) for item in tool_output)
        span['attributes'] = attributes

    return span
def main():
    """
    Command-line entry point: filter the dataset file named on the command line.

    Expects exactly one argument, the path to the input JSON file. Exits with
    status 1 when the argument count is wrong, when the path does not exist, or
    when ``filter_dataset`` raises; otherwise prints the created file's path.
    """
    if len(sys.argv) != 2:
        # Name the required argument so the usage line is actionable.
        print("Usage: python preprocess_filter.py <input_file>", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]

    if not os.path.exists(input_file):
        print(f"Error: File {input_file} not found", file=sys.stderr)
        sys.exit(1)

    try:
        output_file = filter_dataset(input_file)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"\nSuccess! Created {output_file}")
def add_agent_query_generation_fields(span: Dict[str, Any]) -> Dict[str, Any]:
    """
    Add agent_task and agent_response fields for agent.query_generation spans.

    agent_task -> newline-joined lines built from the span attributes:
                  "api title - (api.title)", "api source - (api.source)" and
                  "api url - (api.url)" (each only when the value is truthy),
                  always followed by "api description - (api.description)",
                  which reads 'not available' when the key is absent.
    agent_response -> query_generation.queries joined with newlines when it is
                      a non-empty list; the JSON dump of the span's events when
                      the list is empty or the key is absent; otherwise
                      ``str()`` of the value, or '' when the value is falsy.

    Args:
        span: Span record; it is modified in place.

    Returns:
        The same ``span`` object with both fields set in its attributes.
    """
    attributes = span.get('attributes')
    if attributes is None:
        # Tolerate a missing or explicit-None attributes entry.
        attributes = {}

    # Build agent_task: optional labelled lines first, description always last.
    labelled_fields = (
        ('api title', attributes.get('api.title', '')),
        ('api source', attributes.get('api.source', '')),
        ('api url', attributes.get('api.url', '')),
    )
    agent_task_parts = [f"{label} - {value}" for label, value in labelled_fields if value]
    api_description = attributes.get('api.description', 'not available')
    agent_task_parts.append(f"api description - {api_description}")
    attributes['agent_task'] = '\n'.join(agent_task_parts)

    # Build agent_response from query_generation.queries.
    queries = attributes.get('query_generation.queries', [])
    if isinstance(queries, list):
        if queries:
            # str() each item so a stray non-string query cannot break the join.
            agent_response = '\n'.join(str(query) for query in queries)
        else:
            # No queries recorded: fall back to the stringified events.
            agent_response = json.dumps(span.get('events', []))
    else:
        agent_response = str(queries) if queries else ''

    attributes['agent_response'] = agent_response

    span['attributes'] = attributes
    return span
def add_post_validation_fields(span: Dict[str, Any]) -> Dict[str, Any]:
    """
    Attach a tool_response field to a post_validation span.

    tool_response -> JSON text of the span's events list, or the literal '[]'
    when the span has no (or falsy) events.

    The span is updated in place and also returned.
    """
    recorded_events = span.get('events', [])
    attrs = span.get('attributes', {})

    # Serialise the events; an absent or empty list collapses to '[]'.
    attrs['tool_response'] = json.dumps(recorded_events) if recorded_events else '[]'

    span['attributes'] = attrs
    return span
def main():
    """
    Command-line entry point: map the dataset file named on the command line.

    Expects exactly one argument, the path to the input JSON file. Exits with
    status 1 when the argument count is wrong, when the path does not exist, or
    when ``map_dataset`` raises; otherwise prints the created file's path.
    """
    if len(sys.argv) != 2:
        # Name the required argument so the usage line is actionable.
        print("Usage: python preprocess_map.py <input_file>", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]

    if not os.path.exists(input_file):
        print(f"Error: File {input_file} not found", file=sys.stderr)
        sys.exit(1)

    try:
        output_file = map_dataset(input_file)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"\nSuccess! Created {output_file}")
def sanitize_filename(name):
    """Build a filesystem-safe dataset filename from a span name.

    Each of the characters ``: / \\ * ? " < > |`` is swapped for an underscore
    and the suffix ``_dataset.json`` is appended.
    """
    unsafe_chars = ':/\\*?"<>|'
    # One translation pass maps every unsafe character to '_'.
    table = str.maketrans(unsafe_chars, '_' * len(unsafe_chars))
    cleaned = name.translate(table)
    return f"{cleaned}_dataset.json"
+ + Args: + input_file (str): Path to input JSON file + output_dir (str): Path to output directory + """ + # Create output directory if it doesn't exist + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Load the dataset + print(f"Loading dataset from {input_file}...") + with open(input_file, 'r') as f: + data = json.load(f) + + print(f"Loaded {len(data)} objects") + + # Group data by name + grouped_data = defaultdict(list) + for obj in data: + name = obj.get('name', 'unknown') + grouped_data[name].append(obj) + + print(f"Found {len(grouped_data)} unique span names") + + # Create name to filename mapping + name_mapping = create_name_to_filename_mapping() + + # Write separate files for each name + for name, objects in grouped_data.items(): + # Determine filename + if name in name_mapping: + filename = name_mapping[name] + print(f"Using hardcoded mapping: {name} -> {filename}") + else: + filename = sanitize_filename(name) + print(f"Using sanitized name: {name} -> {filename}") + + output_path = os.path.join(output_dir, filename) + + # Write the file + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(objects, f, indent=2, ensure_ascii=False) + + print(f" Wrote {len(objects)} objects to {output_path}") + + print(f"\nSplit complete! 
def main():
    """Resolve the command-line arguments and run the dataset split.

    With ``-h``, ``--help`` or ``help`` as the first argument the module
    docstring is printed and the process exits with status 0. Otherwise the
    first argument (default ``dataset_filtered_mapped.json``) is the input file
    and the second (default ``split_datasets``) the output directory; a missing
    input file ends the process with status 1.
    """
    cli_args = sys.argv[1:]

    # Help request takes precedence over everything else.
    if cli_args and cli_args[0] in ['-h', '--help', 'help']:
        print(__doc__)
        sys.exit(0)

    # Positional arguments override the defaults when supplied.
    input_file = cli_args[0] if len(cli_args) >= 1 else "dataset_filtered_mapped.json"
    output_dir = cli_args[1] if len(cli_args) >= 2 else "split_datasets"

    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found!")
        sys.exit(1)

    print(f"Input file: {input_file}")
    print(f"Output directory: {output_dir}")
    print()

    split_dataset_by_name(input_file, output_dir)
def validate_environment():
    """Check that every required setting was loaded from the environment.

    Looks at the module-level values read for NOVEUM_API_KEY, NOVEUM_ORG_SLUG,
    NOVEUM_DATASET_SLUG and LATEST_VERSION. Prints the names of any that are
    unset or empty and returns False; returns True when all are present.
    """
    checks = (
        ('NOVEUM_API_KEY', api_key),
        ('NOVEUM_ORG_SLUG', org_slug),
        ('NOVEUM_DATASET_SLUG', dataset_slug),
        ('LATEST_VERSION', latest_version),
    )

    unset = []
    for label, current in checks:
        if not current:
            unset.append(label)

    # Guard clause: nothing missing means the environment is usable.
    if not unset:
        return True

    print(f"Error: Missing required environment variables: {', '.join(unset)}")
    print("Please set these variables in your .env file or environment")
    return False
def main():
    """Publish the configured dataset version and persist the API response.

    Parses ``--pretty`` and ``--output``, validates the environment, publishes
    the version held in ``latest_version``, writes the JSON response to the
    output file and echoes it to stdout.

    Returns:
        0 on success; 1 when validation fails, when publishing yields no data,
        or when the response file cannot be written.
    """
    cli = argparse.ArgumentParser(description='Publish a dataset version in Noveum API')
    cli.add_argument(
        '--pretty',
        action='store_true',
        help='Pretty print the JSON response',
    )
    cli.add_argument(
        '--output',
        type=str,
        default="dataset_publish_response.json",
        help='Output file to save the JSON response (default: dataset_publish_response.json)',
    )
    options = cli.parse_args()

    if not validate_environment():
        return 1

    payload = publish_dataset_version(version=latest_version)
    if payload is None:
        return 1

    # Persist the response before echoing it.
    try:
        with open(options.output, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=2)
        print(f"\nResponse saved to: {options.output}")
    except (OSError, IOError) as write_error:
        print(f"Error saving response to file: {write_error}")
        return 1

    if not options.pretty:
        print(f"\nResponse data: {json.dumps(payload)}")
        return 0

    print("\nResponse data:")
    print(json.dumps(payload, indent=2))
    return 0
def combine_spans_from_traces(traces_dir):
    """
    Flatten every span found in the trace files of ``traces_dir`` into one list.

    Files whose names start with ``trace`` and end with ``.json`` are read in
    sorted name order. Each span is copied and enriched with the fields of its
    parent trace (everything except ``spans``); a trace field whose key already
    exists on the span is stored under ``trace_<key>`` instead. A file that
    fails to load or process is reported and skipped.
    """
    merged = []

    candidates = []
    for entry in os.listdir(traces_dir):
        if entry.startswith('trace') and entry.endswith('.json'):
            candidates.append(entry)
    trace_files = sorted(candidates)

    print(f"Found {len(trace_files)} trace files: {trace_files}")

    for trace_file in trace_files:
        file_path = os.path.join(traces_dir, trace_file)
        print(f"Processing {trace_file}...")

        try:
            with open(file_path, 'r') as f:
                data = json.load(f)

            # Nothing to do for files without a non-empty traces array.
            if 'traces' not in data or len(data['traces']) == 0:
                continue

            for trace in data['traces']:
                for span in trace.get('spans', []):
                    # Span fields come first so they keep their own keys.
                    record = {key: value for key, value in span.items()}

                    for key, value in trace.items():
                        if key == 'spans':
                            continue
                        # Clashing trace keys get a 'trace_' prefix.
                        target = f'trace_{key}' if key in span else key
                        record[target] = value

                    merged.append(record)

        except Exception as e:
            print(f"Error processing {trace_file}: {e}")
            continue

    return merged
to dataset.json + output_file = os.path.join(traces_dir, 'dataset.json') + + with open(output_file, 'w') as f: + json.dump(combined_spans, f, indent=2) + + print(f"Saved combined spans to: {output_file}") + + # Print some statistics + if combined_spans: + print(f"\nSample of first span keys: {list(combined_spans[0].keys())}") + print(f"Total spans: {len(combined_spans)}") + + # Count spans by type + span_types = {} + for span in combined_spans: + span_name = span.get('name', 'unknown') + span_types[span_name] = span_types.get(span_name, 0) + 1 + + print(f"\nSpan types distribution:") + for span_type, count in sorted(span_types.items()): + print(f" {span_type}: {count}") + + +if __name__ == "__main__": + main() diff --git a/noveum_customer_support_bt/traces/fetch_traces_api.py b/noveum_customer_support_bt/traces/fetch_traces_api.py new file mode 100755 index 0000000..5333af9 --- /dev/null +++ b/noveum_customer_support_bt/traces/fetch_traces_api.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +""" +Script to fetch traces from Noveum API and save them to a traces directory. +Supports batch fetching with pagination for large numbers of traces. 
"""
Fetch traces from the Noveum API and save them to a traces directory.
Supports batch fetching with pagination for large numbers of traces.
"""

import argparse
import json
import os
import shutil
import sys
from typing import Any, Callable, Dict, Optional, Tuple

TRACES_URL = 'https://api.noveum.ai/api/v1/traces'
MAX_TRACES_PER_BATCH = 100
REQUEST_TIMEOUT_SECONDS = 60

# Module-level settings read by fetch_traces_batch(). They are refreshed by
# _load_credentials() once the .env file has been loaded.
api_key = os.getenv('NOVEUM_API_KEY')
project = 'noveum-ai-agent-rag-websearch'

# Common headers
headers = {
    'Authorization': f'Bearer {api_key}',
    'Content-Type': 'application/json',
}


def _load_credentials() -> bool:
    """Load .env and refresh the API key and headers; return False when the key is missing."""
    global api_key
    # Imported lazily so the file helpers stay importable without python-dotenv.
    from dotenv import load_dotenv

    load_dotenv()
    api_key = os.getenv('NOVEUM_API_KEY')
    if not api_key:
        print('Error: NOVEUM_API_KEY environment variable not found')
        return False
    headers['Authorization'] = f'Bearer {api_key}'
    return True


def clean_and_create_traces_dir():
    """Delete any existing ``traces`` directory next to this script and recreate it empty.

    Returns:
        Path of the freshly created directory.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    traces_dir = os.path.join(script_dir, 'traces')

    if os.path.exists(traces_dir):
        print(f"Cleaning existing traces directory: {traces_dir}")
        shutil.rmtree(traces_dir)

    os.makedirs(traces_dir)
    print(f"Created traces directory: {traces_dir}")
    return traces_dir


def fetch_traces_batch(size: int, from_offset: int = 0) -> Optional[Dict[str, Any]]:
    """Fetch one page of traces (spans included) for the current ``project``.

    Args:
        size: Maximum number of traces to request.
        from_offset: Offset of the first trace to return.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body is not valid JSON (the error is printed).
    """
    # Imported lazily so the module can be imported without the HTTP stack.
    import requests

    params = {
        'project': project,
        'size': size,
        'from': from_offset,
        'includeSpans': True,
    }

    print(f"Fetching traces: size={size}, from={from_offset}")

    try:
        # Without a timeout a stalled connection would hang the script forever.
        response = requests.get(
            TRACES_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f'Error fetching traces: {e}')
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f'Response status: {error_response.status_code}')
            print(f'Response text: {error_response.text}')
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError for a non-JSON body.
        print(f'Error fetching traces: {e}')
        return None

    print(f"Successfully fetched {len(data.get('traces', []))} traces")
    return data


def save_traces_batch(traces_dir: str, batch_data: Dict[str, Any], batch_number: int):
    """Write one batch to ``traces_batch_NNN.json`` inside ``traces_dir`` and return the file path."""
    filename = f"traces_batch_{batch_number:03d}.json"
    filepath = os.path.join(traces_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(batch_data, f, indent=2)

    print(f"Saved batch {batch_number} to: {filepath}")
    return filepath


def fetch_all_traces(
    count: int,
    traces_dir: str,
    fetch: Optional[Callable[..., Optional[Dict[str, Any]]]] = None,
    max_per_batch: int = MAX_TRACES_PER_BATCH,
) -> Tuple[int, int]:
    """Fetch up to ``count`` traces page by page, saving every page to ``traces_dir``.

    Args:
        count: Target number of traces.
        traces_dir: Directory that receives the batch files.
        fetch: Callable with the signature of ``fetch_traces_batch``; defaults to it.
        max_per_batch: Largest page size to request.

    Returns:
        ``(total_traces_fetched, batches_saved)``.
    """
    fetch = fetch_traces_batch if fetch is None else fetch
    if count <= 0:
        return 0, 0

    num_batches = (count + max_per_batch - 1) // max_per_batch  # Ceiling division
    print(f"Will fetch in {num_batches} batch(es) of up to {max_per_batch} traces each")

    total_fetched = 0
    batches_saved = 0

    for batch_number in range(1, num_batches + 1):
        current_size = min(max_per_batch, count - total_fetched)

        print(f"\n--- Batch {batch_number}/{num_batches} ---")

        # Offset by what was actually received, so a short page never causes
        # traces to be skipped.
        batch_data = fetch(current_size, total_fetched)
        if batch_data is None:
            print(f"Failed to fetch batch {batch_number}")
            break

        save_traces_batch(traces_dir, batch_data, batch_number)
        batches_saved += 1

        traces_in_batch = len(batch_data.get('traces', []))
        total_fetched += traces_in_batch

        print(f"Batch {batch_number} complete: {traces_in_batch} traces")
        print(f"Total fetched so far: {total_fetched}/{count}")

        if total_fetched >= count:
            print(f"Reached target of {count} traces")
            break

        has_more = batch_data.get('pagination', {}).get('has_more', False)
        if traces_in_batch == 0 or not has_more:
            print("No more traces available from API")
            break

    return total_fetched, batches_saved


def main():
    """Command-line entry point; returns a process exit code."""
    global project

    if not _load_credentials():
        return 1

    parser = argparse.ArgumentParser(description='Fetch traces from Noveum API and save to traces directory')
    parser.add_argument('count', type=int, help='Number of traces to fetch')
    parser.add_argument('--project', type=str, default=project, help=f'Project name (default: {project})')

    args = parser.parse_args()
    if args.count <= 0:
        # Reject before the existing traces directory gets wiped for nothing.
        parser.error('count must be a positive integer')

    # Update project if specified
    project = args.project

    print(f"Fetching {args.count} traces for project: {project}")

    traces_dir = clean_and_create_traces_dir()
    total_fetched, batches_saved = fetch_all_traces(args.count, traces_dir)

    print("\n=== Summary ===")
    print(f"Total traces fetched: {total_fetched}")
    print(f"Batches created: {batches_saved}")
    print(f"Traces directory: {traces_dir}")

    files = sorted(f for f in os.listdir(traces_dir) if f.endswith('.json'))
    print(f"Created files: {', '.join(files)}")

    return 0 if batches_saved else 1


if __name__ == "__main__":
    sys.exit(main())
"""
Upload dataset items to the Noveum API.
Reads a JSON file containing a list of dataset items and uploads them via POST request.
"""

import argparse
import json
import os
import sys
from typing import Any, Dict, List

# Default dataset JSON path
dataset_json = 'processed_agent_dataset.json'

# API settings; refreshed by _load_environment() once the .env file is loaded.
api_key = os.getenv('NOVEUM_API_KEY')
org_slug = os.getenv('NOVEUM_ORG_SLUG')
dataset_slug = os.getenv('NOVEUM_DATASET_SLUG')
latest_version = os.getenv('LATEST_VERSION')
beta_env = os.getenv('BETA', 'false').lower() == 'true'

# Item fields that are surfaced next to "content" in the upload payload.
SCHEMA_KEYS = frozenset({
    "item_id", "dataset_id", "item_key", "item_hash", "organization_id",
    "organization_slug", "dataset_slug", "item_version", "deleted_at_version",
    "deleted_at_date", "item_type", "schema_version", "source_trace_id",
    "source_span_id", "content", "metadata", "agent_name", "agent_role",
    "agent_task", "agent_response", "system_prompt", "user_id", "session_id",
    "turn_id", "ground_truth", "expected_tool_call", "tools_available",
    "tool_calls", "tool_call_results", "parameters_passed", "retrieval_query",
    "retrieved_context", "exit_status", "agent_exit", "trace_data",
    "conversation_id", "speaker", "message", "conversation_context",
    "input_text", "output_text", "expected_output", "evaluation_context",
    "criteria", "quality_score", "validation_status", "validation_errors",
    "tags", "custom_attributes", "created_at", "updated_at",
})


def _load_environment() -> None:
    """Load .env and refresh the module-level API settings from the environment."""
    global api_key, org_slug, dataset_slug, latest_version, beta_env
    # Imported lazily so the pure helpers stay importable without python-dotenv.
    from dotenv import load_dotenv

    load_dotenv()
    api_key = os.getenv('NOVEUM_API_KEY')
    org_slug = os.getenv('NOVEUM_ORG_SLUG')
    dataset_slug = os.getenv('NOVEUM_DATASET_SLUG')
    latest_version = os.getenv('LATEST_VERSION')
    beta_env = os.getenv('BETA', 'false').lower() == 'true'


def validate_environment():
    """Return True when every required setting is present; otherwise print what is missing."""
    required_vars = {
        'NOVEUM_API_KEY': api_key,
        'NOVEUM_ORG_SLUG': org_slug,
        'NOVEUM_DATASET_SLUG': dataset_slug,
        'LATEST_VERSION': latest_version,
    }

    missing_vars = [var for var, value in required_vars.items() if not value]

    if missing_vars:
        print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
        print("Please set these variables in your .env file or environment")
        return False

    return True


def load_dataset_items(file_path: str) -> List[Dict[str, Any]]:
    """Load dataset items from a JSON file.

    Returns:
        The parsed list, or an empty list (after printing the reason) when the
        file is missing, unreadable, not valid JSON, or not a JSON list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in file {file_path}: {e}")
        return []
    except (OSError, IOError) as e:
        print(f"Error loading dataset: {e}")
        return []

    if not isinstance(data, list):
        print(f"Error: JSON file should contain a list of objects, got {type(data)}")
        return []

    print(f"Loaded {len(data)} items from {file_path}")
    return data


def _coerce_metadata(value: Any) -> Dict[str, Any]:
    """Return ``value`` as a metadata object; anything that is not or does not parse to a dict becomes {}."""
    if isinstance(value, dict):
        return value
    if isinstance(value, str) and value.strip():
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            return {}
        return parsed if isinstance(parsed, dict) else {}
    return {}


def transform_items(items: List[Dict[str, Any]], item_type: str = "conversation") -> List[Dict[str, Any]]:
    """Build the upload payload entries for ``items`` without modifying them.

    Each entry carries ``item_key`` (the item's ``turn_id`` unless the item
    has its own ``item_key``), ``item_type``, ``metadata``, every schema field
    present on the item, and ``content`` holding the whole item with
    ``item_type`` applied.
    """
    transformed = []
    for item in items:
        # Work on a copy: the caller's dictionaries must stay untouched.
        source = {**item, 'item_type': item_type}

        entry = {
            "item_key": item.get("turn_id", ""),
            "item_type": item_type,
            "metadata": {},
        }

        # Iterate the item, not the key set, so the payload key order is stable.
        for key, value in source.items():
            if key not in SCHEMA_KEYS or key in ('item_type', 'content'):
                continue
            entry[key] = _coerce_metadata(value) if key == 'metadata' else value

        # The entire item always goes into the content field.
        entry["content"] = source
        transformed.append(entry)
    return transformed


def upload_dataset_items(items: List[Dict[str, Any]], version: str, item_type: str = "conversation") -> bool:
    """Upload dataset items to the Noveum API in a single POST request.

    Args:
        items: Dataset items as loaded from the JSON file (left unmodified).
        version: Dataset version the items are uploaded to.
        item_type: Item type applied to every uploaded item.

    Returns:
        True when the API accepted the request, False otherwise (including
        when ``items`` is empty).
    """
    if not items:
        print("No items to upload")
        return False

    # Imported lazily so the module can be imported without the HTTP stack.
    import requests

    transformed_items = transform_items(items, item_type)

    # Construct API URL based on BETA environment variable
    if beta_env:
        api_url = f"https://noveum.ai/api/v1/datasets/{dataset_slug}/items?organizationSlug={org_slug}"
    else:
        api_url = f"https://noveum.ai/api/v1/organizations/{org_slug}/datasets/{dataset_slug}/items"

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'Cookie': f'apiKeyCookie={api_key}',
    }

    request_data = {
        "version": version,
        "items": transformed_items,
    }

    print(f"Uploading {len(items)} items to: {api_url}")
    print(f"Organization: {org_slug}")
    print(f"Dataset: {dataset_slug}")
    print(f"Version: {version}")

    try:
        response = requests.post(api_url, headers=headers, json=request_data, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error uploading dataset items: {e}")
        error_response = getattr(e, 'response', None)
        if error_response is not None:
            print(f"Response status: {error_response.status_code}")
            print(f"Response text: {error_response.text}")
        return False

    print(f"Successfully uploaded {len(items)} items")
    print(f"Response status: {response.status_code}")

    # Print response content if available
    try:
        response_data = response.json()
        print(f"Response data: {json.dumps(response_data, indent=2)}")
    except ValueError:  # body is not JSON
        print(f"Response text: {response.text}")

    return True


def main():
    """Command-line entry point; returns a process exit code."""
    default_item_type = "conversation"

    _load_environment()

    parser = argparse.ArgumentParser(description='Upload dataset items to Noveum API')
    parser.add_argument('--dataset-json', type=str, default=dataset_json,
                        help=f'Path to JSON file containing dataset items (default: {dataset_json})')
    parser.add_argument('--item-type', type=str, default=default_item_type,
                        help=f'Item type for the dataset items (default: {default_item_type})')

    args = parser.parse_args()

    if not validate_environment():
        return 1

    items = load_dataset_items(args.dataset_json)
    if not items:
        return 1

    success = upload_dataset_items(items, latest_version, args.item_type)
    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())
"""
Upload scorer results to the Noveum API.
Reads scores and reasonings from a CSV file and uploads them via API.
"""

import argparse
import csv
import json
import math
import os
import sys
from typing import Dict, List


def load_api_data(api_data_path: str) -> Dict[str, str]:
    """
    Load api_data.json and create a mapping from item_key to item_id.

    Args:
        api_data_path: Path to the api_data.json file

    Returns:
        Dictionary mapping item_key to item_id (entries missing either
        field are left out)

    Raises:
        OSError: If the file cannot be read
        ValueError: If the file is not valid JSON or not a JSON object
    """
    print(f"Loading API data from {api_data_path}...")
    with open(api_data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, dict):
        raise ValueError(f"{api_data_path} must contain a JSON object with an 'items' list")

    key_to_id = {
        item['item_key']: item['item_id']
        for item in data.get('items', [])
        if item.get('item_key') and item.get('item_id')
    }

    print(f"Loaded {len(key_to_id)} item mappings")
    return key_to_id


def read_csv_data(
    csv_path: str,
    item_key_col: str,
    score_col: str,
    reasoning_col: str
) -> List[Dict]:
    """
    Read CSV file and extract relevant columns.

    Rows with an empty item key or score are skipped silently. Rows whose
    score is not a finite number are skipped with a warning instead of
    aborting the whole run.

    Args:
        csv_path: Path to the CSV file
        item_key_col: Column name for item keys
        score_col: Column name for scores
        reasoning_col: Column name for reasonings

    Returns:
        List of dictionaries with item_key, score, and reasoning

    Raises:
        ValueError: If a requested column is missing (including an empty file)
    """
    print(f"Reading CSV from {csv_path}...")
    results = []
    bad_scores = []

    # utf-8-sig drops the BOM that spreadsheet exports put before the first
    # header name; newline='' is what the csv module requires for file objects.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)

        # fieldnames is None for an empty file; treat that as "no columns".
        fieldnames = reader.fieldnames or []
        for column in (item_key_col, score_col, reasoning_col):
            if column not in fieldnames:
                raise ValueError(f"Column '{column}' not found in CSV. Available columns: {fieldnames}")

        for row in reader:
            item_key = row[item_key_col]
            raw_score = row[score_col]

            # Skip empty rows
            if not item_key or not raw_score:
                continue

            try:
                score = float(raw_score)
            except ValueError:
                score = math.nan
            # NaN and infinity cannot be represented in strict JSON.
            if not math.isfinite(score):
                bad_scores.append((reader.line_num, raw_score))
                continue

            results.append({
                'item_key': item_key,
                'score': score,
                'reasoning': row[reasoning_col]
            })

    if bad_scores:
        print(f"Warning: Skipped {len(bad_scores)} rows with non-numeric scores")
        print(f"First few (line, value): {bad_scores[:5]}")

    print(f"Read {len(results)} rows from CSV")
    return results


def create_batch_payload(
    csv_data: List[Dict],
    key_to_id: Dict[str, str],
    org_slug: str,
    project: str,
    environment: str,
    dataset_slug: str,
    dataset_version: str,
    scorer_id: str = "custom_scorer",
    scorer_version: str = "1.0.0",
    pass_threshold: float = 0.5
) -> List[Dict]:
    """
    Create the batch payload for API submission.

    Args:
        csv_data: List of dictionaries with item_key, score, and reasoning
        key_to_id: Mapping from item_key to item_id
        org_slug: Organization slug
        project: Project name
        environment: Environment name
        dataset_slug: Dataset slug
        dataset_version: Dataset version
        scorer_id: Scorer ID (default: "custom_scorer")
        scorer_version: Scorer version (default: "1.0.0")
        pass_threshold: A result is marked passed when its score is strictly
            greater than this value (default: 0.5)

    Returns:
        List of result objects ready for API submission; rows whose item_key
        has no item_id mapping are left out and reported
    """
    results = []
    skipped = []

    for row in csv_data:
        item_key = row['item_key']
        item_id = key_to_id.get(item_key)

        if not item_id:
            skipped.append(item_key)
            continue

        results.append({
            "organizationSlug": org_slug,
            "project": project,
            "environment": environment,
            "datasetSlug": dataset_slug,
            "datasetVersion": dataset_version,
            "itemId": item_id,
            "scorerId": scorer_id,
            "scorerVersion": scorer_version,
            "score": row['score'],
            "passed": row['score'] > pass_threshold,
            "metadata": {
                "details": row['reasoning']
            },
            "executionTimeMs": 0.0
        })

    if skipped:
        print(f"Warning: Skipped {len(skipped)} rows with missing item_id mappings")
        print(f"First few skipped keys: {skipped[:5]}")

    print(f"Created {len(results)} results for upload")
    return results


def upload_results(
    results: List[Dict],
    api_key: str,
    org_slug: str,
    batch_size: int = 100
) -> int:
    """
    Upload results to the API in batches.

    A failed batch is reported and the remaining batches are still attempted.

    Args:
        results: List of result objects
        api_key: API key for authentication
        org_slug: Organization slug
        batch_size: Number of results per batch (default: 100)

    Returns:
        Number of batches that failed (0 when everything was uploaded)

    Raises:
        ValueError: If batch_size is not a positive integer
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Imported lazily so the pure helpers stay importable without the HTTP stack.
    import requests

    api_url = f"https://beta.noveum.ai/api/v1/scorers/results/batch?organizationSlug={org_slug}"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    total = len(results)
    batches = [results[i:i + batch_size] for i in range(0, total, batch_size)]

    print(f"\nUploading {total} results in {len(batches)} batches...")

    failed = 0
    for i, batch in enumerate(batches, 1):
        try:
            response = requests.post(
                api_url,
                headers=headers,
                json={"results": batch},
                timeout=60
            )
        except requests.exceptions.RequestException as e:
            print(f"✗ Batch {i}/{len(batches)} error: {str(e)}")
            failed += 1
            continue

        # Any 2xx status means the batch was accepted, not only 200.
        if response.ok:
            print(f"✓ Batch {i}/{len(batches)} uploaded successfully ({len(batch)} results)")
        else:
            print(f"✗ Batch {i}/{len(batches)} failed: {response.status_code}")
            print(f"  Response: {response.text}")
            failed += 1

    if failed:
        print(f"\nUpload finished with {failed} failed batch(es)")
    else:
        print("\nUpload complete!")
    return failed


def main():
    """Command-line entry point; returns a process exit code."""
    parser = argparse.ArgumentParser(
        description="Upload scorer results from CSV to Noveum API"
    )
    parser.add_argument(
        "csv_file",
        help="Path to the CSV file containing scores and reasonings"
    )
    parser.add_argument(
        "--item-key-col",
        required=True,
        help="Column name for item keys"
    )
    parser.add_argument(
        "--score-col",
        required=True,
        help="Column name for scores"
    )
    parser.add_argument(
        "--reasoning-col",
        required=True,
        help="Column name for reasonings"
    )
    parser.add_argument(
        "--api-data",
        default="api_data.json",
        help="Path to api_data.json file (default: api_data.json)"
    )
    parser.add_argument(
        "--scorer-id",
        default="custom_scorer",
        help="Scorer ID (default: custom_scorer)"
    )
    parser.add_argument(
        "--scorer-version",
        default="1.0.0",
        help="Scorer version (default: 1.0.0)"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=100,
        help="Number of results per batch (default: 100)"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Prepare data but don't upload to API"
    )

    args = parser.parse_args()
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    # Imported lazily so the data helpers stay importable without python-dotenv.
    from dotenv import load_dotenv

    load_dotenv()
    print("Loaded environment variables")

    required_vars = {
        'NOVEUM_PROJECT': os.getenv('NOVEUM_PROJECT'),
        'NOVEUM_ENVIRONMENT': os.getenv('NOVEUM_ENVIRONMENT'),
        'NOVEUM_API_KEY': os.getenv('NOVEUM_API_KEY'),
        'NOVEUM_ORG_SLUG': os.getenv('NOVEUM_ORG_SLUG'),
        'NOVEUM_DATASET_SLUG': os.getenv('NOVEUM_DATASET_SLUG'),
        'LATEST_VERSION': os.getenv('LATEST_VERSION')
    }

    missing_vars = [var for var, value in required_vars.items() if not value]
    if missing_vars:
        print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
        print("\nPlease set them in your .env file or environment:")
        for var in missing_vars:
            print(f"  {var}=")
        return 1

    try:
        key_to_id = load_api_data(args.api_data)
        csv_data = read_csv_data(
            args.csv_file,
            args.item_key_col,
            args.score_col,
            args.reasoning_col
        )
    except (OSError, ValueError) as e:
        print(f"Error: {e}")
        return 1

    results = create_batch_payload(
        csv_data=csv_data,
        key_to_id=key_to_id,
        org_slug=required_vars['NOVEUM_ORG_SLUG'],
        project=required_vars['NOVEUM_PROJECT'],
        environment=required_vars['NOVEUM_ENVIRONMENT'],
        dataset_slug=required_vars['NOVEUM_DATASET_SLUG'],
        dataset_version=required_vars['LATEST_VERSION'],
        scorer_id=args.scorer_id,
        scorer_version=args.scorer_version
    )

    if not results:
        print("Error: No valid results to upload")
        return 1

    if args.dry_run:
        print("\n--- DRY RUN MODE ---")
        print(f"Would upload {len(results)} results")
        print("\nSample result:")
        print(json.dumps(results[0], indent=2))
        return 0

    failed_batches = upload_results(
        results=results,
        api_key=required_vars['NOVEUM_API_KEY'],
        org_slug=required_vars['NOVEUM_ORG_SLUG'],
        batch_size=args.batch_size
    )

    return 1 if failed_batches else 0


if __name__ == "__main__":
    sys.exit(main())