From decc022755a874abac5b0f9d632e8d29c0447e01 Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 15 Jan 2026 14:34:10 -0600 Subject: [PATCH] Add centralized structured logging - Create logging_utils.py with StructuredFormatter (JSON) and HumanReadableFormatter - Console output uses human-readable format with timestamps - File output uses JSON with ISO 8601 timestamps and context fields - Support for context fields: task_id, config, duration_ms, tokens_used, cost_usd - LogContext context manager for adding fields to log records - Integrate into main.py and run_all.py entry points - Add 17 unit tests for logging utilities --- cli/run_all.py | 35 +- main.py | 64 +--- .../tests/test_e2e_providers.py | 8 +- .../tests/test_logging_utils.py | 334 ++++++++++++++++++ .../utils/logging_utils.py | 279 +++++++++++++++ 5 files changed, 636 insertions(+), 84 deletions(-) create mode 100644 src/arc_agi_benchmarking/tests/test_logging_utils.py create mode 100644 src/arc_agi_benchmarking/utils/logging_utils.py diff --git a/cli/run_all.py b/cli/run_all.py index 6f94b550..1d57b954 100644 --- a/cli/run_all.py +++ b/cli/run_all.py @@ -29,6 +29,7 @@ from arc_agi_benchmarking.utils.rate_limiter import AsyncRequestRateLimiter from arc_agi_benchmarking.utils.metrics import set_metrics_enabled, set_metrics_filename_prefix from arc_agi_benchmarking.utils.preflight import run_preflight +from arc_agi_benchmarking.utils.logging_utils import setup_logging, StructuredFormatter from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, before_sleep_log @@ -165,23 +166,8 @@ def filter(self, record: logging.LogRecord) -> bool: config_token = LOG_CONFIG_CTX.set(config_name) task_token = LOG_TASK_CTX.set(task_id) - class _JsonFormatter(logging.Formatter): - def format(self, record): - payload = { - "ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"), - "level": record.levelname, - "logger": record.name, - "message": record.getMessage(), - } - if record.exc_info: - payload["exc_info"] = self.formatException(record.exc_info) - # Include context for easier grepping - payload["config"] = config_name - payload["task_id"] = task_id - return json.dumps(payload) - file_handler = logging.FileHandler(log_path, encoding="utf-8") - file_handler.setFormatter(_JsonFormatter()) + file_handler.setFormatter(StructuredFormatter()) file_handler.addFilter(_TaskFilter()) logging.getLogger().addHandler(file_handler) logging.getLogger("openai").setLevel(logging.INFO) @@ -421,21 +407,8 @@ async def main(task_list_file: Optional[str], # Set metrics enabled status based on CLI arg set_metrics_enabled(args.enable_metrics) - # Configure logging for the entire application based on --log-level - if args.log_level == "NONE": - log_level_to_set = logging.CRITICAL + 1 - logging.basicConfig( - level=log_level_to_set, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler(sys.stdout)] - ) - else: - log_level_to_set = getattr(logging, args.log_level.upper()) - logging.basicConfig( - level=log_level_to_set, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler(sys.stdout)] - ) + # Configure structured logging for the entire application + setup_logging(level=args.log_level, quiet_libraries=True) config_name = args.config.strip() if args.config else DEFAULT_MODEL_CONFIG if not config_name: diff --git a/main.py b/main.py index 1cc22cfa..47b21c01 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,7 @@ from dotenv import load_dotenv import arc_agi_benchmarking.utils as utils from arc_agi_benchmarking.utils.metrics import timeit, set_metrics_enabled +from arc_agi_benchmarking.utils.logging_utils import setup_logging from arc_agi_benchmarking.schemas import ARCPair, Attempt from arc_agi_benchmarking.prompts.prompt_manager import convert_task_pairs_to_prompt from typing import List, Optional @@ -260,60 +261,25 @@ def main_cli(cli_args: Optional[List[str]] = None): # Set metrics enabled status based on CLI arg first set_metrics_enabled(args.enable_metrics) - # # Prepare OpenAI SDK file logging in logs///openai.jsonl + # Prepare log file path for structured JSON logs log_dir = Path("logs") / args.config / args.task_id log_dir.mkdir(parents=True, exist_ok=True) log_path = log_dir / "openai.jsonl" - class _JsonFormatter(logging.Formatter): - def format(self, record): - payload = { - "ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"), - "level": record.levelname, - "logger": record.name, - "message": record.getMessage(), - } - if record.exc_info: - payload["exc_info"] = self.formatException(record.exc_info) - return json.dumps(payload) - - file_handler = logging.FileHandler(log_path, encoding="utf-8") - file_handler.setFormatter(_JsonFormatter()) - - # Configure logging + # Configure structured logging (writes to console and file) + log_level = "DEBUG" if args.verbose else args.log_level + setup_logging( + level=log_level, + log_file=log_path, + quiet_libraries=True, + ) + + # Enable OpenAI SDK logging to capture API calls + logging.getLogger('openai').setLevel(logging.INFO) + if args.verbose: - # Verbose mode: Show DEBUG for our code, WARNING+ for libraries - logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) - - # Set library loggers to WARNING to reduce noise - library_loggers = [ - 'httpx', 'httpcore', 'urllib3', 'requests', - 'anthropic', 'google', 'pydantic', 'transformers' - ] - for lib_logger in library_loggers: - logging.getLogger(lib_logger).setLevel(logging.WARNING) - - # Keep our application loggers at DEBUG - logging.getLogger('arc_agi_benchmarking').setLevel(logging.DEBUG) - logging.getLogger('__main__').setLevel(logging.DEBUG) - - logger.info("Verbose mode enabled - showing debug output for arc_agi_benchmarking only") - else: - # Normal mode: Use the specified log level - logging.basicConfig( - level=getattr(logging, args.log_level.upper()), - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) - - # Attach file handler for OpenAI SDK and app logs, and ensure OpenAI logger is enabled - root_logger = logging.getLogger() - root_logger.addHandler(file_handler) - openai_logger = logging.getLogger('openai') - openai_logger.setLevel(logging.INFO) - logger.info(f"OpenAI SDK logs will be written to {log_path}") + logger.info("Verbose mode enabled - showing debug output") + logger.info(f"Structured logs will be written to {log_path}") arc_solver = ARCTester( config=args.config, diff --git a/src/arc_agi_benchmarking/tests/test_e2e_providers.py b/src/arc_agi_benchmarking/tests/test_e2e_providers.py index c9857319..9c8c7b60 100644 --- a/src/arc_agi_benchmarking/tests/test_e2e_providers.py +++ b/src/arc_agi_benchmarking/tests/test_e2e_providers.py @@ -141,10 +141,10 @@ def test_gpt_4o_provider_e2e_mocked(mock_init_provider): # REMOVE mock_backscan_ @patch('main.ARCTester.generate_task_solution') @patch('main.ARCTester.init_provider') @patch('main.utils.read_models_config') # Patched at the source where it's used by main.py -@patch('main.logging.basicConfig') # To suppress logging output during tests +@patch('main.setup_logging') # To suppress logging output during tests @patch('main.set_metrics_enabled') # To avoid side effects with metrics def test_main_cli_execution_and_arctester_mocking(mock_set_metrics_enabled, - mock_logging_basic_config, + mock_setup_logging, mock_read_models_config, mock_init_provider, mock_generate_task_solution): @@ -202,6 +202,6 @@ def test_main_cli_execution_and_arctester_mocking(mock_set_metrics_enabled, mock_set_metrics_enabled.assert_called_once_with(False) # Default behavior print(f"mock_set_metrics_enabled called with: {mock_set_metrics_enabled.call_args}") - - mock_logging_basic_config.assert_called_once() + + mock_setup_logging.assert_called_once() print("✅ test_main_cli_execution_and_arctester_mocking completed successfully.") diff --git a/src/arc_agi_benchmarking/tests/test_logging_utils.py b/src/arc_agi_benchmarking/tests/test_logging_utils.py new file mode 100644 index 00000000..884a0f3b --- /dev/null +++ b/src/arc_agi_benchmarking/tests/test_logging_utils.py @@ -0,0 +1,334 @@ +"""Tests for structured logging utilities.""" + +import json +import logging +import tempfile +from pathlib import Path + +import pytest + +from arc_agi_benchmarking.utils.logging_utils import ( + HumanReadableFormatter, + LogContext, + StructuredFormatter, + get_logger, + log_with_context, + setup_logging, +) + + +class TestStructuredFormatter: + """Tests for StructuredFormatter.""" + + def test_basic_format(self): + """Test basic JSON formatting.""" + formatter = StructuredFormatter() + record = logging.LogRecord( + name="test.logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None, + ) + + output = formatter.format(record) + data = json.loads(output) + + assert data["level"] == "INFO" + assert data["logger"] == "test.logger" + assert data["message"] == "Test message" + assert "timestamp" in data + + def test_format_with_args(self): + """Test formatting with message arguments.""" + formatter = StructuredFormatter() + record = logging.LogRecord( + name="test.logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Task %s completed in %dms", + args=("abc123", 500), + exc_info=None, + ) + + output = formatter.format(record) + data = json.loads(output) + + assert data["message"] == "Task abc123 completed in 500ms" + + def test_format_with_exception(self): + """Test formatting with exception info.""" + formatter = StructuredFormatter() + + try: + raise ValueError("Test error") + except ValueError: + import sys + + exc_info = sys.exc_info() + + record = logging.LogRecord( + name="test.logger", + level=logging.ERROR, + pathname="test.py", + lineno=1, + msg="An error occurred", + args=(), + exc_info=exc_info, + ) + + output = formatter.format(record) + data = json.loads(output) + + assert data["level"] == "ERROR" + assert "exception" in data + assert "ValueError: Test error" in data["exception"] + + def test_format_with_context_fields(self): + """Test formatting with extra context fields.""" + formatter = StructuredFormatter() + record = logging.LogRecord( + name="test.logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Processing", + args=(), + exc_info=None, + ) + record.task_id = "task123" + record.config = "gpt-4o" + record.duration_ms = 1500 + record.tokens_used = 2000 + record.cost_usd = 0.0045 + + output = formatter.format(record) + data = json.loads(output) + + assert data["task_id"] == "task123" + assert data["config"] == "gpt-4o" + assert data["duration_ms"] == 1500 + assert data["tokens_used"] == 2000 + assert data["cost_usd"] == 0.0045 + + def test_format_excludes_none_values(self): + """Test that None values are excluded from output.""" + formatter = StructuredFormatter() + record = logging.LogRecord( + name="test.logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test", + args=(), + exc_info=None, + ) + record.task_id = "task123" + record.config = None # Should be excluded + + output = formatter.format(record) + data = json.loads(output) + + assert data["task_id"] == "task123" + assert "config" not in data + + +class TestHumanReadableFormatter: + """Tests for HumanReadableFormatter.""" + + def test_basic_format(self): + """Test basic human-readable formatting.""" + formatter = HumanReadableFormatter() + record = logging.LogRecord( + name="test.logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None, + ) + + output = formatter.format(record) + + assert "INFO" in output + assert "test.logger" in output + assert "Test message" in output + + def test_format_with_context(self): + """Test formatting with context fields.""" + formatter = HumanReadableFormatter() + record = logging.LogRecord( + name="test.logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Task completed", + args=(), + exc_info=None, + ) + record.task_id = "task123" + record.duration_ms = 1500 + record.cost_usd = 0.0045 + record.tokens_used = 2000 + + output = formatter.format(record) + + assert "task_id=task123" in output + assert "1500ms" in output + assert "$0.0045" in output + assert "2000 tokens" in output + + +class TestSetupLogging: + """Tests for setup_logging function.""" + + def test_setup_default(self): + """Test default logging setup.""" + logger = setup_logging(level="INFO") + + assert logger.name == "arc_agi_benchmarking" + # Level is set on root logger, app logger inherits via getEffectiveLevel + assert logger.getEffectiveLevel() == logging.INFO + # Handlers are on root logger + assert len(logging.getLogger().handlers) == 1 + + def test_setup_with_file(self): + """Test logging setup with file output.""" + with tempfile.TemporaryDirectory() as tmpdir: + log_file = Path(tmpdir) / "test.jsonl" + logger = setup_logging(level="DEBUG", log_file=log_file) + + # Handlers are on root logger + assert len(logging.getLogger().handlers) == 2 # Console + file + + # Log something and check file + logger.info("Test message") + + assert log_file.exists() + with open(log_file) as f: + line = f.readline() + data = json.loads(line) + assert data["message"] == "Test message" + + def test_setup_none_level(self): + """Test that NONE level disables logging.""" + logger = setup_logging(level="NONE") + + # Should not raise but logging should be disabled + logger.critical("This should not appear") + + # Re-enable for other tests + logging.disable(logging.NOTSET) + + def test_setup_json_console(self): + """Test JSON console output.""" + logger = setup_logging(level="INFO", json_console=True) + + # Check that formatter is StructuredFormatter (on root logger) + assert isinstance(logging.getLogger().handlers[0].formatter, StructuredFormatter) + + def test_quiet_libraries(self): + """Test that noisy libraries are quieted.""" + setup_logging(level="DEBUG", quiet_libraries=True) + + httpx_logger = logging.getLogger("httpx") + assert httpx_logger.level == logging.WARNING + + openai_logger = logging.getLogger("openai") + assert openai_logger.level == logging.WARNING + + +class TestGetLogger: + """Tests for get_logger function.""" + + def test_get_logger_with_module_name(self): + """Test getting logger with module name.""" + logger = get_logger("arc_agi_benchmarking.adapters.openai") + assert logger.name == "arc_agi_benchmarking.adapters.openai" + + def test_get_logger_adds_prefix(self): + """Test that prefix is added for non-app loggers.""" + logger = get_logger("my_module") + assert logger.name == "arc_agi_benchmarking.my_module" + + +class TestLogContext: + """Tests for LogContext context manager.""" + + def test_context_adds_fields(self): + """Test that context adds fields to log records.""" + setup_logging(level="DEBUG") + logger = get_logger("test") + + records = [] + + class RecordCapture(logging.Handler): + def emit(self, record): + records.append(record) + + logger.addHandler(RecordCapture()) + + with LogContext(task_id="task123", config="gpt-4o"): + logger.info("Test message") + + assert len(records) == 1 + assert records[0].task_id == "task123" + assert records[0].config == "gpt-4o" + + def test_context_restored_after_exit(self): + """Test that context is properly restored after exiting.""" + setup_logging(level="DEBUG") + logger = get_logger("test2") + + records = [] + + class RecordCapture(logging.Handler): + def emit(self, record): + records.append(record) + + logger.addHandler(RecordCapture()) + + with LogContext(task_id="task123"): + logger.info("Inside context") + + logger.info("Outside context") + + assert len(records) == 2 + assert hasattr(records[0], "task_id") + # Outside context should not have task_id + assert not hasattr(records[1], "task_id") or records[1].task_id is None + + +class TestLogWithContext: + """Tests for log_with_context function.""" + + def test_log_with_context(self): + """Test logging with context fields.""" + setup_logging(level="DEBUG") + logger = get_logger("test3") + + records = [] + + class RecordCapture(logging.Handler): + def emit(self, record): + records.append(record) + + logger.addHandler(RecordCapture()) + + log_with_context( + logger, + logging.INFO, + "Task completed", + task_id="task123", + duration_ms=1500, + tokens_used=2000, + ) + + assert len(records) == 1 + assert records[0].task_id == "task123" + assert records[0].duration_ms == 1500 + assert records[0].tokens_used == 2000 diff --git a/src/arc_agi_benchmarking/utils/logging_utils.py b/src/arc_agi_benchmarking/utils/logging_utils.py new file mode 100644 index 00000000..9e76f6d0 --- /dev/null +++ b/src/arc_agi_benchmarking/utils/logging_utils.py @@ -0,0 +1,279 @@ +"""Centralized structured logging configuration. + +Provides JSON-formatted logging with consistent fields across the application. +Supports both console and file output with configurable formats. +""" + +import json +import logging +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional + +# Standard fields included in all log records +STANDARD_FIELDS = [ + "timestamp", + "level", + "logger", + "message", +] + +# Optional context fields that may be present +CONTEXT_FIELDS = [ + "task_id", + "config", + "provider", + "attempt", + "duration_ms", + "tokens_used", + "cost_usd", + "error_type", +] + + +class StructuredFormatter(logging.Formatter): + """JSON formatter for structured logging. + + Outputs logs as single-line JSON objects with consistent fields. + Additional context can be added via the `extra` parameter in log calls. + """ + + def __init__(self, include_extra: bool = True): + """Initialize the formatter. + + Args: + include_extra: Whether to include extra fields from log records. + """ + super().__init__() + self.include_extra = include_extra + + def format(self, record: logging.LogRecord) -> str: + """Format a log record as JSON.""" + log_data: dict[str, Any] = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + + # Add exception info if present + if record.exc_info: + log_data["exception"] = self.formatException(record.exc_info) + + # Add context fields if present in the record + if self.include_extra: + for field in CONTEXT_FIELDS: + if hasattr(record, field): + value = getattr(record, field) + if value is not None: + log_data[field] = value + + return json.dumps(log_data, default=str) + + +class HumanReadableFormatter(logging.Formatter): + """Human-readable formatter for console output. + + Formats logs with timestamp, level, and message in a readable format. + Includes context fields inline when present. + """ + + def __init__(self): + super().__init__() + + def format(self, record: logging.LogRecord) -> str: + """Format a log record for human reading.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # Build context string from extra fields + context_parts = [] + for field in CONTEXT_FIELDS: + if hasattr(record, field): + value = getattr(record, field) + if value is not None: + # Format field name nicely + display_name = field.replace("_", " ").title() + if field == "duration_ms": + context_parts.append(f"{value}ms") + elif field == "cost_usd": + context_parts.append(f"${value:.4f}") + elif field == "tokens_used": + context_parts.append(f"{value} tokens") + else: + context_parts.append(f"{field}={value}") + + context_str = f" [{', '.join(context_parts)}]" if context_parts else "" + + base_msg = f"{timestamp} | {record.levelname:<8} | {record.name} | {record.getMessage()}{context_str}" + + # Add exception info if present + if record.exc_info: + base_msg += f"\n{self.formatException(record.exc_info)}" + + return base_msg + + +def setup_logging( + level: str = "INFO", + json_console: bool = False, + log_file: Optional[Path] = None, + quiet_libraries: bool = True, +) -> logging.Logger: + """Configure application-wide logging. + + Args: + level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL, NONE). + json_console: If True, output JSON to console instead of human-readable. + log_file: Optional path to write JSON logs to a file. + quiet_libraries: If True, suppress verbose logging from third-party libraries. + + Returns: + The root logger for the application. + """ + # Handle special "NONE" level + if level.upper() == "NONE": + logging.disable(logging.CRITICAL) + return logging.getLogger("arc_agi_benchmarking") + + # Enable logging if previously disabled + logging.disable(logging.NOTSET) + + # Get numeric level + numeric_level = getattr(logging, level.upper(), logging.INFO) + + # Configure the root logger so all loggers inherit the config + root_logger = logging.getLogger() + root_logger.setLevel(numeric_level) + + # Remove existing handlers to avoid duplicates + root_logger.handlers.clear() + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(numeric_level) + + if json_console: + console_handler.setFormatter(StructuredFormatter()) + else: + console_handler.setFormatter(HumanReadableFormatter()) + + root_logger.addHandler(console_handler) + + # File handler (always JSON) + if log_file: + log_file.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.FileHandler(log_file, mode="a") + file_handler.setLevel(numeric_level) + file_handler.setFormatter(StructuredFormatter()) + root_logger.addHandler(file_handler) + + # Quiet noisy libraries + if quiet_libraries: + noisy_loggers = [ + "httpx", + "httpcore", + "urllib3", + "requests", + "anthropic", + "google", + "openai", + "pydantic", + "transformers", + ] + for logger_name in noisy_loggers: + logging.getLogger(logger_name).setLevel(logging.WARNING) + + # Return the app logger for convenience + return logging.getLogger("arc_agi_benchmarking") + + +def get_logger(name: str) -> logging.Logger: + """Get a logger with the application prefix. + + Args: + name: Logger name (typically __name__). + + Returns: + A logger instance. + """ + # Ensure it's under the app namespace + if not name.startswith("arc_agi_benchmarking"): + name = f"arc_agi_benchmarking.{name}" + return logging.getLogger(name) + + +class LogContext: + """Context manager for adding fields to log records. + + Usage: + with LogContext(task_id="abc123", config="gpt-4o"): + logger.info("Processing task") # Will include task_id and config + """ + + def __init__(self, **kwargs: Any): + """Initialize with context fields.""" + self.context = kwargs + self._old_factory = None + + def __enter__(self): + """Add context to log records.""" + self._old_factory = logging.getLogRecordFactory() + context = self.context + + def record_factory(*args, **kwargs): + record = self._old_factory(*args, **kwargs) + for key, value in context.items(): + setattr(record, key, value) + return record + + logging.setLogRecordFactory(record_factory) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Restore original log record factory.""" + if self._old_factory: + logging.setLogRecordFactory(self._old_factory) + return False + + +def log_with_context( + logger: logging.Logger, + level: int, + message: str, + **context: Any, +) -> None: + """Log a message with additional context fields. + + Args: + logger: The logger to use. + level: Log level (e.g., logging.INFO). + message: Log message. + **context: Additional fields to include in the log record. + + Example: + log_with_context(logger, logging.INFO, "Task completed", + task_id="abc123", duration_ms=1500, tokens_used=2000) + """ + # Filter out keys that might already be set by the global record factory + # to avoid KeyError from logging's extra dict handling + safe_context = {} + for key, value in context.items(): + if key not in ["message", "asctime"]: + safe_context[key] = value + + # Create a temporary record factory that adds our context + old_factory = logging.getLogRecordFactory() + + def factory_with_context(*args, **kwargs): + record = old_factory(*args, **kwargs) + for key, value in safe_context.items(): + # Only set if not already set or if we're overriding with new value + setattr(record, key, value) + return record + + try: + logging.setLogRecordFactory(factory_with_context) + logger.log(level, message) + finally: + logging.setLogRecordFactory(old_factory)