From decc022755a874abac5b0f9d632e8d29c0447e01 Mon Sep 17 00:00:00 2001
From: Eric Campbell <ericcampbell59@gmail.com>
Date: Thu, 15 Jan 2026 14:34:10 -0600
Subject: [PATCH] Add centralized structured logging

- Create logging_utils.py with StructuredFormatter (JSON) and HumanReadableFormatter
- Console output uses human-readable format with timestamps
- File output uses JSON with ISO 8601 timestamps and context fields
- Support for context fields: task_id, config, duration_ms, tokens_used, cost_usd
- LogContext context manager for adding fields to log records
- Integrate into main.py and run_all.py entry points
- Add 17 unit tests for logging utilities
---
 cli/run_all.py                                |  35 +-
 main.py                                       |  64 +---
 .../tests/test_e2e_providers.py               |   8 +-
 .../tests/test_logging_utils.py               | 334 ++++++++++++++++++
 .../utils/logging_utils.py                    | 279 +++++++++++++++
 5 files changed, 636 insertions(+), 84 deletions(-)
 create mode 100644 src/arc_agi_benchmarking/tests/test_logging_utils.py
 create mode 100644 src/arc_agi_benchmarking/utils/logging_utils.py

diff --git a/cli/run_all.py b/cli/run_all.py
index 6f94b550..1d57b954 100644
--- a/cli/run_all.py
+++ b/cli/run_all.py
@@ -29,6 +29,7 @@
 from arc_agi_benchmarking.utils.rate_limiter import AsyncRequestRateLimiter
 from arc_agi_benchmarking.utils.metrics import set_metrics_enabled, set_metrics_filename_prefix
 from arc_agi_benchmarking.utils.preflight import run_preflight
+from arc_agi_benchmarking.utils.logging_utils import setup_logging, StructuredFormatter
 
 from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, before_sleep_log
 
@@ -165,23 +166,8 @@ def filter(self, record: logging.LogRecord) -> bool:
         config_token = LOG_CONFIG_CTX.set(config_name)
         task_token = LOG_TASK_CTX.set(task_id)
 
-        class _JsonFormatter(logging.Formatter):
-            def format(self, record):
-                payload = {
-                    "ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
-                    "level": record.levelname,
-                    "logger": record.name,
-                    "message": record.getMessage(),
-                }
-                if record.exc_info:
-                    payload["exc_info"] = self.formatException(record.exc_info)
-                # Include context for easier grepping
-                payload["config"] = config_name
-                payload["task_id"] = task_id
-                return json.dumps(payload)
-
         file_handler = logging.FileHandler(log_path, encoding="utf-8")
-        file_handler.setFormatter(_JsonFormatter())
+        file_handler.setFormatter(StructuredFormatter())
         file_handler.addFilter(_TaskFilter())
         logging.getLogger().addHandler(file_handler)
         logging.getLogger("openai").setLevel(logging.INFO)
@@ -421,21 +407,8 @@ async def main(task_list_file: Optional[str],
     # Set metrics enabled status based on CLI arg
     set_metrics_enabled(args.enable_metrics)
 
-    # Configure logging for the entire application based on --log-level
-    if args.log_level == "NONE":
-        log_level_to_set = logging.CRITICAL + 1 
-        logging.basicConfig(
-            level=log_level_to_set,
-            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-            handlers=[logging.StreamHandler(sys.stdout)] 
-        )
-    else:
-        log_level_to_set = getattr(logging, args.log_level.upper())
-        logging.basicConfig(
-            level=log_level_to_set,
-            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-            handlers=[logging.StreamHandler(sys.stdout)]
-        )
+    # Configure structured logging for the entire application
+    setup_logging(level=args.log_level, quiet_libraries=True)
 
     config_name = args.config.strip() if args.config else DEFAULT_MODEL_CONFIG
     if not config_name:
diff --git a/main.py b/main.py
index 1cc22cfa..47b21c01 100644
--- a/main.py
+++ b/main.py
@@ -13,6 +13,7 @@
 from dotenv import load_dotenv
 import arc_agi_benchmarking.utils as utils
 from arc_agi_benchmarking.utils.metrics import timeit, set_metrics_enabled
+from arc_agi_benchmarking.utils.logging_utils import setup_logging
 from arc_agi_benchmarking.schemas import ARCPair, Attempt
 from arc_agi_benchmarking.prompts.prompt_manager import convert_task_pairs_to_prompt
 from typing import List, Optional
@@ -260,60 +261,25 @@ def main_cli(cli_args: Optional[List[str]] = None):
     # Set metrics enabled status based on CLI arg first
     set_metrics_enabled(args.enable_metrics)
 
-    # # Prepare OpenAI SDK file logging in logs/<config>/<task_id>/openai.jsonl
+    # Prepare log file path for structured JSON logs
     log_dir = Path("logs") / args.config / args.task_id
     log_dir.mkdir(parents=True, exist_ok=True)
     log_path = log_dir / "openai.jsonl"
 
-    class _JsonFormatter(logging.Formatter):
-        def format(self, record):
-            payload = {
-                "ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
-                "level": record.levelname,
-                "logger": record.name,
-                "message": record.getMessage(),
-            }
-            if record.exc_info:
-                payload["exc_info"] = self.formatException(record.exc_info)
-            return json.dumps(payload)
-
-    file_handler = logging.FileHandler(log_path, encoding="utf-8")
-    file_handler.setFormatter(_JsonFormatter())
-
-    # Configure logging
+    # Configure structured logging (writes to console and file)
+    log_level = "DEBUG" if args.verbose else args.log_level
+    setup_logging(
+        level=log_level,
+        log_file=log_path,
+        quiet_libraries=True,
+    )
+
+    # Enable OpenAI SDK logging to capture API calls
+    logging.getLogger('openai').setLevel(logging.INFO)
+
     if args.verbose:
-        # Verbose mode: Show DEBUG for our code, WARNING+ for libraries
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-        )
-        
-        # Set library loggers to WARNING to reduce noise
-        library_loggers = [
-            'httpx', 'httpcore', 'urllib3', 'requests', 
-            'anthropic', 'google', 'pydantic', 'transformers'
-        ]
-        for lib_logger in library_loggers:
-            logging.getLogger(lib_logger).setLevel(logging.WARNING)
-        
-        # Keep our application loggers at DEBUG
-        logging.getLogger('arc_agi_benchmarking').setLevel(logging.DEBUG)
-        logging.getLogger('__main__').setLevel(logging.DEBUG)
-        
-        logger.info("Verbose mode enabled - showing debug output for arc_agi_benchmarking only")
-    else:
-        # Normal mode: Use the specified log level
-        logging.basicConfig(
-            level=getattr(logging, args.log_level.upper()),
-            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-        )
-
-    # Attach file handler for OpenAI SDK and app logs, and ensure OpenAI logger is enabled
-    root_logger = logging.getLogger()
-    root_logger.addHandler(file_handler)
-    openai_logger = logging.getLogger('openai')
-    openai_logger.setLevel(logging.INFO)
-    logger.info(f"OpenAI SDK logs will be written to {log_path}")
+        logger.info("Verbose mode enabled - showing debug output")
+    logger.info(f"Structured logs will be written to {log_path}")
 
     arc_solver = ARCTester(
         config=args.config,
diff --git a/src/arc_agi_benchmarking/tests/test_e2e_providers.py b/src/arc_agi_benchmarking/tests/test_e2e_providers.py
index c9857319..9c8c7b60 100644
--- a/src/arc_agi_benchmarking/tests/test_e2e_providers.py
+++ b/src/arc_agi_benchmarking/tests/test_e2e_providers.py
@@ -141,10 +141,10 @@ def test_gpt_4o_provider_e2e_mocked(mock_init_provider): # REMOVE mock_backscan_
 @patch('main.ARCTester.generate_task_solution')
 @patch('main.ARCTester.init_provider')
 @patch('main.utils.read_models_config') # Patched at the source where it's used by main.py
-@patch('main.logging.basicConfig') # To suppress logging output during tests
+@patch('main.setup_logging') # To suppress logging output during tests
 @patch('main.set_metrics_enabled') # To avoid side effects with metrics
 def test_main_cli_execution_and_arctester_mocking(mock_set_metrics_enabled,
-                                           mock_logging_basic_config,
+                                           mock_setup_logging,
                                            mock_read_models_config,
                                            mock_init_provider,
                                            mock_generate_task_solution):
@@ -202,6 +202,6 @@ def test_main_cli_execution_and_arctester_mocking(mock_set_metrics_enabled,
 
     mock_set_metrics_enabled.assert_called_once_with(False) # Default behavior
     print(f"mock_set_metrics_enabled called with: {mock_set_metrics_enabled.call_args}")
-    
-    mock_logging_basic_config.assert_called_once()
+
+    mock_setup_logging.assert_called_once()
     print("✅ test_main_cli_execution_and_arctester_mocking completed successfully.")
diff --git a/src/arc_agi_benchmarking/tests/test_logging_utils.py b/src/arc_agi_benchmarking/tests/test_logging_utils.py
new file mode 100644
index 00000000..884a0f3b
--- /dev/null
+++ b/src/arc_agi_benchmarking/tests/test_logging_utils.py
@@ -0,0 +1,334 @@
+"""Tests for structured logging utilities."""
+
+import json
+import logging
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from arc_agi_benchmarking.utils.logging_utils import (
+    HumanReadableFormatter,
+    LogContext,
+    StructuredFormatter,
+    get_logger,
+    log_with_context,
+    setup_logging,
+)
+
+
+class TestStructuredFormatter:
+    """Tests for StructuredFormatter."""
+
+    def test_basic_format(self):
+        """Test basic JSON formatting."""
+        formatter = StructuredFormatter()
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.INFO,
+            pathname="test.py",
+            lineno=1,
+            msg="Test message",
+            args=(),
+            exc_info=None,
+        )
+
+        output = formatter.format(record)
+        data = json.loads(output)
+
+        assert data["level"] == "INFO"
+        assert data["logger"] == "test.logger"
+        assert data["message"] == "Test message"
+        assert "timestamp" in data
+
+    def test_format_with_args(self):
+        """Test formatting with message arguments."""
+        formatter = StructuredFormatter()
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.INFO,
+            pathname="test.py",
+            lineno=1,
+            msg="Task %s completed in %dms",
+            args=("abc123", 500),
+            exc_info=None,
+        )
+
+        output = formatter.format(record)
+        data = json.loads(output)
+
+        assert data["message"] == "Task abc123 completed in 500ms"
+
+    def test_format_with_exception(self):
+        """Test formatting with exception info."""
+        formatter = StructuredFormatter()
+
+        try:
+            raise ValueError("Test error")
+        except ValueError:
+            import sys
+
+            exc_info = sys.exc_info()
+
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.ERROR,
+            pathname="test.py",
+            lineno=1,
+            msg="An error occurred",
+            args=(),
+            exc_info=exc_info,
+        )
+
+        output = formatter.format(record)
+        data = json.loads(output)
+
+        assert data["level"] == "ERROR"
+        assert "exception" in data
+        assert "ValueError: Test error" in data["exception"]
+
+    def test_format_with_context_fields(self):
+        """Test formatting with extra context fields."""
+        formatter = StructuredFormatter()
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.INFO,
+            pathname="test.py",
+            lineno=1,
+            msg="Processing",
+            args=(),
+            exc_info=None,
+        )
+        record.task_id = "task123"
+        record.config = "gpt-4o"
+        record.duration_ms = 1500
+        record.tokens_used = 2000
+        record.cost_usd = 0.0045
+
+        output = formatter.format(record)
+        data = json.loads(output)
+
+        assert data["task_id"] == "task123"
+        assert data["config"] == "gpt-4o"
+        assert data["duration_ms"] == 1500
+        assert data["tokens_used"] == 2000
+        assert data["cost_usd"] == 0.0045
+
+    def test_format_excludes_none_values(self):
+        """Test that None values are excluded from output."""
+        formatter = StructuredFormatter()
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.INFO,
+            pathname="test.py",
+            lineno=1,
+            msg="Test",
+            args=(),
+            exc_info=None,
+        )
+        record.task_id = "task123"
+        record.config = None  # Should be excluded
+
+        output = formatter.format(record)
+        data = json.loads(output)
+
+        assert data["task_id"] == "task123"
+        assert "config" not in data
+
+
+class TestHumanReadableFormatter:
+    """Tests for HumanReadableFormatter."""
+
+    def test_basic_format(self):
+        """Test basic human-readable formatting."""
+        formatter = HumanReadableFormatter()
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.INFO,
+            pathname="test.py",
+            lineno=1,
+            msg="Test message",
+            args=(),
+            exc_info=None,
+        )
+
+        output = formatter.format(record)
+
+        assert "INFO" in output
+        assert "test.logger" in output
+        assert "Test message" in output
+
+    def test_format_with_context(self):
+        """Test formatting with context fields."""
+        formatter = HumanReadableFormatter()
+        record = logging.LogRecord(
+            name="test.logger",
+            level=logging.INFO,
+            pathname="test.py",
+            lineno=1,
+            msg="Task completed",
+            args=(),
+            exc_info=None,
+        )
+        record.task_id = "task123"
+        record.duration_ms = 1500
+        record.cost_usd = 0.0045
+        record.tokens_used = 2000
+
+        output = formatter.format(record)
+
+        assert "task_id=task123" in output
+        assert "1500ms" in output
+        assert "$0.0045" in output
+        assert "2000 tokens" in output
+
+
+class TestSetupLogging:
+    """Tests for setup_logging function."""
+
+    def test_setup_default(self):
+        """Test default logging setup."""
+        logger = setup_logging(level="INFO")
+
+        assert logger.name == "arc_agi_benchmarking"
+        # Level is set on root logger, app logger inherits via getEffectiveLevel
+        assert logger.getEffectiveLevel() == logging.INFO
+        # Handlers are on root logger
+        assert len(logging.getLogger().handlers) == 1
+
+    def test_setup_with_file(self):
+        """Test logging setup with file output."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            log_file = Path(tmpdir) / "test.jsonl"
+            logger = setup_logging(level="DEBUG", log_file=log_file)
+
+            # Handlers are on root logger
+            assert len(logging.getLogger().handlers) == 2  # Console + file
+
+            # Log something and check file
+            logger.info("Test message")
+
+            assert log_file.exists()
+            with open(log_file) as f:
+                line = f.readline()
+                data = json.loads(line)
+                assert data["message"] == "Test message"
+
+    def test_setup_none_level(self):
+        """Test that NONE level disables logging."""
+        logger = setup_logging(level="NONE")
+
+        # Should not raise but logging should be disabled
+        logger.critical("This should not appear")
+
+        # Re-enable for other tests
+        logging.disable(logging.NOTSET)
+
+    def test_setup_json_console(self):
+        """Test JSON console output."""
+        logger = setup_logging(level="INFO", json_console=True)
+
+        # Check that formatter is StructuredFormatter (on root logger)
+        assert isinstance(logging.getLogger().handlers[0].formatter, StructuredFormatter)
+
+    def test_quiet_libraries(self):
+        """Test that noisy libraries are quieted."""
+        setup_logging(level="DEBUG", quiet_libraries=True)
+
+        httpx_logger = logging.getLogger("httpx")
+        assert httpx_logger.level == logging.WARNING
+
+        openai_logger = logging.getLogger("openai")
+        assert openai_logger.level == logging.WARNING
+
+
+class TestGetLogger:
+    """Tests for get_logger function."""
+
+    def test_get_logger_with_module_name(self):
+        """Test getting logger with module name."""
+        logger = get_logger("arc_agi_benchmarking.adapters.openai")
+        assert logger.name == "arc_agi_benchmarking.adapters.openai"
+
+    def test_get_logger_adds_prefix(self):
+        """Test that prefix is added for non-app loggers."""
+        logger = get_logger("my_module")
+        assert logger.name == "arc_agi_benchmarking.my_module"
+
+
+class TestLogContext:
+    """Tests for LogContext context manager."""
+
+    def test_context_adds_fields(self):
+        """Test that context adds fields to log records."""
+        setup_logging(level="DEBUG")
+        logger = get_logger("test")
+
+        records = []
+
+        class RecordCapture(logging.Handler):
+            def emit(self, record):
+                records.append(record)
+
+        logger.addHandler(RecordCapture())
+
+        with LogContext(task_id="task123", config="gpt-4o"):
+            logger.info("Test message")
+
+        assert len(records) == 1
+        assert records[0].task_id == "task123"
+        assert records[0].config == "gpt-4o"
+
+    def test_context_restored_after_exit(self):
+        """Test that context is properly restored after exiting."""
+        setup_logging(level="DEBUG")
+        logger = get_logger("test2")
+
+        records = []
+
+        class RecordCapture(logging.Handler):
+            def emit(self, record):
+                records.append(record)
+
+        logger.addHandler(RecordCapture())
+
+        with LogContext(task_id="task123"):
+            logger.info("Inside context")
+
+        logger.info("Outside context")
+
+        assert len(records) == 2
+        assert hasattr(records[0], "task_id")
+        # Outside context should not have task_id
+        assert not hasattr(records[1], "task_id") or records[1].task_id is None
+
+
+class TestLogWithContext:
+    """Tests for log_with_context function."""
+
+    def test_log_with_context(self):
+        """Test logging with context fields."""
+        setup_logging(level="DEBUG")
+        logger = get_logger("test3")
+
+        records = []
+
+        class RecordCapture(logging.Handler):
+            def emit(self, record):
+                records.append(record)
+
+        logger.addHandler(RecordCapture())
+
+        log_with_context(
+            logger,
+            logging.INFO,
+            "Task completed",
+            task_id="task123",
+            duration_ms=1500,
+            tokens_used=2000,
+        )
+
+        assert len(records) == 1
+        assert records[0].task_id == "task123"
+        assert records[0].duration_ms == 1500
+        assert records[0].tokens_used == 2000
diff --git a/src/arc_agi_benchmarking/utils/logging_utils.py b/src/arc_agi_benchmarking/utils/logging_utils.py
new file mode 100644
index 00000000..9e76f6d0
--- /dev/null
+++ b/src/arc_agi_benchmarking/utils/logging_utils.py
@@ -0,0 +1,279 @@
+"""Centralized structured logging configuration.
+
+Provides JSON-formatted logging with consistent fields across the application.
+Supports both console and file output with configurable formats.
+"""
+
+import json
+import logging
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+# Standard fields included in all log records
+STANDARD_FIELDS = [
+    "timestamp",
+    "level",
+    "logger",
+    "message",
+]
+
+# Optional context fields that may be present
+CONTEXT_FIELDS = [
+    "task_id",
+    "config",
+    "provider",
+    "attempt",
+    "duration_ms",
+    "tokens_used",
+    "cost_usd",
+    "error_type",
+]
+
+
+class StructuredFormatter(logging.Formatter):
+    """JSON formatter for structured logging.
+
+    Outputs logs as single-line JSON objects with consistent fields.
+    Additional context can be added via the `extra` parameter in log calls.
+    """
+
+    def __init__(self, include_extra: bool = True):
+        """Initialize the formatter.
+
+        Args:
+            include_extra: Whether to include extra fields from log records.
+        """
+        super().__init__()
+        self.include_extra = include_extra
+
+    def format(self, record: logging.LogRecord) -> str:
+        """Format a log record as JSON."""
+        log_data: dict[str, Any] = {
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+
+        # Add exception info if present
+        if record.exc_info:
+            log_data["exception"] = self.formatException(record.exc_info)
+
+        # Add context fields if present in the record
+        if self.include_extra:
+            for field in CONTEXT_FIELDS:
+                if hasattr(record, field):
+                    value = getattr(record, field)
+                    if value is not None:
+                        log_data[field] = value
+
+        return json.dumps(log_data, default=str)
+
+
+class HumanReadableFormatter(logging.Formatter):
+    """Human-readable formatter for console output.
+
+    Formats logs with timestamp, level, and message in a readable format.
+    Includes context fields inline when present.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def format(self, record: logging.LogRecord) -> str:
+        """Format a log record for human reading."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Build context string from extra fields
+        context_parts = []
+        for field in CONTEXT_FIELDS:
+            if hasattr(record, field):
+                value = getattr(record, field)
+                if value is not None:
+                    # Format field name nicely
+                    display_name = field.replace("_", " ").title()
+                    if field == "duration_ms":
+                        context_parts.append(f"{value}ms")
+                    elif field == "cost_usd":
+                        context_parts.append(f"${value:.4f}")
+                    elif field == "tokens_used":
+                        context_parts.append(f"{value} tokens")
+                    else:
+                        context_parts.append(f"{field}={value}")
+
+        context_str = f" [{', '.join(context_parts)}]" if context_parts else ""
+
+        base_msg = f"{timestamp} | {record.levelname:<8} | {record.name} | {record.getMessage()}{context_str}"
+
+        # Add exception info if present
+        if record.exc_info:
+            base_msg += f"\n{self.formatException(record.exc_info)}"
+
+        return base_msg
+
+
+def setup_logging(
+    level: str = "INFO",
+    json_console: bool = False,
+    log_file: Optional[Path] = None,
+    quiet_libraries: bool = True,
+) -> logging.Logger:
+    """Configure application-wide logging.
+
+    Args:
+        level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL, NONE).
+        json_console: If True, output JSON to console instead of human-readable.
+        log_file: Optional path to write JSON logs to a file.
+        quiet_libraries: If True, suppress verbose logging from third-party libraries.
+
+    Returns:
+        The root logger for the application.
+    """
+    # Handle special "NONE" level
+    if level.upper() == "NONE":
+        logging.disable(logging.CRITICAL)
+        return logging.getLogger("arc_agi_benchmarking")
+
+    # Enable logging if previously disabled
+    logging.disable(logging.NOTSET)
+
+    # Get numeric level
+    numeric_level = getattr(logging, level.upper(), logging.INFO)
+
+    # Configure the root logger so all loggers inherit the config
+    root_logger = logging.getLogger()
+    root_logger.setLevel(numeric_level)
+
+    # Remove existing handlers to avoid duplicates
+    root_logger.handlers.clear()
+
+    # Console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(numeric_level)
+
+    if json_console:
+        console_handler.setFormatter(StructuredFormatter())
+    else:
+        console_handler.setFormatter(HumanReadableFormatter())
+
+    root_logger.addHandler(console_handler)
+
+    # File handler (always JSON)
+    if log_file:
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+        file_handler = logging.FileHandler(log_file, mode="a")
+        file_handler.setLevel(numeric_level)
+        file_handler.setFormatter(StructuredFormatter())
+        root_logger.addHandler(file_handler)
+
+    # Quiet noisy libraries
+    if quiet_libraries:
+        noisy_loggers = [
+            "httpx",
+            "httpcore",
+            "urllib3",
+            "requests",
+            "anthropic",
+            "google",
+            "openai",
+            "pydantic",
+            "transformers",
+        ]
+        for logger_name in noisy_loggers:
+            logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+    # Return the app logger for convenience
+    return logging.getLogger("arc_agi_benchmarking")
+
+
+def get_logger(name: str) -> logging.Logger:
+    """Get a logger with the application prefix.
+
+    Args:
+        name: Logger name (typically __name__).
+
+    Returns:
+        A logger instance.
+    """
+    # Ensure it's under the app namespace
+    if not name.startswith("arc_agi_benchmarking"):
+        name = f"arc_agi_benchmarking.{name}"
+    return logging.getLogger(name)
+
+
+class LogContext:
+    """Context manager for adding fields to log records.
+
+    Usage:
+        with LogContext(task_id="abc123", config="gpt-4o"):
+            logger.info("Processing task")  # Will include task_id and config
+    """
+
+    def __init__(self, **kwargs: Any):
+        """Initialize with context fields."""
+        self.context = kwargs
+        self._old_factory = None
+
+    def __enter__(self):
+        """Add context to log records."""
+        self._old_factory = logging.getLogRecordFactory()
+        context = self.context
+
+        def record_factory(*args, **kwargs):
+            record = self._old_factory(*args, **kwargs)
+            for key, value in context.items():
+                setattr(record, key, value)
+            return record
+
+        logging.setLogRecordFactory(record_factory)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Restore original log record factory."""
+        if self._old_factory:
+            logging.setLogRecordFactory(self._old_factory)
+        return False
+
+
+def log_with_context(
+    logger: logging.Logger,
+    level: int,
+    message: str,
+    **context: Any,
+) -> None:
+    """Log a message with additional context fields.
+
+    Args:
+        logger: The logger to use.
+        level: Log level (e.g., logging.INFO).
+        message: Log message.
+        **context: Additional fields to include in the log record.
+
+    Example:
+        log_with_context(logger, logging.INFO, "Task completed",
+                        task_id="abc123", duration_ms=1500, tokens_used=2000)
+    """
+    # Filter out keys that might already be set by the global record factory
+    # to avoid KeyError from logging's extra dict handling
+    safe_context = {}
+    for key, value in context.items():
+        if key not in ["message", "asctime"]:
+            safe_context[key] = value
+
+    # Create a temporary record factory that adds our context
+    old_factory = logging.getLogRecordFactory()
+
+    def factory_with_context(*args, **kwargs):
+        record = old_factory(*args, **kwargs)
+        for key, value in safe_context.items():
+            # Only set if not already set or if we're overriding with new value
+            setattr(record, key, value)
+        return record
+
+    try:
+        logging.setLogRecordFactory(factory_with_context)
+        logger.log(level, message)
+    finally:
+        logging.setLogRecordFactory(old_factory)