Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 4 additions & 31 deletions cli/run_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from arc_agi_benchmarking.utils.rate_limiter import AsyncRequestRateLimiter
from arc_agi_benchmarking.utils.metrics import set_metrics_enabled, set_metrics_filename_prefix
from arc_agi_benchmarking.utils.preflight import run_preflight
from arc_agi_benchmarking.utils.logging_utils import setup_logging, StructuredFormatter

from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, before_sleep_log

Expand Down Expand Up @@ -165,23 +166,8 @@ def filter(self, record: logging.LogRecord) -> bool:
config_token = LOG_CONFIG_CTX.set(config_name)
task_token = LOG_TASK_CTX.set(task_id)

class _JsonFormatter(logging.Formatter):
def format(self, record):
payload = {
"ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
"level": record.levelname,
"logger": record.name,
"message": record.getMessage(),
}
if record.exc_info:
payload["exc_info"] = self.formatException(record.exc_info)
# Include context for easier grepping
payload["config"] = config_name
payload["task_id"] = task_id
return json.dumps(payload)

file_handler = logging.FileHandler(log_path, encoding="utf-8")
file_handler.setFormatter(_JsonFormatter())
file_handler.setFormatter(StructuredFormatter())
file_handler.addFilter(_TaskFilter())
logging.getLogger().addHandler(file_handler)
logging.getLogger("openai").setLevel(logging.INFO)
Expand Down Expand Up @@ -421,21 +407,8 @@ async def main(task_list_file: Optional[str],
# Set metrics enabled status based on CLI arg
set_metrics_enabled(args.enable_metrics)

# Configure logging for the entire application based on --log-level
if args.log_level == "NONE":
log_level_to_set = logging.CRITICAL + 1
logging.basicConfig(
level=log_level_to_set,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[logging.StreamHandler(sys.stdout)]
)
else:
log_level_to_set = getattr(logging, args.log_level.upper())
logging.basicConfig(
level=log_level_to_set,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[logging.StreamHandler(sys.stdout)]
)
# Configure structured logging for the entire application
setup_logging(level=args.log_level, quiet_libraries=True)

config_name = args.config.strip() if args.config else DEFAULT_MODEL_CONFIG
if not config_name:
Expand Down
64 changes: 15 additions & 49 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dotenv import load_dotenv
import arc_agi_benchmarking.utils as utils
from arc_agi_benchmarking.utils.metrics import timeit, set_metrics_enabled
from arc_agi_benchmarking.utils.logging_utils import setup_logging
from arc_agi_benchmarking.schemas import ARCPair, Attempt
from arc_agi_benchmarking.prompts.prompt_manager import convert_task_pairs_to_prompt
from typing import List, Optional
Expand Down Expand Up @@ -260,60 +261,25 @@ def main_cli(cli_args: Optional[List[str]] = None):
# Set metrics enabled status based on CLI arg first
set_metrics_enabled(args.enable_metrics)

# # Prepare OpenAI SDK file logging in logs/<config>/<task_id>/openai.jsonl
# Prepare log file path for structured JSON logs
log_dir = Path("logs") / args.config / args.task_id
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / "openai.jsonl"

class _JsonFormatter(logging.Formatter):
def format(self, record):
payload = {
"ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
"level": record.levelname,
"logger": record.name,
"message": record.getMessage(),
}
if record.exc_info:
payload["exc_info"] = self.formatException(record.exc_info)
return json.dumps(payload)

file_handler = logging.FileHandler(log_path, encoding="utf-8")
file_handler.setFormatter(_JsonFormatter())

# Configure logging
# Configure structured logging (writes to console and file)
log_level = "DEBUG" if args.verbose else args.log_level
setup_logging(
level=log_level,
log_file=log_path,
quiet_libraries=True,
)

# Enable OpenAI SDK logging to capture API calls
logging.getLogger('openai').setLevel(logging.INFO)

if args.verbose:
# Verbose mode: Show DEBUG for our code, WARNING+ for libraries
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Set library loggers to WARNING to reduce noise
library_loggers = [
'httpx', 'httpcore', 'urllib3', 'requests',
'anthropic', 'google', 'pydantic', 'transformers'
]
for lib_logger in library_loggers:
logging.getLogger(lib_logger).setLevel(logging.WARNING)

# Keep our application loggers at DEBUG
logging.getLogger('arc_agi_benchmarking').setLevel(logging.DEBUG)
logging.getLogger('__main__').setLevel(logging.DEBUG)

logger.info("Verbose mode enabled - showing debug output for arc_agi_benchmarking only")
else:
# Normal mode: Use the specified log level
logging.basicConfig(
level=getattr(logging, args.log_level.upper()),
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Attach file handler for OpenAI SDK and app logs, and ensure OpenAI logger is enabled
root_logger = logging.getLogger()
root_logger.addHandler(file_handler)
openai_logger = logging.getLogger('openai')
openai_logger.setLevel(logging.INFO)
logger.info(f"OpenAI SDK logs will be written to {log_path}")
logger.info("Verbose mode enabled - showing debug output")
logger.info(f"Structured logs will be written to {log_path}")

arc_solver = ARCTester(
config=args.config,
Expand Down
8 changes: 4 additions & 4 deletions src/arc_agi_benchmarking/tests/test_e2e_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,10 @@ def test_gpt_4o_provider_e2e_mocked(mock_init_provider): # REMOVE mock_backscan_
@patch('main.ARCTester.generate_task_solution')
@patch('main.ARCTester.init_provider')
@patch('main.utils.read_models_config') # Patched at the source where it's used by main.py
@patch('main.logging.basicConfig') # To suppress logging output during tests
@patch('main.setup_logging') # To suppress logging output during tests
@patch('main.set_metrics_enabled') # To avoid side effects with metrics
def test_main_cli_execution_and_arctester_mocking(mock_set_metrics_enabled,
mock_logging_basic_config,
mock_setup_logging,
mock_read_models_config,
mock_init_provider,
mock_generate_task_solution):
Expand Down Expand Up @@ -202,6 +202,6 @@ def test_main_cli_execution_and_arctester_mocking(mock_set_metrics_enabled,

mock_set_metrics_enabled.assert_called_once_with(False) # Default behavior
print(f"mock_set_metrics_enabled called with: {mock_set_metrics_enabled.call_args}")
mock_logging_basic_config.assert_called_once()

mock_setup_logging.assert_called_once()
print("✅ test_main_cli_execution_and_arctester_mocking completed successfully.")
Loading