Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions src/ai_crawler/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,16 +561,18 @@ def run_recipe_command(recipe_path: str, output_path: str) -> int:
if result.checkpoint_path
else ""
)
print(
summary = (
"ai-crawler run: "
f"recipe={result.recipe_name} "
f"items_written={result.items_written} "
f"pages_scheduled={result.pages_scheduled} "
f"pages_attempted={result.pages_attempted} "
f"requests_attempted={result.requests_attempted} "
f"stop_reason={result.stop_reason} "
f"output={result.output_path}"
f"output={output_path}"
f"{checkpoint_summary}"
)
print(summary)
return 0


Expand All @@ -596,17 +598,18 @@ def test_recipe_command(recipe_path: str, output_path: str, report_path: str) ->
_write_tool_report(result=result, report_path=report_path)
crawl_result = _artifact_dict(result, "crawl_result")
test_report = _artifact_dict(result, "test_report")
print(
summary = (
"ai-crawler test-recipe: "
f"recipe={crawl_result.get('recipe_name', recipe.name)} "
f"items_written={crawl_result.get('items_written', 0)} "
f"pages_scheduled={crawl_result.get('pages_scheduled', 0)} "
f"failure_reason={test_report.get('failure_reason', '')} "
f"output={output_path} "
f"report={report_path}"
)
print(summary)
return 0


def repair_recipe_command(recipe_path: str, report_path: str, output_path: str) -> int:
"""Repair one recipe using the single JSON report written by test-recipe."""
normalized_report_path = str(Path(report_path).resolve())
Expand Down
1 change: 1 addition & 0 deletions src/ai_crawler/core/agent/recipe_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def _test_report(fetcher: RecordingRecipeFetcher, crawl_result: CrawlResult) ->
"content_type": "",
"body_sample": "",
"stop_reason": crawl_result.stop_reason,
"pages_scheduled": crawl_result.pages_scheduled,
"pages_attempted": crawl_result.pages_attempted,
"requests_attempted": crawl_result.requests_attempted,
"failure_reason": _failure_reason(response=response, crawl_result=crawl_result),
Expand Down
1 change: 1 addition & 0 deletions src/ai_crawler/core/models/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class CrawlResult(DomainModel):
recipe_name: str = Field(min_length=1)
items_written: int = Field(ge=0)
output_path: str = Field(min_length=1)
pages_scheduled: int = Field(default=0, ge=0)
pages_attempted: int = Field(default=0, ge=0)
requests_attempted: int = Field(default=0, ge=0)
stop_reason: RunnerStopReason = "completed"
Expand Down
13 changes: 13 additions & 0 deletions src/ai_crawler/core/runner/recipe_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class RunnerCheckpoint(DomainModel):
@dataclass(slots=True)
class RunState:
items_written: int
pages_scheduled: int
pages_attempted: int
requests_attempted: int
stop_reason: RunnerStopReason
Expand Down Expand Up @@ -120,6 +121,7 @@ def run(self, recipe: Recipe) -> CrawlResult:
recipe_name=recipe.name,
items_written=state.items_written,
output_path=str(output_path),
pages_scheduled=state.pages_scheduled,
pages_attempted=state.pages_attempted,
requests_attempted=state.requests_attempted,
stop_reason=state.stop_reason,
Expand All @@ -138,6 +140,7 @@ def _run_sequential(
started_at: float,
) -> RunState:
current_request_index = next_request_index
pages_scheduled = 0
pages_attempted = 0
requests_attempted = 0
stop_reason: RunnerStopReason = "completed"
Expand All @@ -154,6 +157,7 @@ def _run_sequential(
stop_reason = "max_seconds_reached"
break
current_request_index = request_index
pages_scheduled += 1
pages_attempted += 1
response, request_attempts, stop_reason = self._fetch_with_retries(
request=request,
Expand Down Expand Up @@ -185,6 +189,7 @@ def _run_sequential(
)
return RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason=stop_reason,
Expand Down Expand Up @@ -260,6 +265,7 @@ async def _run_concurrent_async(
next_schedule_offset += 1
current_request_index = next_request_index
next_flush_index = next_request_index
pages_scheduled = next_schedule_offset
pages_attempted = 0
requests_attempted = 0
stop_reason: RunnerStopReason = "completed"
Expand All @@ -274,6 +280,7 @@ async def _run_concurrent_async(
if remaining_timeout == 0:
terminal_state = RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason="max_seconds_reached",
Expand All @@ -288,6 +295,7 @@ async def _run_concurrent_async(
if not done:
terminal_state = RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason="max_seconds_reached",
Expand All @@ -309,6 +317,7 @@ async def _run_concurrent_async(
if response is None or stop_reason == "non_success_status":
terminal_state = RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason=stop_reason,
Expand All @@ -325,6 +334,7 @@ async def _run_concurrent_async(
if stop_reason == "max_items_reached":
terminal_state = RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason=stop_reason,
Expand All @@ -335,6 +345,7 @@ async def _run_concurrent_async(
stop_reason = "empty_page"
terminal_state = RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason=stop_reason,
Expand Down Expand Up @@ -374,6 +385,7 @@ async def _run_concurrent_async(
)
)
next_schedule_offset += 1
pages_scheduled = next_schedule_offset

if pending_tasks:
for task in pending_tasks:
Expand All @@ -385,6 +397,7 @@ async def _run_concurrent_async(

return RunState(
items_written=items_written,
pages_scheduled=pages_scheduled,
pages_attempted=pages_attempted,
requests_attempted=requests_attempted,
stop_reason=stop_reason,
Expand Down
2 changes: 2 additions & 0 deletions tests/component/core/runner/test_recipe_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ def run_recipe() -> None:
assert not error_holder
result = result_holder["result"]
assert result.items_written == 1
assert result.pages_scheduled == 3
assert result.pages_attempted == 1
assert result.requests_attempted == 1
assert result.stop_reason == "max_seconds_reached"
Expand Down Expand Up @@ -592,6 +593,7 @@ def test_recipe_runner_stops_concurrent_run_at_max_items_without_fetching_later_

assert fetcher.page_two_done.is_set()
assert result.items_written == 2
assert result.pages_scheduled == 2
assert result.pages_attempted == 2
assert result.requests_attempted == 2
assert result.stop_reason == "max_items_reached"
Expand Down
7 changes: 6 additions & 1 deletion tests/unit/cli/test_run_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ def test_run_command_loads_recipe_executes_runner_and_prints_summary(
assert exit_code == 0
assert capsys.readouterr().out.strip() == (
"ai-crawler run: "
f"recipe=products-api items_written=1 pages_attempted=1 requests_attempted=1 "
"recipe=products-api "
"items_written=1 "
"pages_scheduled=1 "
"pages_attempted=1 "
"requests_attempted=1 "
f"stop_reason=completed output={output_path}"
)
assert output_path.read_text(encoding="utf-8") == (
Expand Down Expand Up @@ -101,6 +105,7 @@ def test_run_command_prints_checkpoint_summary_when_run_stops_with_resume_state(
"ai-crawler run: "
"recipe=products-api "
"items_written=1 "
"pages_scheduled=1 "
"pages_attempted=1 "
"requests_attempted=1 "
"stop_reason=max_seconds_reached "
Expand Down
7 changes: 6 additions & 1 deletion tests/unit/cli/test_test_recipe_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ def test_test_recipe_command_executes_tool_writes_report_and_prints_summary(
assert exit_code == 0
assert capsys.readouterr().out.strip() == (
"ai-crawler test-recipe: "
f"recipe=products-api items_written=1 failure_reason= output={output_path} "
"recipe=products-api "
"items_written=1 "
"pages_scheduled=1 "
f"failure_reason= output={output_path} "
f"report={report_path}"
)
assert output_path.read_text(encoding="utf-8") == (
Expand All @@ -69,6 +72,7 @@ def test_test_recipe_command_executes_tool_writes_report_and_prints_summary(
"recipe_name": "products-api",
"items_written": 1,
"output_path": str(output_path),
"pages_scheduled": 1,
"pages_attempted": 1,
"requests_attempted": 1,
"stop_reason": "completed",
Expand All @@ -79,6 +83,7 @@ def test_test_recipe_command_executes_tool_writes_report_and_prints_summary(
"content_type": "application/json",
"body_sample": '{"items": [{"name": "Keyboard", "price": 120}]}',
"stop_reason": "completed",
"pages_scheduled": 1,
"pages_attempted": 1,
"requests_attempted": 1,
"failure_reason": "",
Expand Down
1 change: 1 addition & 0 deletions tests/unit/core/agent/test_agent_recipe_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def test_agent_controller_hands_generated_recipe_artifact_to_test_recipe_tool(tm
"recipe_name": "products-api",
"items_written": 0,
"output_path": str(output_path),
"pages_scheduled": 1,
"pages_attempted": 1,
"requests_attempted": 1,
"stop_reason": "empty_page",
Expand Down
4 changes: 4 additions & 0 deletions tests/unit/core/agent/test_recipe_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def test_test_recipe_tool_runs_recipe_and_returns_crawl_result_artifact(tmp_path
"recipe_name": "products-api",
"items_written": 1,
"output_path": str(output_path),
"pages_scheduled": 1,
"pages_attempted": 1,
"requests_attempted": 1,
"stop_reason": "completed",
Expand All @@ -84,6 +85,7 @@ def test_test_recipe_tool_runs_recipe_and_returns_crawl_result_artifact(tmp_path
"content_type": "application/json",
"body_sample": '{"items": [{"name": "Keyboard", "price": 120}]}',
"stop_reason": "completed",
"pages_scheduled": 1,
"pages_attempted": 1,
"requests_attempted": 1,
"failure_reason": "",
Expand Down Expand Up @@ -119,6 +121,7 @@ def test_test_recipe_tool_classifies_challenge_boundary(tmp_path) -> None:

test_report = result.artifacts["test_report"]
assert test_report["stop_reason"] == "non_success_status"
assert test_report["pages_scheduled"] == 1
assert test_report["pages_attempted"] == 1
assert test_report["requests_attempted"] == 1
assert test_report["failure_reason"] == "non_success_status"
Expand Down Expand Up @@ -152,6 +155,7 @@ def test_test_recipe_tool_reports_retry_exhaustion_as_retryable_failure(tmp_path
test_report = result.artifacts["test_report"]
assert fetcher.calls == 3
assert test_report["stop_reason"] == "retry_exhausted"
assert test_report["pages_scheduled"] == 1
assert test_report["pages_attempted"] == 1
assert test_report["requests_attempted"] == 3
assert test_report["failure_reason"] == "retry_exhausted"
Expand Down
3 changes: 3 additions & 0 deletions tests/unit/core/models/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def test_recipe_crawl_result_and_failure_report_are_explicit_models() -> None:
recipe_name=recipe.name,
items_written=2,
output_path="out.jsonl",
pages_scheduled=2,
pages_attempted=2,
requests_attempted=2,
stop_reason="completed",
Expand All @@ -139,6 +140,7 @@ def test_recipe_crawl_result_and_failure_report_are_explicit_models() -> None:
RequestSpec(method="GET", url="https://example.com/api/products?page=1"),
)
assert crawl_result.items_written == 2
assert crawl_result.pages_scheduled == 2
assert crawl_result.pages_attempted == 2
assert crawl_result.requests_attempted == 2
assert crawl_result.stop_reason == "completed"
Expand All @@ -153,6 +155,7 @@ def test_crawl_result_rejects_unknown_stop_reason(stop_reason: str) -> None:
recipe_name="example-products",
items_written=0,
output_path="out.jsonl",
pages_scheduled=0,
pages_attempted=0,
requests_attempted=0,
stop_reason=stop_reason,
Expand Down
Loading