diff --git a/src/ai_crawler/cli/main.py b/src/ai_crawler/cli/main.py index 292af8e..daafcdf 100644 --- a/src/ai_crawler/cli/main.py +++ b/src/ai_crawler/cli/main.py @@ -561,16 +561,18 @@ def run_recipe_command(recipe_path: str, output_path: str) -> int: if result.checkpoint_path else "" ) - print( + summary = ( "ai-crawler run: " f"recipe={result.recipe_name} " f"items_written={result.items_written} " + f"pages_scheduled={result.pages_scheduled} " f"pages_attempted={result.pages_attempted} " f"requests_attempted={result.requests_attempted} " f"stop_reason={result.stop_reason} " - f"output={result.output_path}" + f"output={output_path}" f"{checkpoint_summary}" ) + print(summary) return 0 @@ -596,17 +598,18 @@ def test_recipe_command(recipe_path: str, output_path: str, report_path: str) -> _write_tool_report(result=result, report_path=report_path) crawl_result = _artifact_dict(result, "crawl_result") test_report = _artifact_dict(result, "test_report") - print( + summary = ( "ai-crawler test-recipe: " f"recipe={crawl_result.get('recipe_name', recipe.name)} " f"items_written={crawl_result.get('items_written', 0)} " + f"pages_scheduled={crawl_result.get('pages_scheduled', 0)} " f"failure_reason={test_report.get('failure_reason', '')} " f"output={output_path} " f"report={report_path}" ) + print(summary) return 0 - def repair_recipe_command(recipe_path: str, report_path: str, output_path: str) -> int: """Repair one recipe using the single JSON report written by test-recipe.""" normalized_report_path = str(Path(report_path).resolve()) diff --git a/src/ai_crawler/core/agent/recipe_testing.py b/src/ai_crawler/core/agent/recipe_testing.py index 18a17e3..9e5cbe8 100644 --- a/src/ai_crawler/core/agent/recipe_testing.py +++ b/src/ai_crawler/core/agent/recipe_testing.py @@ -91,6 +91,7 @@ def _test_report(fetcher: RecordingRecipeFetcher, crawl_result: CrawlResult) -> "content_type": "", "body_sample": "", "stop_reason": crawl_result.stop_reason, + "pages_scheduled": crawl_result.pages_scheduled, "pages_attempted": crawl_result.pages_attempted, "requests_attempted": crawl_result.requests_attempted, "failure_reason": _failure_reason(response=response, crawl_result=crawl_result), diff --git a/src/ai_crawler/core/models/crawl.py b/src/ai_crawler/core/models/crawl.py index 0a9e644..82455b5 100644 --- a/src/ai_crawler/core/models/crawl.py +++ b/src/ai_crawler/core/models/crawl.py @@ -12,6 +12,7 @@ class CrawlResult(DomainModel): recipe_name: str = Field(min_length=1) items_written: int = Field(ge=0) output_path: str = Field(min_length=1) + pages_scheduled: int = Field(default=0, ge=0) pages_attempted: int = Field(default=0, ge=0) requests_attempted: int = Field(default=0, ge=0) stop_reason: RunnerStopReason = "completed" diff --git a/src/ai_crawler/core/runner/recipe_runner.py b/src/ai_crawler/core/runner/recipe_runner.py index c5101b0..d6ad136 100644 --- a/src/ai_crawler/core/runner/recipe_runner.py +++ b/src/ai_crawler/core/runner/recipe_runner.py @@ -35,6 +35,7 @@ class RunnerCheckpoint(DomainModel): @dataclass(slots=True) class RunState: items_written: int + pages_scheduled: int pages_attempted: int requests_attempted: int stop_reason: RunnerStopReason @@ -120,6 +121,7 @@ def run(self, recipe: Recipe) -> CrawlResult: recipe_name=recipe.name, items_written=state.items_written, output_path=str(output_path), + pages_scheduled=state.pages_scheduled, pages_attempted=state.pages_attempted, requests_attempted=state.requests_attempted, stop_reason=state.stop_reason, @@ -138,6 +140,7 @@ def _run_sequential( started_at: float, ) -> RunState: current_request_index = next_request_index + pages_scheduled = 0 pages_attempted = 0 requests_attempted = 0 stop_reason: RunnerStopReason = "completed" @@ -154,6 +157,7 @@ def _run_sequential( stop_reason = "max_seconds_reached" break current_request_index = request_index + pages_scheduled += 1 pages_attempted += 1 response, request_attempts, stop_reason = self._fetch_with_retries( request=request, @@ -185,6 +189,7 @@ def _run_sequential( ) return RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason=stop_reason, @@ -260,6 +265,7 @@ async def _run_concurrent_async( next_schedule_offset += 1 current_request_index = next_request_index next_flush_index = next_request_index + pages_scheduled = next_schedule_offset pages_attempted = 0 requests_attempted = 0 stop_reason: RunnerStopReason = "completed" @@ -274,6 +280,7 @@ async def _run_concurrent_async( if remaining_timeout == 0: terminal_state = RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason="max_seconds_reached", @@ -288,6 +295,7 @@ async def _run_concurrent_async( if not done: terminal_state = RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason="max_seconds_reached", @@ -309,6 +317,7 @@ async def _run_concurrent_async( if response is None or stop_reason == "non_success_status": terminal_state = RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason=stop_reason, @@ -325,6 +334,7 @@ async def _run_concurrent_async( if stop_reason == "max_items_reached": terminal_state = RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason=stop_reason, @@ -335,6 +345,7 @@ async def _run_concurrent_async( stop_reason = "empty_page" terminal_state = RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason=stop_reason, @@ -374,6 +385,7 @@ async def _run_concurrent_async( ) ) next_schedule_offset += 1 + pages_scheduled = next_schedule_offset if pending_tasks: for task in pending_tasks: @@ -385,6 +397,7 @@ async def _run_concurrent_async( return RunState( items_written=items_written, + pages_scheduled=pages_scheduled, pages_attempted=pages_attempted, requests_attempted=requests_attempted, stop_reason=stop_reason, diff --git a/tests/component/core/runner/test_recipe_runner.py b/tests/component/core/runner/test_recipe_runner.py index e5ebdce..4beb3de 100644 --- a/tests/component/core/runner/test_recipe_runner.py +++ b/tests/component/core/runner/test_recipe_runner.py @@ -491,6 +491,7 @@ def run_recipe() -> None: assert not error_holder result = result_holder["result"] assert result.items_written == 1 + assert result.pages_scheduled == 3 assert result.pages_attempted == 1 assert result.requests_attempted == 1 assert result.stop_reason == "max_seconds_reached" @@ -592,6 +593,7 @@ def test_recipe_runner_stops_concurrent_run_at_max_items_without_fetching_later_ assert fetcher.page_two_done.is_set() assert result.items_written == 2 + assert result.pages_scheduled == 2 assert result.pages_attempted == 2 assert result.requests_attempted == 2 assert result.stop_reason == "max_items_reached" diff --git a/tests/unit/cli/test_run_command.py b/tests/unit/cli/test_run_command.py index 02c61dd..d23097b 100644 --- a/tests/unit/cli/test_run_command.py +++ b/tests/unit/cli/test_run_command.py @@ -49,7 +49,11 @@ def test_run_command_loads_recipe_executes_runner_and_prints_summary( assert exit_code == 0 assert capsys.readouterr().out.strip() == ( "ai-crawler run: " - f"recipe=products-api items_written=1 pages_attempted=1 requests_attempted=1 " + "recipe=products-api " + "items_written=1 " + "pages_scheduled=1 " + "pages_attempted=1 " + "requests_attempted=1 " f"stop_reason=completed output={output_path}" ) assert output_path.read_text(encoding="utf-8") == ( @@ -101,6 +105,7 @@ def test_run_command_prints_checkpoint_summary_when_run_stops_with_resume_state( "ai-crawler run: " "recipe=products-api " "items_written=1 " + "pages_scheduled=1 " "pages_attempted=1 " "requests_attempted=1 " "stop_reason=max_seconds_reached " diff --git a/tests/unit/cli/test_test_recipe_command.py b/tests/unit/cli/test_test_recipe_command.py index a4f2a3f..04dce85 100644 --- a/tests/unit/cli/test_test_recipe_command.py +++ b/tests/unit/cli/test_test_recipe_command.py @@ -58,7 +58,10 @@ def test_test_recipe_command_executes_tool_writes_report_and_prints_summary( assert exit_code == 0 assert capsys.readouterr().out.strip() == ( "ai-crawler test-recipe: " - f"recipe=products-api items_written=1 failure_reason= output={output_path} " + "recipe=products-api " + "items_written=1 " + "pages_scheduled=1 " + f"failure_reason= output={output_path} " f"report={report_path}" ) assert output_path.read_text(encoding="utf-8") == ( @@ -69,6 +72,7 @@ def test_test_recipe_command_executes_tool_writes_report_and_prints_summary( "recipe_name": "products-api", "items_written": 1, "output_path": str(output_path), + "pages_scheduled": 1, "pages_attempted": 1, "requests_attempted": 1, "stop_reason": "completed", @@ -79,6 +83,7 @@ def test_test_recipe_command_executes_tool_writes_report_and_prints_summary( "content_type": "application/json", "body_sample": '{"items": [{"name": "Keyboard", "price": 120}]}', "stop_reason": "completed", + "pages_scheduled": 1, "pages_attempted": 1, "requests_attempted": 1, "failure_reason": "", diff --git a/tests/unit/core/agent/test_agent_recipe_flow.py b/tests/unit/core/agent/test_agent_recipe_flow.py index 9ae7177..ad16a6c 100644 --- a/tests/unit/core/agent/test_agent_recipe_flow.py +++ b/tests/unit/core/agent/test_agent_recipe_flow.py @@ -143,6 +143,7 @@ def test_agent_controller_hands_generated_recipe_artifact_to_test_recipe_tool(tm "recipe_name": "products-api", "items_written": 0, "output_path": str(output_path), + "pages_scheduled": 1, "pages_attempted": 1, "requests_attempted": 1, "stop_reason": "empty_page", diff --git a/tests/unit/core/agent/test_recipe_testing.py b/tests/unit/core/agent/test_recipe_testing.py index 626c175..a482e44 100644 --- a/tests/unit/core/agent/test_recipe_testing.py +++ b/tests/unit/core/agent/test_recipe_testing.py @@ -74,6 +74,7 @@ def test_test_recipe_tool_runs_recipe_and_returns_crawl_result_artifact(tmp_path "recipe_name": "products-api", "items_written": 1, "output_path": str(output_path), + "pages_scheduled": 1, "pages_attempted": 1, "requests_attempted": 1, "stop_reason": "completed", @@ -84,6 +85,7 @@ def test_test_recipe_tool_runs_recipe_and_returns_crawl_result_artifact(tmp_path "content_type": "application/json", "body_sample": '{"items": [{"name": "Keyboard", "price": 120}]}', "stop_reason": "completed", + "pages_scheduled": 1, "pages_attempted": 1, "requests_attempted": 1, "failure_reason": "", @@ -119,6 +121,7 @@ def test_test_recipe_tool_classifies_challenge_boundary(tmp_path) -> None: test_report = result.artifacts["test_report"] assert test_report["stop_reason"] == "non_success_status" + assert test_report["pages_scheduled"] == 1 assert test_report["pages_attempted"] == 1 assert test_report["requests_attempted"] == 1 assert test_report["failure_reason"] == "non_success_status" @@ -152,6 +155,7 @@ def test_test_recipe_tool_reports_retry_exhaustion_as_retryable_failure(tmp_path test_report = result.artifacts["test_report"] assert fetcher.calls == 3 assert test_report["stop_reason"] == "retry_exhausted" + assert test_report["pages_scheduled"] == 1 assert test_report["pages_attempted"] == 1 assert test_report["requests_attempted"] == 3 assert test_report["failure_reason"] == "retry_exhausted" diff --git a/tests/unit/core/models/test_models.py b/tests/unit/core/models/test_models.py index 74fe91d..3633469 100644 --- a/tests/unit/core/models/test_models.py +++ b/tests/unit/core/models/test_models.py @@ -128,6 +128,7 @@ def test_recipe_crawl_result_and_failure_report_are_explicit_models() -> None: recipe_name=recipe.name, items_written=2, output_path="out.jsonl", + pages_scheduled=2, pages_attempted=2, requests_attempted=2, stop_reason="completed", @@ -139,6 +140,7 @@ def test_recipe_crawl_result_and_failure_report_are_explicit_models() -> None: RequestSpec(method="GET", url="https://example.com/api/products?page=1"), ) assert crawl_result.items_written == 2 + assert crawl_result.pages_scheduled == 2 assert crawl_result.pages_attempted == 2 assert crawl_result.requests_attempted == 2 assert crawl_result.stop_reason == "completed" @@ -153,6 +155,7 @@ def test_crawl_result_rejects_unknown_stop_reason(stop_reason: str) -> None: recipe_name="example-products", items_written=0, output_path="out.jsonl", + pages_scheduled=0, pages_attempted=0, requests_attempted=0, stop_reason=stop_reason,