diff --git a/README.md b/README.md
index 404b547..cd2b51a 100644
--- a/README.md
+++ b/README.md
@@ -155,6 +155,24 @@ def login_example() -> None:
 - Fluent assertion DSL via `expect(...)`
 - Retrying verification via `runtime.check(...).eventually(...)`
 
+### Scroll verification (prevent no-op scroll drift)
+
+A common agent failure mode is “scrolling” without the UI actually advancing (overlays, nested scrollers, focus issues). Use `AgentRuntime.scroll_by(...)` to deterministically verify scroll *had effect* via before/after `scrollTop`.
+
+```python
+runtime.begin_step("Scroll the page and verify it moved")
+ok = await runtime.scroll_by(
+    600,
+    verify=True,
+    min_delta_px=50,
+    label="scroll_effective",
+    required=True,
+    timeout_s=5.0,
+)
+if not ok:
+    raise RuntimeError("Scroll had no effect (likely blocked by overlay or nested scroller).")
+```
+
 ### Explained failure
 
 - JSONL trace events (`Tracer` + `JsonlTraceSink`)
diff --git a/predicate/agent_runtime.py b/predicate/agent_runtime.py
index 5b774aa..47194ca 100644
--- a/predicate/agent_runtime.py
+++ b/predicate/agent_runtime.py
@@ -484,6 +484,173 @@ async def evaluate_js(self, request: EvaluateJsRequest) -> EvaluateJsResult:
             truncated=truncated,
         )
 
+    async def _get_scroll_metrics(self) -> dict[str, Any]:
+        """
+        Best-effort, bounded scroll metrics for verification.
+
+        Returns a small JSON-serializable dict with:
+        - top: current scrollTop (px)
+        - height: scrollHeight (px) if available
+        - client: clientHeight (px) if available
+        - error: message string, present only when the in-page probe threw
+
+        A non-dict backend result is wrapped as ``{"top": <result>, ...}``.
+        """
+        # Keep this as a single bounded expression; do not dump DOM.
+        expr = """
+(() => {
+  try {
+    const el = document.scrollingElement || document.documentElement || document.body;
+    const top =
+      (el && typeof el.scrollTop === 'number')
+        ? el.scrollTop
+        : (typeof window.scrollY === 'number' ? window.scrollY : 0);
+    const height = (el && typeof el.scrollHeight === 'number') ? el.scrollHeight : null;
+    const client = (el && typeof el.clientHeight === 'number') ? el.clientHeight : null;
+    return { top, height, client };
+  } catch (e) {
+    return { top: null, height: null, client: null, error: String(e && e.message ? e.message : e) };
+  }
+})()
+""".strip()
+        v = await self.backend.eval(expr)
+        if isinstance(v, dict):
+            return v
+        return {"top": v, "height": None, "client": None}
+
+    async def scroll_by(
+        self,
+        dy: float,
+        *,
+        verify: bool = True,
+        min_delta_px: float = 50.0,
+        label: str = "scroll_effective",
+        required: bool = True,
+        timeout_s: float = 10.0,
+        poll_s: float = 0.25,
+        x: float | None = None,
+        y: float | None = None,
+        js_fallback: bool = True,
+    ) -> bool:
+        """
+        Scroll and (optionally) deterministically verify that the scroll had effect.
+
+        This targets a common failure mode: an agent "scrolls" but the page doesn't
+        actually advance (delta stays ~0 due to overlays, focus, nested scrollers, etc.).
+
+        Behavior:
+        - captures a bounded before/after scrollTop metric
+        - performs a wheel scroll via backend (most compatible)
+        - if verify=True, polls until |after-before| >= min_delta_px or timeout
+        - optionally attempts a JS scrollBy fallback once if wheel has no effect;
+          the fallback is deferred until the second poll so a wheel scroll that
+          is applied asynchronously is not stacked with a JS scroll (double scroll)
+
+        Args:
+            dy: Vertical scroll distance in px, passed to the backend wheel call.
+            verify: When False, only the wheel scroll is issued and True is returned.
+            min_delta_px: Minimum |scrollTop change| (px) that counts as effective.
+            label: Label recorded with the verification outcome.
+            required: Passed through to the recorded outcome; when True a failure
+                also persists failure artifacts.
+            timeout_s: Polling budget in seconds before the check fails.
+            poll_s: Sleep between polls in seconds.
+            x: Optional x coordinate forwarded to the wheel call.
+            y: Optional y coordinate forwarded to the wheel call.
+            js_fallback: Allow one ``window.scrollBy`` attempt if the wheel did nothing.
+
+        Returns:
+            True if scroll was effective (or verify=False), else False.
+        """
+        await self.record_action(f"scroll_by(dy={dy})", url=await self.get_url())
+
+        if not verify:
+            await self.backend.wheel(delta_y=float(dy), x=x, y=y)
+            return True
+
+        before = await self._get_scroll_metrics()
+        before_top = before.get("top")
+        try:
+            before_top_f = float(before_top) if before_top is not None else 0.0
+        except (TypeError, ValueError, OverflowError):
+            before_top_f = 0.0
+
+        used_js_fallback = False
+        polls = 0  # completed measurements; gates the JS fallback
+        start = time.monotonic()
+
+        # First attempt: wheel scroll (preferred).
+        await self.backend.wheel(delta_y=float(dy), x=x, y=y)
+
+        while True:
+            after = await self._get_scroll_metrics()
+            after_top = after.get("top")
+            try:
+                after_top_f = float(after_top) if after_top is not None else before_top_f
+            except (TypeError, ValueError, OverflowError):
+                after_top_f = before_top_f
+
+            delta = after_top_f - before_top_f
+            passed = abs(delta) >= float(min_delta_px)
+
+            if passed:
+                outcome = AssertOutcome(
+                    passed=True,
+                    reason="",
+                    details={
+                        "dy": float(dy),
+                        "min_delta_px": float(min_delta_px),
+                        "before": before,
+                        "after": after,
+                        "delta_px": float(delta),
+                        "js_fallback_used": used_js_fallback,
+                    },
+                )
+                self._record_outcome(
+                    outcome=outcome,
+                    label=label,
+                    required=required,
+                    kind="scroll",
+                    record_in_step=True,
+                )
+                return True
+
+            elapsed = time.monotonic() - start
+            if elapsed >= float(timeout_s):
+                outcome = AssertOutcome(
+                    passed=False,
+                    reason=f"scroll delta {delta:.1f}px < min_delta_px={float(min_delta_px):.1f}px",
+                    details={
+                        "dy": float(dy),
+                        "min_delta_px": float(min_delta_px),
+                        "before": before,
+                        "after": after,
+                        "delta_px": float(delta),
+                        "js_fallback_used": used_js_fallback,
+                        "timeout_s": float(timeout_s),
+                    },
+                )
+                self._record_outcome(
+                    outcome=outcome,
+                    label=label,
+                    required=required,
+                    kind="scroll",
+                    record_in_step=True,
+                )
+                if required:
+                    self._persist_failure_artifacts(reason=f"scroll_failed:{label}")
+                return False
+
+            # Optional fallback: if the wheel still had no effect after at least one
+            # poll interval, try a bounded JS scroll request once. Waiting a poll
+            # first avoids stacking it on a wheel scroll that has not landed yet.
+            if js_fallback and not used_js_fallback and polls > 0 and abs(delta) < 1.0:
+                used_js_fallback = True
+                await self.backend.eval(f"window.scrollBy(0, {float(dy)})")
+
+            polls += 1
+            await asyncio.sleep(float(poll_s))
+
     async def list_tabs(self) -> TabListResult:
         backend = self._get_tab_backend()
         if backend is None:
diff --git a/tests/test_agent_runtime.py b/tests/test_agent_runtime.py
index 98162fa..91b0c58 100644
--- a/tests/test_agent_runtime.py
+++ b/tests/test_agent_runtime.py
@@ -165,6 +165,95 @@ def test_init_with_api_key_and_options(self) -> None:
         assert runtime._snapshot_options.sentience_api_key == "sk_pro_key"
         assert runtime._snapshot_options.use_api is True
 
+    @pytest.mark.asyncio
+    async def test_scroll_by_verifies_delta_via_scrolltop(self) -> None:
+        backend = MagicMock()
+        backend.get_url = AsyncMock(return_value="https://example.com")
+        backend.wheel = AsyncMock(return_value=None)
+
+        # _get_scroll_metrics() uses backend.eval() with a bounded expression;
+        # return the before/after readings.
+        backend.eval = AsyncMock(
+            side_effect=[
+                {"top": 100, "height": 2000, "client": 800},  # before
+                {"top": 180, "height": 2000, "client": 800},  # after
+            ]
+        )
+        tracer = MockTracer()
+        runtime = AgentRuntime(backend=backend, tracer=tracer)
+        runtime.begin_step("scroll test")
+
+        ok = await runtime.scroll_by(
+            200,
+            verify=True,
+            min_delta_px=50,
+            timeout_s=1.0,
+            poll_s=0.01,
+        )
+        assert ok is True
+        backend.wheel.assert_awaited()
+        assert any(
+            e["type"] == "verification" and e["data"].get("kind") == "scroll"
+            for e in tracer.events
+        )
+
+    @pytest.mark.asyncio
+    async def test_scroll_by_defers_js_fallback_until_second_poll(self) -> None:
+        backend = MagicMock()
+        backend.get_url = AsyncMock(return_value="https://example.com")
+        backend.wheel = AsyncMock(return_value=None)
+
+        # The wheel scroll lands only after the first poll. Exactly three evals
+        # are expected; an eager JS fallback would issue a fourth and exhaust
+        # the side_effect list.
+        backend.eval = AsyncMock(
+            side_effect=[
+                {"top": 100, "height": 2000, "client": 800},  # before
+                {"top": 100, "height": 2000, "client": 800},  # poll 1: not yet
+                {"top": 300, "height": 2000, "client": 800},  # poll 2: moved
+            ]
+        )
+        tracer = MockTracer()
+        runtime = AgentRuntime(backend=backend, tracer=tracer)
+        runtime.begin_step("scroll deferred fallback")
+
+        ok = await runtime.scroll_by(
+            200,
+            verify=True,
+            min_delta_px=50,
+            timeout_s=1.0,
+            poll_s=0.01,
+        )
+        assert ok is True
+        assert backend.eval.await_count == 3
+
+    @pytest.mark.asyncio
+    async def test_scroll_by_times_out_and_records_failed_verification(self) -> None:
+        backend = MagicMock()
+        backend.get_url = AsyncMock(return_value="https://example.com")
+        backend.wheel = AsyncMock(return_value=None)
+
+        # before and after unchanged → should fail (allow unlimited polls)
+        backend.eval = AsyncMock(return_value={"top": 100, "height": 2000, "client": 800})
+        tracer = MockTracer()
+        runtime = AgentRuntime(backend=backend, tracer=tracer)
+        runtime.begin_step("scroll fail")
+
+        ok = await runtime.scroll_by(
+            200,
+            verify=True,
+            min_delta_px=50,
+            timeout_s=0.05,
+            poll_s=0.01,
+        )
+        assert ok is False
+        assert any(
+            e["type"] == "verification"
+            and e["data"].get("kind") == "scroll"
+            and e["data"].get("passed") is False
+            for e in tracer.events
+        )
+
     @pytest.mark.asyncio
     async def test_evaluate_js_success(self) -> None:
         backend = MockBackend()