Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## 8.0.4 - 2026-03-26

### Fixed

- Loaders: enforce a hard wall-clock timeout on the Playwright render worker thread (2× browser timeout + 30 s); raises a retryable `LoaderRuntimeError` if Chromium becomes permanently unresponsive (stuck WebSocket, no CDP response).
- Loaders: the render worker is now always a daemon thread, so a worker abandoned after a hard timeout does not block process exit.

## 8.0.3 - 2026-03-26

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "wordlift-sdk"
version = "8.0.3"
version = "8.0.4"
description = "Python toolkit for orchestrating WordLift imports and structured data workflows."
authors = ["David Riccitelli <david@wordlift.io>"]
readme = "README.md"
Expand Down
9 changes: 9 additions & 0 deletions tests/ingestion/test_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import asyncio
import threading
import time
import urllib.error
from io import BytesIO
from types import SimpleNamespace
Expand Down Expand Up @@ -207,6 +208,14 @@ async def _run() -> None:
assert seen_thread_ids[0] != main_thread_id


def test_run_in_worker_thread_raises_on_hard_timeout() -> None:
    """A worker that outlives the hard timeout raises a retryable timeout error.

    The worker blocks on an event instead of a bare ``time.sleep`` so it can be
    released as soon as the call under test has returned; otherwise the
    abandoned daemon thread keeps sleeping while later tests run.
    """
    release = threading.Event()
    started = time.monotonic()
    try:
        with pytest.raises(LoaderRuntimeError) as exc:
            # wait(10) keeps the original 10 s upper bound, so a regression
            # that drops the timeout fails the test instead of hanging forever.
            loaders_module._run_in_worker_thread(
                lambda: release.wait(10), timeout=0.05
            )
        elapsed = time.monotonic() - started
    finally:
        # Always unblock the worker so no thread is leaked past this test.
        release.set()

    # The caller must regain control long before the worker would finish.
    assert elapsed < 5
    assert exc.value.code == "INGEST_LOAD_BROWSER_TIMEOUT"
    assert exc.value.retryable is True


def test_playwright_loader_wraps_non_runtime_exceptions(
monkeypatch: pytest.MonkeyPatch,
) -> None:
Expand Down
3 changes: 3 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 26 additions & 8 deletions wordlift_sdk/ingestion/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,12 +261,19 @@ def _is_running_in_event_loop() -> bool:
def _render_with_loop_safety(
render_fn: Callable[[RenderOptions], Any], options: RenderOptions
) -> Any:
if not _is_running_in_event_loop():
return render_fn(options)
return _run_in_worker_thread(lambda: render_fn(options))


def _run_in_worker_thread(fn: Callable[[], Any]) -> Any:
# Always delegate to a dedicated daemon worker thread so that:
# 1. Playwright's greenlet event loop does not run on the calling thread
# (avoids blocking the asyncio executor thread introduced in 8.0.3).
# 2. A hard wall-clock timeout can be enforced even when the browser
# subprocess becomes permanently unresponsive (stuck WebSocket, no
# CDP response to page.content() or browser.close(), etc.).
hard_timeout = options.timeout_ms / 1000 * 2 + 30
return _run_in_worker_thread(lambda: render_fn(options), timeout=hard_timeout)


def _run_in_worker_thread(
fn: Callable[[], Any], *, timeout: float | None = None
) -> Any:
result: dict[str, Any] = {}
error: dict[str, BaseException] = {}

Expand All @@ -276,9 +283,20 @@ def target() -> None:
except BaseException as exc: # pragma: no cover - asserted via caller paths
error["exc"] = exc

thread = threading.Thread(target=target, name="playwright-render-worker")
# daemon=True: if this thread is abandoned on timeout it will not block
# process exit — Chromium subprocesses it owns are cleaned up by the OS.
thread = threading.Thread(
target=target, name="playwright-render-worker", daemon=True
)
thread.start()
thread.join()
thread.join(timeout=timeout)

if thread.is_alive():
raise LoaderRuntimeError(
f"Playwright render hard timeout after {timeout:.0f}s for render operation",
code="INGEST_LOAD_BROWSER_TIMEOUT",
retryable=True,
)

exc = error.get("exc")
if exc is not None:
Expand Down
Loading