Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions src/openchronicle/capture/app_parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""S1 app parser registry.

Parsers are registered at import time and run in priority order during
:func:`apply_parsers`. Later parsers can match on fields produced by
earlier parsers (e.g. a Linear parser matching ``fields.url`` that
contains ``linear.app``, which was extracted by the browser parser).
"""

from __future__ import annotations

from ...logger import get
from .base import AppParser, ParseContext, S1Fields
from .browser import BrowserParser

logger = get("openchronicle.capture.s1_registry")

_parsers: list[AppParser] = []


def _register_builtins() -> None:
    """Register the parsers that ship with this package."""
    builtin_browser = BrowserParser()
    register(builtin_browser)


def register(parser: AppParser) -> None:
    """Add ``parser`` to the registry, kept ordered by ascending priority.

    The list is re-sorted after every registration. ``list.sort`` is stable,
    so parsers sharing a priority keep their registration order.
    """

    def _priority_of(candidate: AppParser) -> int:
        return candidate.priority

    _parsers.append(parser)
    _parsers.sort(key=_priority_of)


def _reset_registry() -> None:
    """Return the registry to its import-time state.

    Drops every registered parser and registers the builtin ones again.
    Meant for test isolation: parsers registered by one test must not be
    visible to the tests that run after it.
    """
    # Empty the list in place so existing references to it stay valid.
    del _parsers[:]
    _register_builtins()


def apply_parsers(ctx: ParseContext, fields: S1Fields) -> None:
    """Run every matching registered parser and merge its patch into ``fields``.

    Parsers run in registry order, and each one sees ``fields`` as already
    patched by the parsers that ran before it. ``fields`` is mutated in
    place; nothing is returned.

    Merge rules for a patch: ``focused_element``, ``visible_text`` and
    ``url`` replace the current value only when the patch value is not
    ``None``; a non-empty ``app_context`` is merged key by key, the patch
    winning on conflicts.

    Any exception raised by a parser, or while merging its patch, is logged
    and swallowed so that one faulty parser cannot break the others. Field
    updates applied before the failure are kept.
    """
    # Iterate over a snapshot: a parser that calls ``register()`` re-sorts
    # ``_parsers`` in place, which would otherwise make this loop silently
    # skip or repeat parsers (lists do not raise when mutated mid-iteration).
    for parser in tuple(_parsers):
        try:
            if not parser.matches(ctx, fields):
                continue
            patch = parser.parse(ctx, fields)
            if patch.focused_element is not None:
                fields.focused_element = patch.focused_element
            if patch.visible_text is not None:
                fields.visible_text = patch.visible_text
            if patch.url is not None:
                fields.url = patch.url
            if patch.app_context:
                fields.app_context = {**fields.app_context, **patch.app_context}
        except Exception:
            # Best-effort boundary: log with traceback and carry on. ``name``
            # is read defensively so a malformed parser object cannot make
            # the handler itself raise.
            logger.exception(
                "S1 parser %r failed", getattr(parser, "name", parser)
            )


_register_builtins()
94 changes: 94 additions & 0 deletions src/openchronicle/capture/app_parsers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Base types for the S1 app parser registry.

Every app-specific parser implements the :class:`AppParser` protocol.
The :class:`ParseContext` gives parsers read-only access to the raw
capture data; :class:`S1Fields` holds the current state; and
:class:`S1Patch` lets a parser selectively override fields.
"""

from __future__ import annotations

from dataclasses import asdict, dataclass, field
from typing import Any, Iterable, Protocol


@dataclass
class FocusedElement:
    """Description of a focused UI element.

    ``has_value`` and ``value_length`` are ordinary stored fields, but
    :meth:`to_dict` ignores the stored values and derives both from ``value``.
    """

    role: str = ""
    title: str = ""
    value: str = ""
    is_editable: bool = False
    has_value: bool = False
    value_length: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict.

        ``has_value`` and ``value_length`` are recomputed from ``value`` with
        surrounding whitespace removed, so a whitespace-only (or ``None``)
        value counts as empty. ``value`` itself is emitted unchanged.
        """
        text = self.value.strip() if self.value else ""
        return {
            **asdict(self),
            "has_value": text != "",
            "value_length": len(text),
        }


@dataclass
class ParseContext:
    """View of the raw capture data handed to a parser.

    Parsers are expected to treat it as read-only; the dataclass is not
    frozen, so nothing enforces that.

    The accessors tolerate missing keys as well as keys that are present
    with a ``None`` value (e.g. JSON ``null``): strings fall back to ``""``
    and window/element collections fall back to empty.
    """

    # The whole capture dict being enriched.
    capture: dict[str, Any]
    # The app entry the accessors below read from.
    app: dict[str, Any]
    # Window metadata associated with the capture.
    window_meta: dict[str, Any]

    @property
    def bundle_id(self) -> str:
        """The app's ``bundle_id`` with surrounding whitespace removed."""
        return (self.app.get("bundle_id") or "").strip()

    @property
    def app_name(self) -> str:
        """The app's ``name`` with surrounding whitespace removed."""
        return (self.app.get("name") or "").strip()

    def iter_windows(self) -> Iterable[dict[str, Any]]:
        """Return an iterator over the app's window dicts."""
        return iter(self._windows())

    def focused_window(self) -> dict[str, Any] | None:
        """Return the first window whose ``focused`` flag is truthy, if any."""
        for window in self._windows():
            if window.get("focused"):
                return window
        return None

    def iter_elements(self) -> Iterable[dict[str, Any]]:
        """Iterate top-level elements across all windows."""
        for window in self._windows():
            # ``or []`` rather than a ``get`` default: the default only
            # covers a missing key, not an explicit ``None``.
            yield from window.get("elements") or []

    def _windows(self) -> list[dict[str, Any]]:
        """Return the app's window list, or an empty list if absent/``None``."""
        return self.app.get("windows") or []


@dataclass
class S1Fields:
    """Current state of the S1 fields while parsers are being applied.

    Mutable on purpose: patches returned by parsers are merged into an
    instance of this class.
    """

    # Required: there is no default focused element or visible text.
    focused_element: FocusedElement
    visible_text: str
    # ``None`` until some parser supplies a URL.
    url: str | None = None
    # Free-form, parser-provided key/value data; each instance gets its own dict.
    app_context: dict[str, Any] = field(default_factory=dict)


@dataclass
class S1Patch:
    """Selective override of :class:`S1Fields` returned by a parser.

    Every field is optional; a default-constructed patch carries no values.
    """

    # ``None`` is the default for the three scalar fields below.
    focused_element: FocusedElement | None = None
    visible_text: str | None = None
    url: str | None = None
    # Defaults to a fresh empty dict per instance.
    app_context: dict[str, Any] = field(default_factory=dict)


class AppParser(Protocol):
    """Protocol for app-specific S1 field parsers.

    Implementations expose a ``name`` and a ``priority``, decide in
    ``matches()`` whether they apply to a capture, and return the
    :class:`S1Patch` to merge from ``parse()``.

    .. warning::

        ``matches()`` and ``parse()`` **must not** call ``register()``.
        Doing so mutates the parser list while ``apply_parsers()`` is
        running. Python does not raise when a list is modified during
        iteration, so the mistake will not fail loudly -- register parsers
        at import time instead.
    """

    name: str
    priority: int

    def matches(self, ctx: ParseContext, fields: S1Fields) -> bool: ...

    def parse(self, ctx: ParseContext, fields: S1Fields) -> S1Patch: ...
52 changes: 52 additions & 0 deletions src/openchronicle/capture/app_parsers/browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Browser URL extraction parser.

Migrated from ``s1_parser._extract_url``. Matches known browser
bundle IDs and extracts the URL from the first ``AXTextField`` whose
value looks like a URL or bare domain.
"""

from __future__ import annotations

import re
from typing import Any

from .base import ParseContext, S1Fields, S1Patch

# Bundle IDs of the apps that ``BrowserParser`` treats as browsers.
_BROWSER_BUNDLES = {
    "com.google.Chrome",
    "com.apple.Safari",
    "org.mozilla.firefox",
    "com.microsoft.edgemac",
    "company.thebrowser.Browser",
    "com.brave.Browser",
    "com.operasoftware.Opera",
}

# ``http://`` or ``https://`` followed by one or more non-whitespace
# characters. Not anchored, so ``search()`` finds it anywhere in a string.
_URL_RE = re.compile(r"https?://\S+")


class BrowserParser:
    """Parser that supplies ``url`` for captures taken from a known browser."""

    name = "browser"
    priority = 10

    def matches(self, ctx: ParseContext, fields: S1Fields) -> bool:
        """Return ``True`` when the app's bundle ID is a known browser."""
        bundle = ctx.bundle_id
        return bundle in _BROWSER_BUNDLES

    def parse(self, ctx: ParseContext, fields: S1Fields) -> S1Patch:
        """Return a patch carrying the extracted URL (``None`` if not found)."""
        return S1Patch(url=_extract_url_from_app(ctx.app))


def _extract_url_from_app(app_data: dict[str, Any]) -> str | None:
    """Return the URL held by the first URL-like ``AXTextField``, or ``None``.

    Only the top-level ``elements`` of each window are scanned, in order.
    For the first text field with a non-blank string value:

    * if the value contains an ``http(s)://`` URL, the whole stripped value
      is returned (not just the matched part);
    * otherwise, if it contains a dot and no space, it is treated as a bare
      domain and returned with ``https://`` prepended.

    Missing or ``None`` ``windows`` / ``elements`` entries and non-string
    values are skipped rather than raising.
    """
    # ``or []`` instead of a ``get`` default: the default only applies when
    # the key is missing, not when it is present with a ``None`` value.
    for window in app_data.get("windows") or []:
        for el in window.get("elements") or []:
            if el.get("role") != "AXTextField":
                continue
            raw = el.get("value")
            # Only text can be a URL; anything else would fail on ``.strip()``.
            value = raw.strip() if isinstance(raw, str) else ""
            if not value:
                continue
            if _URL_RE.search(value):
                return value
            if "." in value and " " not in value:
                return f"https://{value}"
    return None
86 changes: 35 additions & 51 deletions src/openchronicle/capture/s1_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,28 @@
``_extract_focused_element`` / ``_render_visible_text`` / ``_extract_url``).
Runs inline inside ``capture_once`` so every capture-buffer JSON carries
these fields.

Architecture
------------

``enrich()`` computes a **generic baseline** (focused element + visible text
+ url=None) and then runs registered app parsers in priority order. Each
parser may selectively override fields via an ``S1Patch``. This lets future
parsers compose — for example a Linear parser can match ``linear.app`` in
the URL that the browser parser already extracted.
"""

from __future__ import annotations

import re
from dataclasses import asdict, dataclass
from typing import Any

# Import triggers builtin parser registration.
from .app_parsers import apply_parsers
from .app_parsers.base import FocusedElement, ParseContext, S1Fields
from .ax_models import ax_app_to_markdown

_BROWSER_BUNDLES = {
"com.google.Chrome",
"com.apple.Safari",
"org.mozilla.firefox",
"com.microsoft.edgemac",
"company.thebrowser.Browser",
"com.brave.Browser",
"com.operasoftware.Opera",
}

_URL_RE = re.compile(r"https?://\S+")
# Re-export for tests and downstream code that imports from here.
__all__ = ["FocusedElement", "enrich"]

_EDITABLE_ROLES = {"AXTextField", "AXTextArea", "AXComboBox"}
_STATIC_ROLES = {"AXStaticText", "AXWebArea"}
Expand All @@ -39,23 +40,6 @@
_FOCUS_VALUE_MAX = 2_000


@dataclass
class FocusedElement:
    """Description of a focused UI element.

    ``has_value`` and ``value_length`` are ordinary stored fields, but
    :meth:`to_dict` ignores the stored values and derives both from ``value``.
    """

    role: str = ""
    title: str = ""
    value: str = ""
    is_editable: bool = False
    has_value: bool = False
    value_length: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict.

        ``has_value`` and ``value_length`` are recomputed from ``value`` with
        surrounding whitespace removed, so a whitespace-only (or ``None``)
        value counts as empty. ``value`` itself is emitted unchanged.
        """
        text = self.value.strip() if self.value else ""
        return {
            **asdict(self),
            "has_value": text != "",
            "value_length": len(text),
        }


def enrich(capture: dict[str, Any]) -> None:
"""Mutate ``capture`` in place: add ``focused_element`` / ``visible_text`` / ``url``.

Expand All @@ -72,9 +56,27 @@ def enrich(capture: dict[str, Any]) -> None:
capture["url"] = None
return

capture["focused_element"] = _extract_focused_element(app_data).to_dict()
capture["visible_text"] = _render_visible_text(app_data)
capture["url"] = _extract_url(app_data)
# ── Generic baseline ──────────────────────────────────────────────
fields = S1Fields(
focused_element=_extract_focused_element(app_data),
visible_text=_render_visible_text(app_data),
url=None,
)

# ── App-parser patches ────────────────────────────────────────────
ctx = ParseContext(
capture=capture,
app=app_data,
window_meta=capture.get("window_meta") or {},
)
apply_parsers(ctx, fields)

# ── Write back ────────────────────────────────────────────────────
capture["focused_element"] = fields.focused_element.to_dict()
capture["visible_text"] = fields.visible_text
capture["url"] = fields.url
if fields.app_context:
capture["app_context"] = fields.app_context


def _frontmost_app(ax_tree: dict[str, Any]) -> dict[str, Any] | None:
Expand Down Expand Up @@ -113,21 +115,3 @@ def _render_visible_text(app_data: dict[str, Any]) -> str:
if len(md) > _VISIBLE_TEXT_MAX:
md = md[:_VISIBLE_TEXT_MAX] + "\n...(truncated)"
return md


def _extract_url(app_data: dict[str, Any]) -> str | None:
    """Return the URL shown by a known browser app, or ``None``.

    Apps whose ``bundle_id`` is not in ``_BROWSER_BUNDLES`` always yield
    ``None``. Otherwise the top-level ``elements`` of each window are
    scanned in order, and the first ``AXTextField`` with a non-blank string
    value decides the result:

    * if the value contains an ``http(s)://`` URL, the whole stripped value
      is returned (not just the matched part);
    * otherwise, if it contains a dot and no space, it is treated as a bare
      domain and returned with ``https://`` prepended.

    Missing or ``None`` ``windows`` / ``elements`` entries and non-string
    values are skipped rather than raising.
    """
    bundle = app_data.get("bundle_id", "")
    if bundle not in _BROWSER_BUNDLES:
        return None
    # ``or []`` instead of a ``get`` default: the default only applies when
    # the key is missing, not when it is present with a ``None`` value.
    for window in app_data.get("windows") or []:
        for el in window.get("elements") or []:
            if el.get("role") != "AXTextField":
                continue
            raw = el.get("value")
            # Only text can be a URL; anything else would fail on ``.strip()``.
            value = raw.strip() if isinstance(raw, str) else ""
            if not value:
                continue
            if _URL_RE.search(value):
                return value
            if "." in value and " " not in value:
                return f"https://{value}"
    return None
9 changes: 9 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@
import pytest


@pytest.fixture(autouse=True)
def _reset_app_parser_registry() -> None:
    """Reset the app-parser registry to its builtin parsers before each test.

    Autouse, so parsers registered by one test never carry over into the next.
    """
    # Imported lazily, inside the fixture, rather than at conftest import time.
    from openchronicle.capture import app_parsers

    app_parsers._reset_registry()


@pytest.fixture
def ac_root(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
root = tmp_path / "openchronicle"
Expand Down
12 changes: 12 additions & 0 deletions tests/fixtures/s1/chrome_url/expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"focused_element": {
"role": "AXTextField",
"title": "Address and search bar",
"value": "https://www.anthropic.com/news",
"is_editable": true,
"has_value": true,
"value_length": 30
},
"visible_text": "## Google Chrome [active]\n_com.google.Chrome_\n### Anthropic — Claude Code\n- [TextField] Address and search bar — https://www.anthropic.com/news",
"url": "https://www.anthropic.com/news"
}
24 changes: 24 additions & 0 deletions tests/fixtures/s1/chrome_url/input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"ax_tree": {
"apps": [
{
"name": "Google Chrome",
"bundle_id": "com.google.Chrome",
"is_frontmost": true,
"windows": [
{
"title": "Anthropic — Claude Code",
"focused": true,
"elements": [
{
"role": "AXTextField",
"title": "Address and search bar",
"value": "https://www.anthropic.com/news"
}
]
}
]
}
]
}
}
12 changes: 12 additions & 0 deletions tests/fixtures/s1/generic_cursor_textarea/expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"focused_element": {
"role": "AXTextArea",
"title": "editor",
"value": "def enrich(capture):\n    ...",
"is_editable": true,
"has_value": true,
"value_length": 28
},
"visible_text": "## Cursor [active]\n_com.todesktop.230313mzl4w4u92_\n### s1_parser.py\n- [TextArea] editor — def enrich(capture):\n ...",
"url": null
}
Loading