# Matches a single step in a JSONPath: .key or [index] or ['key'] or ["key"].
# Every alternative captures its payload in its own named group, so
# ``Match.lastgroup`` tells us which form matched.
_STEP = re.compile(
    r"\.(?P<name>[^.\[]+)"  # .key
    r"|\[(?P<idx>\d+)\]"  # [0]
    r"|\[\'(?P<sq>[^\']*)\'\]"  # ['key']
    r'|\["(?P<dq>[^"]*)"\]'  # ["key"]
)

# Pattern to extract JSONPath from msgspec-style error messages: "... - at `$.foo.bar`"
_AT_PATH = re.compile(r" - at `(\$[^`]*)`")


def jsonpath_to_pointer(jsonpath: str) -> str:
    """Convert a JSONPath string like ``$.foo[0].bar`` to a JSON Pointer like ``/foo/0/bar``.

    Only supports simple dot-notation and bracket forms (``.key``, ``[0]``,
    ``['key']``, ``["key"]``). Does not support filter expressions or
    wildcards.

    Args:
        jsonpath: A JSONPath string starting with ``$``.

    Returns:
        A JSON Pointer string (RFC 6901), e.g. ``/foo/0/bar``. The root path
        ``$`` maps to the empty pointer ``""``.

    Raises:
        ValueError: If ``jsonpath`` does not start with ``$`` or contains a
            step that is not one of the supported forms.
    """
    if not jsonpath.startswith("$"):
        raise ValueError(f"JSONPath must start with '$', got: {jsonpath!r}")

    tail = jsonpath[1:]  # strip leading $
    parts: list[str] = []

    pos = 0
    while pos < len(tail):
        m = _STEP.match(tail, pos)
        if m is None:
            raise ValueError(
                f"Cannot parse JSONPath step at position {pos}: {tail[pos:]!r}"
            )
        # Select the captured text by group name rather than by truthiness:
        # an empty quoted key (``['']``) is a legal object key and must not be
        # mistaken for "this alternative did not match".
        parts.append(_escape(m.group(m.lastgroup)))
        pos = m.end()

    # Each reference token is prefixed by "/"; no tokens means the root pointer.
    return "".join(f"/{part}" for part in parts)


def extract_jsonpath(message: str) -> str | None:
    """Extract a JSONPath expression from an error message.

    Looks for the pattern ``- at `$.path``` as used by msgspec.

    Args:
        message: The exception message string.

    Returns:
        The JSONPath string of the first match, otherwise ``None``.
    """
    m = _AT_PATH.search(message)
    return m.group(1) if m else None


def _escape(segment: str) -> str:
    """Escape one reference token for use in a JSON Pointer.

    ``~`` must be rewritten before ``/``; otherwise the ``~`` introduced by
    ``~1`` would itself be escaped a second time.
    """
    return segment.replace("~", "~0").replace("/", "~1")


class ParseError(Exception):
    """A parse or validation error augmented with filename and line number.

    The constructor arguments are stored as the attributes ``filename``
    (always converted to ``str``), ``line`` and ``column``; ``message``
    becomes the exception text.
    """

    def __init__(
        self, message: str, filename: str | os.PathLike[str], line: int, column: int = 0
    ):
        self.filename = str(filename)
        self.line = line
        self.column = column
        super().__init__(message)


@contextlib.contextmanager
def ParseContext(
    filename: str | os.PathLike[str],
    *,
    data: str | bytes | None = None,
    format: str | None = None,
) -> Iterator[None]:
    """Context manager that re-raises parse/validation errors with location info.

    Catches exceptions whose message contains a JSONPath (e.g. as emitted by
    msgspec) and re-raises a :class:`ParseError` whose message is prefixed
    with ``filename:line:column`` (both 1-based), derived from the file's
    source map. Exceptions without a JSONPath in their message propagate
    unchanged, as do errors whose path cannot be converted or located.

    Args:
        filename: Path to the file being parsed.
        data: The file contents as a string or bytes (UTF-8). If provided, the
            file is not read from disk. Regardless of type, reported
            locations (line, column) are always in characters, not bytes.
        format: One of ``"json"``, ``"yaml"``, or ``"toml"``. If omitted the
            format is inferred from the file extension.

    Raises:
        ParseError: In place of a caught exception that could be located.
        ValueError: If ``format`` is omitted and cannot be inferred.
    """
    try:
        yield
    except Exception as exc:
        message = str(exc)
        # This is focused on msgspec-style exceptions, which use JSONPath.
        # If there are other formats we know can be raised, adjust this.
        jsonpath = extract_jsonpath(message)
        if jsonpath is None:
            raise

        try:
            pointer = jsonpath_to_pointer(jsonpath)
        except ValueError:  # pragma: no cover
            pointer = None
        if pointer is None:  # pragma: no cover
            # Unparseable path: surface the original error untouched.
            raise

        path = Path(filename)
        # ``detect_format``, ``build_source_map`` and ``closest_entry`` come
        # from the sibling ``source_map`` module.
        fmt = format or detect_format(path)
        if fmt is None:
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            raise ValueError(
                f"Cannot determine the format of {path}; pass format= explicitly"
            ) from exc

        source = data if data is not None else path.read_bytes()
        source_map = build_source_map(source, fmt)

        entry = closest_entry(source_map, pointer)
        if entry is None:  # pragma: no cover
            raise

        loc = entry.value_start
        # Lines are 0-based in source maps; convert to 1-based for humans
        # (the column is shifted the same way).
        line = loc.line + 1
        column = loc.column + 1
        raise ParseError(
            f"{path}:{line}:{column}: {message}",
            filename=path,
            line=line,
            column=column,
        ) from exc
class Config(msgspec.Struct):
    # Flat record the TOML fixtures below are converted into.
    host: str
    port: int


class Nested(msgspec.Struct):
    # Same record, nested under a ``server`` key.
    server: Config


# ``port`` is deliberately a string so conversion to ``Config`` fails on line 2.
TOML_SOURCE = """\
host = "localhost"
port = "not-an-int"
"""

# Same failure, one table deeper: the bad value sits on line 3.
TOML_NESTED_SOURCE = """\
[server]
host = "localhost"
port = "not-an-int"
"""


def test_passthrough_non_jsonspec():
    # An error whose message carries no JSONPath must come out unchanged.
    with pytest.raises(ValueError, match="^foo$"), ParseContext(
        "config.toml", data=TOML_SOURCE
    ):
        raise ValueError("foo")


def test_toml_raises_parse_error():
    with pytest.raises(ParseError) as exc_info, ParseContext(
        "config.toml", data=TOML_SOURCE
    ):
        msgspec.convert(tomllib.loads(TOML_SOURCE), Config)

    raised = exc_info.value
    assert raised.filename == "config.toml"
    assert raised.line == 2
    assert str(raised) == "config.toml:2:8: Expected `int`, got `str` - at `$.port`"


def test_toml_bytes_data():
    # The source is handed over as bytes; the reported location must match
    # the ``str`` case.
    encoded = TOML_SOURCE.encode()
    with pytest.raises(ParseError) as exc_info, ParseContext(
        "config.toml", data=encoded
    ):
        msgspec.convert(tomllib.loads(TOML_SOURCE), Config)

    expected = "config.toml:2:8: Expected `int`, got `str` - at `$.port`"
    assert str(exc_info.value) == expected


def test_toml_nested_raises_parse_error():
    with pytest.raises(ParseError) as exc_info, ParseContext(
        "config.toml", data=TOML_NESTED_SOURCE
    ):
        msgspec.convert(tomllib.loads(TOML_NESTED_SOURCE), Nested)

    rendered = str(exc_info.value)
    assert rendered == "config.toml:3:8: Expected `int`, got `str` - at `$.server.port`"


# --- fallback to nearest parent pointer ---

FALLBACK_SOURCE = """\
[server]
port = 8080
"""


def test_toml_fallback_to_parent():
    # Inject a fake error at a path deeper than the source map tracks.
    # /server/tls/cert doesn't exist; should fall back to /server (line 1).
    fake_error = msgspec.ValidationError(
        "Expected `str`, got `int` - at `$.server.tls.cert`"
    )
    with pytest.raises(ParseError) as exc_info, ParseContext(
        "config.toml", data=FALLBACK_SOURCE
    ):
        raise fake_error
    assert exc_info.value.line == 1