diff --git a/CHANGELOG.md b/CHANGELOG.md index 998024b..7a34848 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## Unreleased + +### Changed + +- Extract codepoint/range parsing into `check_unicode.parsing` module for reuse +- Codepoint parser now validates the Unicode range (0..U+10FFFF) and rejects + empty/invalid input with clear error messages +- Range parser now rejects inverted ranges (lo > hi) and tolerates whitespace + around the dash separator + ## 0.5.0 - 2026-03-10 ### Fixed diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py index 8c0ac89..8aa5053 100644 --- a/src/check_unicode/main.py +++ b/src/check_unicode/main.py @@ -15,10 +15,9 @@ from check_unicode.checker import AllowConfig, Finding, check_confusables, check_file from check_unicode.fixer import fix_file from check_unicode.output import print_findings +from check_unicode.parsing import parse_codepoint, parse_range from check_unicode.scripts import KNOWN_SCRIPTS -_EXPECTED_RANGE_PARTS = 2 - # Unicode general categories: abbreviation -> (full name, description). # Covers all 30 categories from the Unicode standard. 
UNICODE_CATEGORIES: dict[str, tuple[str, str]] = { @@ -69,25 +68,6 @@ class Override: check_confusables: bool | None # None = inherit global -def _parse_codepoint(s: str) -> int: - """Parse 'U+XXXX' or '0xXXXX' into an integer codepoint.""" - s = s.strip() - for prefix in ("U+", "u+", "0x", "0X"): - if s.startswith(prefix): - s = s[len(prefix) :] - break - return int(s, 16) - - -def _parse_range(s: str) -> tuple[int, int]: - """Parse 'U+XXXX-U+YYYY' into a (lo, hi) tuple.""" - parts = s.split("-", 1) - if len(parts) != _EXPECTED_RANGE_PARTS: - msg = f"Invalid range: {s!r} (expected U+XXXX-U+YYYY)" - raise argparse.ArgumentTypeError(msg) - return _parse_codepoint(parts[0]), _parse_codepoint(parts[1]) - - def _discover_config() -> dict[str, Any] | None: """Auto-discover .check-unicode.toml or [tool.check-unicode] in pyproject.toml.""" cwd = Path.cwd() @@ -128,10 +108,10 @@ def _allow_from_config( ) -> tuple[set[int], list[tuple[int, int]], set[str], bool, set[str]]: """Extract allow-lists from a parsed config dictionary.""" codepoints: set[int] = { - _parse_codepoint(cp_str) for cp_str in config.get("allow-codepoints", []) + parse_codepoint(cp_str) for cp_str in config.get("allow-codepoints", []) } ranges: list[tuple[int, int]] = [ - _parse_range(r_str) for r_str in config.get("allow-ranges", []) + parse_range(r_str) for r_str in config.get("allow-ranges", []) ] categories: set[str] = set(config.get("allow-categories", [])) printable: bool = config.get("allow-printable", False) @@ -199,9 +179,9 @@ def _build_allow_config( if args.allow_codepoint: for item in args.allow_codepoint: for cp_str in item.split(","): - codepoints.add(_parse_codepoint(cp_str)) + codepoints.add(parse_codepoint(cp_str)) if args.allow_range: - ranges.extend(_parse_range(r_str) for r_str in args.allow_range) + ranges.extend(parse_range(r_str) for r_str in args.allow_range) if args.allow_category: categories.update(args.allow_category) if args.allow_printable: diff --git 
_MAX_UNICODE = 0x10FFFF

# ASCII hex digits only.  ``int(text, 16)`` on its own is too lenient for user
# input: it also accepts a sign, a nested ``0x`` prefix, ``_`` separators,
# surrounding whitespace and non-ASCII decimal digits.
_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
_CODEPOINT_PREFIXES = ("u+", "0x")


def parse_codepoint(s: str) -> int:
    """Parse a Unicode codepoint string into an integer.

    Accepted formats: ``U+XXXX``, ``u+xxxx``, ``0xXXXX``, ``0XXXXX`` and bare
    hex digits.  Leading and trailing whitespace is ignored.  After the
    optional prefix, the remainder must consist solely of ASCII hex digits.

    Args:
        s: The user-supplied codepoint text.

    Returns:
        The codepoint as an integer in ``0..0x10FFFF``.

    Raises:
        ValueError: If the string is empty, is not strictly hexadecimal
            (this includes signs, underscores, inner whitespace and doubled
            prefixes such as ``U+0x41``), or is outside the Unicode range
            (0..U+10FFFF).

    """
    text = s.strip()
    if not text:
        msg = "Codepoint string is empty"
        raise ValueError(msg)

    # Drop at most one prefix, matched case-insensitively.
    digits = text[2:] if text[:2].lower() in _CODEPOINT_PREFIXES else text

    # Validate the digits ourselves instead of trusting int(); see the note
    # on _HEX_DIGITS.  An empty remainder (bare "U+") is rejected here too.
    if not digits or not _HEX_DIGITS.issuperset(digits):
        msg = f"Invalid codepoint: {text!r}"
        raise ValueError(msg)

    value = int(digits, 16)
    # A negative value is impossible now that signs are rejected above.
    if value > _MAX_UNICODE:
        msg = f"Codepoint {text!r} is outside the valid Unicode range (0..U+10FFFF)"
        raise ValueError(msg)

    return value


def parse_range(s: str) -> tuple[int, int]:
    """Parse a Unicode range string into a (lo, hi) tuple.

    Accepted formats: ``U+XXXX-U+YYYY``, ``0xXXXX-0xYYYY`` and bare hex such
    as ``A0-FF``.  Whitespace around the whole string and around the hyphen
    is tolerated.  Neither a codepoint nor its prefix can contain a hyphen,
    so a valid range contains exactly one.

    Args:
        s: The user-supplied range text.

    Returns:
        ``(lo, hi)`` with ``lo <= hi``; both ends are inclusive codepoints.

    Raises:
        ValueError: If the string does not have exactly one hyphen with a
            non-blank endpoint on each side, either endpoint is not a valid
            codepoint, or lo > hi.

    """
    text = s.strip()
    lo_text, dash, hi_text = text.partition("-")

    # Reject structural problems up front so the message talks about the
    # range as a whole rather than about an empty or hyphenated "codepoint".
    malformed = (
        not dash
        or "-" in hi_text
        or not lo_text.strip()
        or not hi_text.strip()
    )
    if malformed:
        msg = f"Invalid range: {text!r} (expected U+XXXX-U+YYYY)"
        raise ValueError(msg)

    lo = parse_codepoint(lo_text)
    hi = parse_codepoint(hi_text)

    if lo > hi:
        msg = f"Inverted range: lo U+{lo:04X} > hi U+{hi:04X} (start must be <= end)"
        raise ValueError(msg)

    return lo, hi
None: + """u+xxxx format parses correctly.""" + assert parse_codepoint("u+00b0") == 0x00B0 + + def test_hex_prefix(self) -> None: + """0xXXXX format parses correctly.""" + assert parse_codepoint("0x00B0") == 0x00B0 + + def test_bare_hex(self) -> None: + """Bare hex digits parse correctly.""" + assert parse_codepoint("00B0") == 0x00B0 + + def test_short_bare_hex(self) -> None: + """Short bare hex like 'A0' parses correctly.""" + assert parse_codepoint("A0") == 0xA0 + + def test_max_unicode(self) -> None: + """U+10FFFF (max valid codepoint) is accepted.""" + assert parse_codepoint("U+10FFFF") == 0x10FFFF + + def test_zero(self) -> None: + """U+0000 is a valid codepoint.""" + assert parse_codepoint("U+0000") == 0 + + def test_strips_whitespace(self) -> None: + """Leading/trailing whitespace is stripped.""" + assert parse_codepoint(" U+00B0 ") == 0x00B0 + + def test_rejects_empty_string(self) -> None: + """Empty string raises ValueError.""" + with pytest.raises(ValueError, match="empty"): + parse_codepoint("") + + def test_rejects_whitespace_only(self) -> None: + """Whitespace-only string raises ValueError.""" + with pytest.raises(ValueError, match="empty"): + parse_codepoint(" ") + + def test_rejects_above_max_unicode(self) -> None: + """Values above U+10FFFF raise ValueError.""" + with pytest.raises(ValueError, match=r"U\+10FFFF"): + parse_codepoint("U+110000") + + def test_rejects_negative(self) -> None: + """Negative values raise ValueError.""" + with pytest.raises(ValueError, match=r"[Ii]nvalid|outside"): + parse_codepoint("-1") + + def test_rejects_non_hex(self) -> None: + """Non-hex strings raise ValueError.""" + with pytest.raises(ValueError, match=r"[Ii]nvalid"): + parse_codepoint("ZZZZ") + + def test_rejects_bare_u_plus(self) -> None: + """'U+' with no digits raises ValueError.""" + with pytest.raises(ValueError, match=r"[Ii]nvalid"): + parse_codepoint("U+") + + def test_mixed_case_hex_digits(self) -> None: + """Mixed case hex digits work.""" + assert 
parse_codepoint("U+00aB") == 0x00AB + + +class TestParseRange: + """Tests for parse_range().""" + + def test_u_plus_format(self) -> None: + """U+XXXX-U+YYYY format parses correctly.""" + assert parse_range("U+00A0-U+00FF") == (0x00A0, 0x00FF) + + def test_hex_prefix_format(self) -> None: + """0xXXXX-0xYYYY format parses correctly.""" + assert parse_range("0x00A0-0x00FF") == (0x00A0, 0x00FF) + + def test_bare_hex_format(self) -> None: + """Bare hex A0-FF parses correctly.""" + assert parse_range("00A0-00FF") == (0x00A0, 0x00FF) + + def test_whitespace_around_dash(self) -> None: + """Whitespace around the dash is tolerated.""" + assert parse_range("U+00A0 - U+00FF") == (0x00A0, 0x00FF) + + def test_single_codepoint_range(self) -> None: + """A range where lo == hi is valid.""" + assert parse_range("U+00B0-U+00B0") == (0x00B0, 0x00B0) + + def test_rejects_inverted_range(self) -> None: + """Inverted range (hi < lo) raises ValueError.""" + with pytest.raises(ValueError, match=r"[Ii]nverted|start.*end"): + parse_range("U+00FF-U+00A0") + + def test_rejects_no_dash(self) -> None: + """Single value without dash raises ValueError.""" + with pytest.raises(ValueError, match=r"[Ii]nvalid range"): + parse_range("U+00A0") + + def test_rejects_empty(self) -> None: + """Empty string raises ValueError.""" + with pytest.raises(ValueError, match=r"[Ii]nvalid range|empty"): + parse_range("") + + def test_rejects_out_of_range(self) -> None: + """Out-of-range codepoint in range raises ValueError.""" + with pytest.raises(ValueError, match=r"U\+10FFFF"): + parse_range("U+0000-U+110000") + + def test_rejects_multiple_hyphens(self) -> None: + """Multiple hyphens produce a clear error.""" + with pytest.raises(ValueError, match=r"[Ii]nvalid"): + parse_range("A0-B0-C0")