Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## Unreleased

### Changed

- Extract codepoint/range parsing into `check_unicode.parsing` module for reuse
- Codepoint parser now validates the Unicode range (0..U+10FFFF) and rejects
empty/invalid input with clear error messages
- Range parser now rejects inverted ranges (lo > hi) and tolerates whitespace
around the dash separator

## 0.5.0 - 2026-03-10

### Fixed
Expand Down
30 changes: 5 additions & 25 deletions src/check_unicode/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@
from check_unicode.checker import AllowConfig, Finding, check_confusables, check_file
from check_unicode.fixer import fix_file
from check_unicode.output import print_findings
from check_unicode.parsing import parse_codepoint, parse_range
from check_unicode.scripts import KNOWN_SCRIPTS

_EXPECTED_RANGE_PARTS = 2

# Unicode general categories: abbreviation -> (full name, description).
# Covers all 30 categories from the Unicode standard.
UNICODE_CATEGORIES: dict[str, tuple[str, str]] = {
Expand Down Expand Up @@ -69,25 +68,6 @@ class Override:
check_confusables: bool | None # None = inherit global


def _parse_codepoint(s: str) -> int:
"""Parse 'U+XXXX' or '0xXXXX' into an integer codepoint."""
s = s.strip()
for prefix in ("U+", "u+", "0x", "0X"):
if s.startswith(prefix):
s = s[len(prefix) :]
break
return int(s, 16)


def _parse_range(s: str) -> tuple[int, int]:
    """Split ``'U+XXXX-U+YYYY'`` on its first hyphen and parse both ends.

    Returns the ``(lo, hi)`` pair of integer codepoints.

    Raises:
        argparse.ArgumentTypeError: If the string contains no hyphen.

    """
    pieces = s.split("-", 1)
    if len(pieces) == _EXPECTED_RANGE_PARTS:
        lo_text, hi_text = pieces
        return _parse_codepoint(lo_text), _parse_codepoint(hi_text)
    raise argparse.ArgumentTypeError(
        f"Invalid range: {s!r} (expected U+XXXX-U+YYYY)"
    )


def _discover_config() -> dict[str, Any] | None:
"""Auto-discover .check-unicode.toml or [tool.check-unicode] in pyproject.toml."""
cwd = Path.cwd()
Expand Down Expand Up @@ -128,10 +108,10 @@ def _allow_from_config(
) -> tuple[set[int], list[tuple[int, int]], set[str], bool, set[str]]:
"""Extract allow-lists from a parsed config dictionary."""
codepoints: set[int] = {
_parse_codepoint(cp_str) for cp_str in config.get("allow-codepoints", [])
parse_codepoint(cp_str) for cp_str in config.get("allow-codepoints", [])
}
ranges: list[tuple[int, int]] = [
_parse_range(r_str) for r_str in config.get("allow-ranges", [])
parse_range(r_str) for r_str in config.get("allow-ranges", [])
]
categories: set[str] = set(config.get("allow-categories", []))
printable: bool = config.get("allow-printable", False)
Expand Down Expand Up @@ -199,9 +179,9 @@ def _build_allow_config(
if args.allow_codepoint:
for item in args.allow_codepoint:
for cp_str in item.split(","):
codepoints.add(_parse_codepoint(cp_str))
codepoints.add(parse_codepoint(cp_str))
if args.allow_range:
ranges.extend(_parse_range(r_str) for r_str in args.allow_range)
ranges.extend(parse_range(r_str) for r_str in args.allow_range)
if args.allow_category:
categories.update(args.allow_category)
if args.allow_printable:
Expand Down
70 changes: 70 additions & 0 deletions src/check_unicode/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Codepoint and range parsing for user input (CLI args and config files)."""

from __future__ import annotations

_MAX_UNICODE = 0x10FFFF
_CODEPOINT_PREFIXES = ("U+", "u+", "0x", "0X")
_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")


def parse_codepoint(s: str) -> int:
    """Parse a Unicode codepoint string into an integer.

    Accepted formats: ``U+XXXX``, ``u+xxxx``, ``0xXXXX``, bare hex digits.
    Surrounding whitespace is ignored.  After an optional two-character
    prefix, the remainder must consist solely of ASCII hexadecimal digits.

    Args:
        s: The user-supplied codepoint text.

    Returns:
        The codepoint as an integer in the range 0..0x10FFFF.

    Raises:
        ValueError: If the string is empty, not valid hex, or out of the
            Unicode range (0..U+10FFFF).

    """
    raw = s.strip()
    if not raw:
        msg = "Codepoint string is empty"
        raise ValueError(msg)

    digits = raw[2:] if raw.startswith(_CODEPOINT_PREFIXES) else raw

    # Validate explicitly instead of relying on int(): int(x, 16) also
    # accepts a sign, underscores, inner whitespace, a second "0x" prefix
    # and non-ASCII decimal digits, none of which are documented formats.
    if not digits or not _HEX_DIGITS.issuperset(digits):
        msg = f"Invalid codepoint: {raw!r}"
        raise ValueError(msg)

    value = int(digits, 16)
    if value > _MAX_UNICODE:
        msg = f"Codepoint {raw!r} is outside the valid Unicode range (0..U+10FFFF)"
        raise ValueError(msg)

    return value


def parse_range(s: str) -> tuple[int, int]:
    """Turn a ``start-end`` codepoint range string into a ``(lo, hi)`` pair.

    The text is stripped and divided at its final hyphen; each side is then
    handed to ``parse_codepoint``, so ``U+XXXX-U+YYYY``, ``0xXXXX-0xYYYY``
    and bare-hex forms such as ``A0-FF`` are all understood.

    Raises:
        ValueError: If there is no hyphen or nothing precedes it, if either
            side is not a valid codepoint, or if the start exceeds the end.

    """
    text = s.strip()
    start_text, dash, end_text = text.rpartition("-")
    # A missing separator or an empty left-hand side is not a range.
    if not dash or not start_text:
        raise ValueError(f"Invalid range: {text!r} (expected U+XXXX-U+YYYY)")

    lo = parse_codepoint(start_text)
    hi = parse_codepoint(end_text)

    if hi < lo:
        raise ValueError(
            f"Inverted range: lo U+{lo:04X} > hi U+{hi:04X} (start must be <= end)"
        )

    return lo, hi
23 changes: 0 additions & 23 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
_build_parser,
_file_matches_override,
_is_excluded,
_parse_codepoint,
_parse_range,
_resolve_allow_for_file,
_resolve_file_settings,
main,
Expand Down Expand Up @@ -258,27 +256,6 @@ def test_multiple_clean_files(self) -> None:
)


class TestParseCodepoint:
    """Checks for the private codepoint-parsing helper."""

    def test_parse_hex_prefix(self) -> None:
        """A ``0x``-prefixed string yields the expected codepoint."""
        parsed = _parse_codepoint("0x00B0")
        assert parsed == 0x00B0

    def test_parse_bare_hex(self) -> None:
        """Hex digits with no prefix yield the expected codepoint."""
        parsed = _parse_codepoint("00B0")
        assert parsed == 0x00B0


class TestParseRange:
    """Checks for the private range-parsing helper."""

    def test_invalid_range_raises(self) -> None:
        """A string without a hyphen is reported as an invalid range."""
        expectation = pytest.raises(Exception, match="Invalid range")
        with expectation:
            _parse_range("NOPE")


class TestConfigDiscovery:
"""Tests for automatic config file discovery."""

Expand Down
126 changes: 126 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Tests for check_unicode.parsing -- codepoint and range parsing."""

from __future__ import annotations

import pytest

from check_unicode.parsing import parse_codepoint, parse_range


class TestParseCodepoint:
    """Exercise parse_codepoint() with accepted and rejected inputs."""

    def test_u_plus_prefix(self) -> None:
        """An uppercase ``U+`` prefix is understood."""
        parsed = parse_codepoint("U+00B0")
        assert parsed == 0x00B0

    def test_u_plus_lowercase(self) -> None:
        """A lowercase ``u+`` prefix with lowercase digits is understood."""
        parsed = parse_codepoint("u+00b0")
        assert parsed == 0x00B0

    def test_hex_prefix(self) -> None:
        """A ``0x`` prefix is understood."""
        parsed = parse_codepoint("0x00B0")
        assert parsed == 0x00B0

    def test_bare_hex(self) -> None:
        """Hex digits without any prefix are understood."""
        parsed = parse_codepoint("00B0")
        assert parsed == 0x00B0

    def test_short_bare_hex(self) -> None:
        """A two-digit bare hex value such as 'A0' is understood."""
        parsed = parse_codepoint("A0")
        assert parsed == 0xA0

    def test_max_unicode(self) -> None:
        """The highest codepoint, U+10FFFF, is accepted."""
        parsed = parse_codepoint("U+10FFFF")
        assert parsed == 0x10FFFF

    def test_zero(self) -> None:
        """The lowest codepoint, U+0000, is accepted."""
        parsed = parse_codepoint("U+0000")
        assert parsed == 0

    def test_strips_whitespace(self) -> None:
        """Whitespace on either side of the value is ignored."""
        parsed = parse_codepoint(" U+00B0 ")
        assert parsed == 0x00B0

    def test_rejects_empty_string(self) -> None:
        """An empty string is rejected with a ValueError."""
        expectation = pytest.raises(ValueError, match="empty")
        with expectation:
            parse_codepoint("")

    def test_rejects_whitespace_only(self) -> None:
        """A string containing only whitespace is rejected."""
        expectation = pytest.raises(ValueError, match="empty")
        with expectation:
            parse_codepoint(" ")

    def test_rejects_above_max_unicode(self) -> None:
        """A value past U+10FFFF is rejected."""
        expectation = pytest.raises(ValueError, match=r"U\+10FFFF")
        with expectation:
            parse_codepoint("U+110000")

    def test_rejects_negative(self) -> None:
        """A negative value is rejected."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nvalid|outside")
        with expectation:
            parse_codepoint("-1")

    def test_rejects_non_hex(self) -> None:
        """Characters that are not hex digits are rejected."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nvalid")
        with expectation:
            parse_codepoint("ZZZZ")

    def test_rejects_bare_u_plus(self) -> None:
        """A ``U+`` prefix followed by nothing is rejected."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nvalid")
        with expectation:
            parse_codepoint("U+")

    def test_mixed_case_hex_digits(self) -> None:
        """Upper- and lowercase hex digits may be mixed."""
        parsed = parse_codepoint("U+00aB")
        assert parsed == 0x00AB


class TestParseRange:
    """Exercise parse_range() with accepted and rejected inputs."""

    def test_u_plus_format(self) -> None:
        """Both endpoints may carry a ``U+`` prefix."""
        bounds = parse_range("U+00A0-U+00FF")
        assert bounds == (0x00A0, 0x00FF)

    def test_hex_prefix_format(self) -> None:
        """Both endpoints may carry a ``0x`` prefix."""
        bounds = parse_range("0x00A0-0x00FF")
        assert bounds == (0x00A0, 0x00FF)

    def test_bare_hex_format(self) -> None:
        """Both endpoints may be bare hex digits."""
        bounds = parse_range("00A0-00FF")
        assert bounds == (0x00A0, 0x00FF)

    def test_whitespace_around_dash(self) -> None:
        """Spaces on either side of the dash are ignored."""
        bounds = parse_range("U+00A0 - U+00FF")
        assert bounds == (0x00A0, 0x00FF)

    def test_single_codepoint_range(self) -> None:
        """Equal start and end values form a valid range."""
        bounds = parse_range("U+00B0-U+00B0")
        assert bounds == (0x00B0, 0x00B0)

    def test_rejects_inverted_range(self) -> None:
        """A start greater than the end is rejected."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nverted|start.*end")
        with expectation:
            parse_range("U+00FF-U+00A0")

    def test_rejects_no_dash(self) -> None:
        """A lone codepoint with no dash is rejected."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nvalid range")
        with expectation:
            parse_range("U+00A0")

    def test_rejects_empty(self) -> None:
        """An empty string is rejected."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nvalid range|empty")
        with expectation:
            parse_range("")

    def test_rejects_out_of_range(self) -> None:
        """An endpoint past U+10FFFF is rejected."""
        expectation = pytest.raises(ValueError, match=r"U\+10FFFF")
        with expectation:
            parse_range("U+0000-U+110000")

    def test_rejects_multiple_hyphens(self) -> None:
        """More than one hyphen is rejected with an 'invalid' error."""
        expectation = pytest.raises(ValueError, match=r"[Ii]nvalid")
        with expectation:
            parse_range("A0-B0-C0")
Loading