mit-d · mit-d · Mar 21, 2026 · Mar 21, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## Unreleased
+
+### Changed
+
+- Extract codepoint/range parsing into `check_unicode.parsing` module for reuse
+- Codepoint parser now validates the Unicode range (0..U+10FFFF) and rejects
+  empty/invalid input with clear error messages
+- Range parser now rejects inverted ranges (lo > hi) and tolerates whitespace
+  around the dash separator
+
 ## 0.5.0 - 2026-03-10
 
 ### Fixed

diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py
@@ -15,10 +15,9 @@
 from check_unicode.checker import AllowConfig, Finding, check_confusables, check_file
 from check_unicode.fixer import fix_file
 from check_unicode.output import print_findings
+from check_unicode.parsing import parse_codepoint, parse_range
 from check_unicode.scripts import KNOWN_SCRIPTS
 
-_EXPECTED_RANGE_PARTS = 2
-
 # Unicode general categories: abbreviation -> (full name, description).
 # Covers all 30 categories from the Unicode standard.
 UNICODE_CATEGORIES: dict[str, tuple[str, str]] = {
@@ -69,25 +68,6 @@ class Override:
     check_confusables: bool | None  # None = inherit global
 
 
-def _parse_codepoint(s: str) -> int:
-    """Parse 'U+XXXX' or '0xXXXX' into an integer codepoint."""
-    s = s.strip()
-    for prefix in ("U+", "u+", "0x", "0X"):
-        if s.startswith(prefix):
-            s = s[len(prefix) :]
-            break
-    return int(s, 16)
-
-
-def _parse_range(s: str) -> tuple[int, int]:
-    """Parse 'U+XXXX-U+YYYY' into a (lo, hi) tuple."""
-    parts = s.split("-", 1)
-    if len(parts) != _EXPECTED_RANGE_PARTS:
-        msg = f"Invalid range: {s!r} (expected U+XXXX-U+YYYY)"
-        raise argparse.ArgumentTypeError(msg)
-    return _parse_codepoint(parts[0]), _parse_codepoint(parts[1])
-
-
 def _discover_config() -> dict[str, Any] | None:
     """Auto-discover .check-unicode.toml or [tool.check-unicode] in pyproject.toml."""
     cwd = Path.cwd()
@@ -128,10 +108,10 @@ def _allow_from_config(
 ) -> tuple[set[int], list[tuple[int, int]], set[str], bool, set[str]]:
     """Extract allow-lists from a parsed config dictionary."""
     codepoints: set[int] = {
-        _parse_codepoint(cp_str) for cp_str in config.get("allow-codepoints", [])
+        parse_codepoint(cp_str) for cp_str in config.get("allow-codepoints", [])
     }
     ranges: list[tuple[int, int]] = [
-        _parse_range(r_str) for r_str in config.get("allow-ranges", [])
+        parse_range(r_str) for r_str in config.get("allow-ranges", [])
     ]
     categories: set[str] = set(config.get("allow-categories", []))
     printable: bool = config.get("allow-printable", False)
@@ -199,9 +179,9 @@ def _build_allow_config(
     if args.allow_codepoint:
         for item in args.allow_codepoint:
             for cp_str in item.split(","):
-                codepoints.add(_parse_codepoint(cp_str))
+                codepoints.add(parse_codepoint(cp_str))
     if args.allow_range:
-        ranges.extend(_parse_range(r_str) for r_str in args.allow_range)
+        ranges.extend(parse_range(r_str) for r_str in args.allow_range)
     if args.allow_category:
         categories.update(args.allow_category)
     if args.allow_printable:

diff --git a/src/check_unicode/parsing.py b/src/check_unicode/parsing.py
@@ -0,0 +1,70 @@
+"""Codepoint and range parsing for user input (CLI args and config files)."""
+
+from __future__ import annotations
+
+_MAX_UNICODE = 0x10FFFF
+
+
+def parse_codepoint(s: str) -> int:
+    """Parse a Unicode codepoint string into an integer.
+
+    Accepted formats: ``U+XXXX``, ``u+xxxx``, ``0xXXXX``, bare hex digits.
+    The ``U+`` / ``0x`` prefixes are matched case-insensitively and
+    leading/trailing whitespace is ignored.
+
+    After the optional prefix only ASCII hex digits are accepted.  ``int(x, 16)``
+    on its own is more lenient: it also takes a sign (``+41``, ``-0``),
+    underscores (``1_0``), a further ``0x`` prefix (``U+0x41``), padding
+    whitespace (``U+ 41``) and non-ASCII decimal digits, none of which is a
+    documented codepoint format.
+
+    Raises:
+        ValueError: If the string is empty, not valid hex, or out of the
+            Unicode range (0..U+10FFFF).
+
+    """
+    raw = s.strip()
+    if not raw:
+        msg = "Codepoint string is empty"
+        raise ValueError(msg)
+
+    # Strip at most one prefix; "U+" / "0x" are matched case-insensitively.
+    digits = raw[2:] if raw[:2].lower() in ("u+", "0x") else raw
+
+    # Validate explicitly instead of relying on int() to raise: int() would
+    # accept signs, underscores, whitespace and Unicode digits here.
+    if not digits or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
+        msg = f"Invalid codepoint: {raw!r}"
+        raise ValueError(msg)
+
+    # Only the upper bound needs checking: a sign can no longer get this far.
+    value = int(digits, 16)
+    if value > _MAX_UNICODE:
+        msg = f"Codepoint {raw!r} is outside the valid Unicode range (0..U+10FFFF)"
+        raise ValueError(msg)
+
+    return value
+
+
+def parse_range(s: str) -> tuple[int, int]:
+    """Parse a Unicode range string into a (lo, hi) tuple.
+
+    Accepted formats: ``U+XXXX-U+YYYY``, ``0xXXXX-0xYYYY``, ``XXXX-YYYY``.
+    The string is split on its last hyphen and each side is handed to
+    ``parse_codepoint``, which strips surrounding whitespace, so spaces
+    around the hyphen are tolerated.
+
+    Raises:
+        ValueError: If the string cannot be split into two parts,
+            either part is invalid, or lo > hi.  Endpoint errors are
+            re-raised with the full range string for context.
+
+    """
+    s = s.strip()
+    idx = s.rfind("-")
+    # idx == -1: no hyphen at all; idx == 0: nothing before the hyphen.
+    if idx <= 0:
+        msg = f"Invalid range: {s!r} (expected U+XXXX-U+YYYY)"
+        raise ValueError(msg)
+
+    try:
+        lo = parse_codepoint(s[:idx])
+        hi = parse_codepoint(s[idx + 1 :])
+    except ValueError as err:
+        # Name the whole range: an endpoint message such as
+        # "Codepoint string is empty" does not say which entry is at fault.
+        msg = f"Invalid range: {s!r} ({err})"
+        raise ValueError(msg) from err
+
+    if lo > hi:
+        msg = f"Inverted range: lo U+{lo:04X} > hi U+{hi:04X} (start must be <= end)"
+        raise ValueError(msg)
+
+    return lo, hi
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -14,8 +14,6 @@
     _build_parser,
     _file_matches_override,
     _is_excluded,
-    _parse_codepoint,
-    _parse_range,
     _resolve_allow_for_file,
     _resolve_file_settings,
     main,
@@ -258,27 +256,6 @@ def test_multiple_clean_files(self) -> None:
         )
 
 
-class TestParseCodepoint:
-    """Tests for codepoint parsing helpers."""
-
-    def test_parse_hex_prefix(self) -> None:
-        """Codepoints with 0x prefix are parsed correctly."""
-        assert _parse_codepoint("0x00B0") == 0x00B0
-
-    def test_parse_bare_hex(self) -> None:
-        """Bare hex strings without prefix are parsed correctly."""
-        assert _parse_codepoint("00B0") == 0x00B0
-
-
-class TestParseRange:
-    """Tests for range parsing helpers."""
-
-    def test_invalid_range_raises(self) -> None:
-        """Invalid range strings raise ArgumentTypeError."""
-        with pytest.raises(Exception, match="Invalid range"):
-            _parse_range("NOPE")
-
-
 class TestConfigDiscovery:
     """Tests for automatic config file discovery."""
 

diff --git a/tests/test_parsing.py b/tests/test_parsing.py
@@ -0,0 +1,126 @@
+"""Tests for check_unicode.parsing -- codepoint and range parsing."""
+
+from __future__ import annotations
+
+import pytest
+
+from check_unicode.parsing import parse_codepoint, parse_range
+
+
+class TestParseCodepoint:
+    """Tests for parse_codepoint(): accepted formats, bounds, and rejections."""
+
+    def test_u_plus_prefix(self) -> None:
+        """U+XXXX format parses correctly."""
+        assert parse_codepoint("U+00B0") == 0x00B0
+
+    def test_u_plus_lowercase(self) -> None:
+        """u+xxxx format parses correctly."""
+        assert parse_codepoint("u+00b0") == 0x00B0
+
+    def test_hex_prefix(self) -> None:
+        """0xXXXX format parses correctly."""
+        assert parse_codepoint("0x00B0") == 0x00B0
+
+    def test_hex_prefix_uppercase(self) -> None:
+        """An uppercase 0X prefix parses the same as 0x."""
+        assert parse_codepoint("0X00B0") == 0x00B0
+
+    def test_bare_hex(self) -> None:
+        """Bare hex digits parse correctly."""
+        assert parse_codepoint("00B0") == 0x00B0
+
+    def test_short_bare_hex(self) -> None:
+        """Short bare hex like 'A0' parses correctly."""
+        assert parse_codepoint("A0") == 0xA0
+
+    def test_max_unicode(self) -> None:
+        """U+10FFFF (max valid codepoint) is accepted."""
+        assert parse_codepoint("U+10FFFF") == 0x10FFFF
+
+    def test_zero(self) -> None:
+        """U+0000 is a valid codepoint."""
+        assert parse_codepoint("U+0000") == 0
+
+    def test_strips_whitespace(self) -> None:
+        """Leading/trailing whitespace is stripped."""
+        assert parse_codepoint("  U+00B0  ") == 0x00B0
+
+    def test_rejects_empty_string(self) -> None:
+        """Empty string raises ValueError."""
+        with pytest.raises(ValueError, match="empty"):
+            parse_codepoint("")
+
+    def test_rejects_whitespace_only(self) -> None:
+        """Whitespace-only string raises ValueError."""
+        with pytest.raises(ValueError, match="empty"):
+            parse_codepoint("   ")
+
+    def test_rejects_above_max_unicode(self) -> None:
+        """Values above U+10FFFF raise ValueError."""
+        with pytest.raises(ValueError, match=r"U\+10FFFF"):
+            parse_codepoint("U+110000")
+
+    def test_rejects_negative(self) -> None:
+        """Negative values raise ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid|outside"):
+            parse_codepoint("-1")
+
+    def test_rejects_non_hex(self) -> None:
+        """Non-hex strings raise ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid"):
+            parse_codepoint("ZZZZ")
+
+    def test_rejects_bare_u_plus(self) -> None:
+        """'U+' with no digits raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid"):
+            parse_codepoint("U+")
+
+    def test_rejects_bare_hex_prefix(self) -> None:
+        """'0x' with no digits raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid"):
+            parse_codepoint("0x")
+
+    def test_mixed_case_hex_digits(self) -> None:
+        """Mixed case hex digits work."""
+        assert parse_codepoint("U+00aB") == 0x00AB
+
+
+class TestParseRange:
+    """Tests for parse_range(): accepted formats, ordering, and rejections."""
+
+    def test_u_plus_format(self) -> None:
+        """U+XXXX-U+YYYY format parses correctly."""
+        assert parse_range("U+00A0-U+00FF") == (0x00A0, 0x00FF)
+
+    def test_hex_prefix_format(self) -> None:
+        """0xXXXX-0xYYYY format parses correctly."""
+        assert parse_range("0x00A0-0x00FF") == (0x00A0, 0x00FF)
+
+    def test_bare_hex_format(self) -> None:
+        """Bare hex XXXX-YYYY (no prefix) parses correctly."""
+        assert parse_range("00A0-00FF") == (0x00A0, 0x00FF)
+
+    def test_whitespace_around_dash(self) -> None:
+        """Whitespace around the dash is tolerated."""
+        assert parse_range("U+00A0 - U+00FF") == (0x00A0, 0x00FF)
+
+    def test_strips_outer_whitespace(self) -> None:
+        """Leading/trailing whitespace around the whole range is stripped."""
+        assert parse_range("  U+00A0-U+00FF  ") == (0x00A0, 0x00FF)
+
+    def test_single_codepoint_range(self) -> None:
+        """A range where lo == hi is valid."""
+        assert parse_range("U+00B0-U+00B0") == (0x00B0, 0x00B0)
+
+    def test_rejects_inverted_range(self) -> None:
+        """Inverted range (hi < lo) raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nverted|start.*end"):
+            parse_range("U+00FF-U+00A0")
+
+    def test_rejects_no_dash(self) -> None:
+        """Single value without dash raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid range"):
+            parse_range("U+00A0")
+
+    def test_rejects_leading_hyphen(self) -> None:
+        """A range with nothing before the hyphen raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid range"):
+            parse_range("-U+00FF")
+
+    def test_rejects_trailing_hyphen(self) -> None:
+        """A range with nothing after the hyphen raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid range|empty"):
+            parse_range("U+00A0-")
+
+    def test_rejects_empty(self) -> None:
+        """Empty string raises ValueError."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid range|empty"):
+            parse_range("")
+
+    def test_rejects_out_of_range(self) -> None:
+        """Out-of-range codepoint in range raises ValueError."""
+        with pytest.raises(ValueError, match=r"U\+10FFFF"):
+            parse_range("U+0000-U+110000")
+
+    def test_rejects_multiple_hyphens(self) -> None:
+        """Multiple hyphens produce a clear error."""
+        with pytest.raises(ValueError, match=r"[Ii]nvalid"):
+            parse_range("A0-B0-C0")