mit-d · mit-d · Mar 11, 2026 · Mar 11, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,17 +2,37 @@
 
 ## Unreleased
 
+### Fixed
+
+- Validate `severity` values from config files and overrides (invalid values
+  like `"warn"` now exit 2 instead of silently behaving as warning)
+- Catch config file errors (missing file, invalid TOML) and exit 2 with a
+  friendly message instead of a raw traceback
+- Validate `--allow-category` and `--allow-script` values; unknown names now
+  exit 2 with a hint to use `--list-categories` or `--list-scripts`
+- Warn on unrecognised top-level config keys (e.g. typo `alow-codepoints`)
+- Remove dead `U+FFFD` entry from `REPLACEMENT_TABLE` (unreachable because
+  U+FFFD is in `DANGEROUS_INVISIBLE`, which is checked first)
+- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source
+  files)
+
+### Changed
+
+- Refactor `_apply_replacements` to use `str.translate()` for cleaner code and
+  better performance on large files
+- Read each file once when `--check-confusables` is enabled (previously
+  `check_file` and `check_confusables` each read the file independently)
+- Simplify `_parse_codepoint` to use prefix-stripping instead of a fragile
+  double-replace chain
+- Add `slots=True` to the `Override` dataclass for consistency with `Finding`
+  and `AllowConfig`
+
 ### Docs
 
 - Document per-file `[[tool.check-unicode.overrides]]` in README and man page
 - Update man page version to 0.4.0 and fix stale pre-commit `rev`
 - Add man page to `bump-my-version` files list
 
-### Fixed
-
-- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source
-  files)
-
 ## 0.4.0 - 2026-02-28
 
 ### Added

diff --git a/src/check_unicode/categories.py b/src/check_unicode/categories.py
@@ -57,6 +57,4 @@
     0x3000: " ",  # IDEOGRAPHIC SPACE
     # Ellipsis
     0x2026: "...",  # HORIZONTAL ELLIPSIS
-    # Replacement character
-    0xFFFD: "?",  # REPLACEMENT CHARACTER
 }
diff --git a/src/check_unicode/checker.py b/src/check_unicode/checker.py
@@ -4,6 +4,7 @@
 
 import re
 import unicodedata
+from collections import Counter
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -61,50 +62,58 @@ def _is_allowed(cp: int, cat: str, allow: AllowConfig) -> bool:
         return True
     if cp in DANGEROUS_INVISIBLE:
         return False
-    if allow.printable and chr(cp).isprintable():
-        return True
-    if allow.scripts and script_of(cp) in allow.scripts:
+
+    ch = chr(cp)
+    if allow.printable and ch.isprintable():
         return True
-    if any(lo <= cp <= hi for lo, hi in allow.ranges):
+
+    if allow.scripts:
+        script = script_of(cp)
+        if script in allow.scripts:
+            return True
+
+    if allow.ranges and any(lo <= cp <= hi for lo, hi in allow.ranges):
         return True
-    return any(cat.startswith(prefix) for prefix in allow.categories)
+
+    return bool(allow.categories) and any(
+        cat.startswith(prefix) for prefix in allow.categories
+    )
 
 
 def _char_name(cp: int) -> str:
-    try:
-        return unicodedata.name(chr(cp))
-    except ValueError:
-        return f"U+{cp:04X}"
+    return unicodedata.name(chr(cp), f"U+{cp:04X}")
 
 
 def check_file(
     path: str | Path,
     allow: AllowConfig | None = None,
+    *,
+    text: str | None = None,
 ) -> list[Finding]:
     """Scan a file for non-ASCII characters, returning findings."""
     if allow is None:
         allow = AllowConfig()
     filepath = str(path)
-    try:
-        text = Path(path).read_text(encoding="utf-8")
-    except (UnicodeDecodeError, OSError) as exc:
-        # Graceful handling of binary / unreadable files
-        return [
-            Finding(
-                file=filepath,
-                line=0,
-                col=0,
-                char="",
-                codepoint=0,
-                name=f"Could not read file: {exc}",
-                category="",
-                dangerous=False,
-            )
-        ]
+
+    if text is None:
+        try:
+            text = Path(path).read_text(encoding="utf-8")
+        except (UnicodeDecodeError, OSError) as exc:
+            return [
+                Finding(
+                    file=filepath,
+                    line=0,
+                    col=0,
+                    char="",
+                    codepoint=0,
+                    name=f"Could not read file: {exc}",
+                    category="",
+                    dangerous=False,
+                )
+            ]
 
     findings: list[Finding] = []
-    lines = text.splitlines()
-    for lineno, line in enumerate(lines, start=1):
+    for lineno, line in enumerate(text.splitlines(), start=1):
         for m in _ASCII_SAFE.finditer(line):
             col = m.start() + 1  # 1-indexed
             char = m.group()
@@ -152,9 +161,7 @@ def _check_line_confusables(
         return []
 
     # Count scripts to find dominant.
-    script_counts: dict[str, int] = {}
-    for _, _, script in letters:
-        script_counts[script] = script_counts.get(script, 0) + 1
+    script_counts = Counter(script for _, _, script in letters)
 
     if len(script_counts) < 2:  # noqa: PLR2004
         return []  # single script, no confusable risk
@@ -189,6 +196,8 @@ def _check_line_confusables(
 
 def check_confusables(
     path: str | Path,
+    *,
+    text: str | None = None,
 ) -> list[Finding]:
     """Detect mixed-script homoglyph/confusable characters in a file.
 
@@ -201,10 +210,12 @@ def check_confusables(
     --allow-script does NOT suppress confusable warnings.
     """
     filepath = str(path)
-    try:
-        text = Path(path).read_text(encoding="utf-8")
-    except (UnicodeDecodeError, OSError):
-        return []
+
+    if text is None:
+        try:
+            text = Path(path).read_text(encoding="utf-8")
+        except (UnicodeDecodeError, OSError):
+            return []
 
     findings: list[Finding] = []
     for lineno, line in enumerate(text.splitlines(), start=1):

diff --git a/src/check_unicode/fixer.py b/src/check_unicode/fixer.py
@@ -9,6 +9,11 @@
 
 from check_unicode.categories import DANGEROUS_INVISIBLE, REPLACEMENT_TABLE
 
+# Built once at import for str.translate(); dangerous codepoints are never auto-fixed.
+_TRANSLATE_TABLE: dict[int, str] = {
+    cp: repl for cp, repl in REPLACEMENT_TABLE.items() if cp not in DANGEROUS_INVISIBLE
+}
+
 
 def fix_file(path: str | Path) -> bool:
     """Replace fixable Unicode characters in a file with ASCII equivalents.
@@ -55,13 +60,4 @@ def _apply_replacements(text: str) -> str:
 
     Skips dangerous invisible characters -- those are never auto-fixed.
     """
-    out: list[str] = []
-    for ch in text:
-        cp = ord(ch)
-        if cp in DANGEROUS_INVISIBLE:
-            out.append(ch)
-        elif cp in REPLACEMENT_TABLE:
-            out.append(REPLACEMENT_TABLE[cp])
-        else:
-            out.append(ch)
-    return "".join(out)
+    return text.translate(_TRANSLATE_TABLE)
diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py
@@ -55,7 +55,7 @@
 }
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class Override:
     """Per-file override from [[tool.check-unicode.overrides]]."""
 
@@ -72,10 +72,10 @@ class Override:
 def _parse_codepoint(s: str) -> int:
     """Parse 'U+XXXX' or '0xXXXX' into an integer codepoint."""
     s = s.strip()
-    if s.upper().startswith("U+"):
-        return int(s.replace("U+", "0x", 1).replace("u+", "0x", 1), 0)
-    if s.lower().startswith("0x"):
-        return int(s, 0)
+    # str.startswith takes a tuple, so a single test covers every spelling.
+    if s.startswith(("U+", "u+", "0x", "0X")):
+        # Each accepted prefix is exactly two characters long.
+        s = s[2:]
     return int(s, 16)
 
 
@@ -139,6 +139,54 @@ def _allow_from_config(
     return codepoints, ranges, categories, printable, scripts
 
 
+_VALID_SEVERITIES: frozenset[str] = frozenset({"error", "warning"})
+# Recognised top-level config keys; any other key gets a stderr warning.
+_KNOWN_CONFIG_KEYS: frozenset[str] = frozenset(
+    {
+        "allow-codepoints",
+        "allow-ranges",
+        "allow-categories",
+        "allow-printable",
+        "allow-scripts",
+        "check-confusables",
+        "severity",
+        "exclude-patterns",
+        "overrides",
+    }
+)
+
+
+def _warn_unknown_keys(config: dict[str, Any]) -> None:
+    """Write a stderr warning for each top-level config key that is not recognised."""
+    for name in sorted(config.keys() - _KNOWN_CONFIG_KEYS):
+        line = f"warning: unknown config key {name!r}\n"
+        sys.stderr.write(line)
+
+
+_VALID_CATEGORIES: frozenset[str] = frozenset(  # each code and its first letter
+    name for code in UNICODE_CATEGORIES for name in (code, code[0])
+)
+
+
+def _validate_allow_values(
+    categories: set[str],
+    scripts: set[str],
+) -> None:
+    """Raise ``argparse.ArgumentTypeError`` on the first unknown category/script."""
+    # Set order is arbitrary: sort (key=str tolerates non-strings) for a stable report.
+    for cat in sorted(categories, key=str):
+        if cat not in _VALID_CATEGORIES:
+            msg = (
+                f"Unknown Unicode category {cat!r}; "
+                "use --list-categories to see valid values"
+            )
+            raise argparse.ArgumentTypeError(msg)
+    for script in sorted(scripts, key=str):
+        if script not in KNOWN_SCRIPTS:
+            msg = f"Unknown script {script!r}; use --list-scripts to see valid names"
+            raise argparse.ArgumentTypeError(msg)
+
+
 def _build_allow_config(
     args: argparse.Namespace,
     config: dict[str, Any],
@@ -161,6 +209,8 @@ def _build_allow_config(
     if args.allow_script:
         scripts.update(s.title() for s in args.allow_script)
 
+    _validate_allow_values(categories, scripts)
+
     return AllowConfig(
         codepoints=frozenset(codepoints),
         ranges=tuple(ranges),
@@ -459,6 +509,11 @@ def _build_overrides(config: dict[str, Any]) -> tuple[Override, ...]:
             True if printable_val else None if "allow-printable" not in entry else False
         )
         severity: str | None = entry.get("severity")
+        if severity is not None and severity not in {"error", "warning"}:
+            msg = (
+                f"Invalid override severity {severity!r}; must be 'error' or 'warning'"
+            )
+            raise ValueError(msg)
         check_confusables: bool | None = entry.get("check-confusables")
         overrides.append(
             Override(
@@ -558,15 +613,54 @@ def _scan_files(
             global_confusables=do_confusables,
             overrides=overrides,
         )
-        file_findings = check_file(filepath, file_allow)
+        try:
+            file_text = Path(filepath).read_text(encoding="utf-8")
+        except (UnicodeDecodeError, OSError):
+            file_text = None
+        file_findings = check_file(filepath, file_allow, text=file_text)
         if file_confusables:
-            file_findings.extend(check_confusables(filepath))
+            file_findings.extend(check_confusables(filepath, text=file_text))
         if file_findings and file_severity == "error":
             has_errors = True
         findings.extend(file_findings)
     return findings, has_errors
 
 
+def _load_and_validate_config(
+    parser: argparse.ArgumentParser,
+    args: argparse.Namespace,
+) -> tuple[dict[str, Any], str, AllowConfig, bool, tuple[Override, ...]]:
+    """Load, validate and merge all configuration.
+
+    Returns (config, severity, allow, do_confusables, overrides).
+    Calls ``parser.error`` (which exits with status 2) on invalid input.
+    """
+    try:
+        config = _load_config(args.config)
+    except (OSError, tomllib.TOMLDecodeError) as exc:
+        parser.error(f"Cannot load config: {exc}")
+
+    _warn_unknown_keys(config)
+    # A TOML severity may be any type; an unhashable one would make ``in`` raise.
+    severity = args.severity or config.get("severity", "error")
+    if not isinstance(severity, str) or severity not in _VALID_SEVERITIES:
+        parser.error(f"Invalid severity {severity!r}; must be 'error' or 'warning'")
+
+    try:
+        allow = _build_allow_config(args, config)
+    except argparse.ArgumentTypeError as exc:
+        parser.error(str(exc))
+
+    do_confusables = args.check_confusables or config.get("check-confusables", False)
+
+    try:
+        overrides = _build_overrides(config)
+    except ValueError as exc:
+        parser.error(str(exc))
+
+    return config, severity, allow, do_confusables, overrides
+
+
 def main(argv: list[str] | None = None) -> int:
     """Run the check-unicode CLI."""
     parser = _build_parser()
@@ -583,11 +677,9 @@ def main(argv: list[str] | None = None) -> int:
     if not args.files:
         parser.error("No files specified.")
 
-    config = _load_config(args.config)
-    severity = args.severity or config.get("severity", "error")
-    allow = _build_allow_config(args, config)
-    do_confusables = args.check_confusables or config.get("check-confusables", False)
-    overrides = _build_overrides(config)
+    config, severity, allow, do_confusables, overrides = _load_and_validate_config(
+        parser, args
+    )
 
     # Filter out excluded files
     exclude_patterns = _build_exclude_patterns(args, config)