diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ba7db8..9c631bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,17 +2,37 @@ ## Unreleased +### Fixed + +- Validate `severity` values from config files and overrides (invalid values + like `"warn"` now exit 2 instead of silently behaving as warning) +- Catch config file errors (missing file, invalid TOML) and exit 2 with a + friendly message instead of a raw traceback +- Validate `--allow-category` and `--allow-script` values; unknown names now + exit 2 with a hint to use `--list-categories` or `--list-scripts` +- Warn on unrecognised top-level config keys (e.g. typo `alow-codepoints`) +- Remove dead `U+FFFD` entry from `REPLACEMENT_TABLE` (unreachable because + U+FFFD is in `DANGEROUS_INVISIBLE`, which is checked first) +- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source + files) + +### Changed + +- Refactor `_apply_replacements` to use `str.translate()` for cleaner code and + better performance on large files +- Read each file once when `--check-confusables` is enabled (previously + `check_file` and `check_confusables` each read the file independently) +- Simplify `_parse_codepoint` to use prefix-stripping instead of fragile + double-replace chain +- Add `slots=True` to `Override` dataclass for consistency with `Finding` and + `AllowConfig` + ### Docs - Document per-file `[[tool.check-unicode.overrides]]` in README and man page - Update man page version to 0.4.0 and fix stale pre-commit `rev` - Add man page to `bump-my-version` files list -### Fixed - -- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source - files) - ## 0.4.0 - 2026-02-28 ### Added diff --git a/src/check_unicode/categories.py b/src/check_unicode/categories.py index 6f67cfa..8b7d257 100644 --- a/src/check_unicode/categories.py +++ b/src/check_unicode/categories.py @@ -57,6 +57,4 @@ 0x3000: " ", # IDEOGRAPHIC SPACE # Ellipsis 0x2026: "...", # HORIZONTAL ELLIPSIS - # Replacement character - 0xFFFD: "?", # REPLACEMENT CHARACTER } diff --git a/src/check_unicode/checker.py b/src/check_unicode/checker.py index 854481b..4a392c9 100644 --- a/src/check_unicode/checker.py +++ b/src/check_unicode/checker.py @@ -4,6 +4,7 @@ import re import unicodedata +from collections import Counter from dataclasses import dataclass from pathlib import Path @@ -61,50 +62,58 @@ def _is_allowed(cp: int, cat: str, allow: AllowConfig) -> bool: return True if cp in DANGEROUS_INVISIBLE: return False - if allow.printable and chr(cp).isprintable(): - return True - if allow.scripts and script_of(cp) in allow.scripts: + + ch = chr(cp) + if allow.printable and ch.isprintable(): return True - if any(lo <= cp <= hi for lo, hi in allow.ranges): + + if allow.scripts: + script = script_of(cp) + if script in allow.scripts: + return True + + if allow.ranges and any(lo <= cp <= hi for lo, hi in allow.ranges): return True - return any(cat.startswith(prefix) for prefix in allow.categories) + + return bool(allow.categories) and any( + cat.startswith(prefix) for prefix in allow.categories + ) def _char_name(cp: int) -> str: - try: - return unicodedata.name(chr(cp)) - except ValueError: - return f"U+{cp:04X}" + return unicodedata.name(chr(cp), f"U+{cp:04X}") def check_file( path: str | Path, allow: AllowConfig | None = None, + *, + text: str | None = None, ) -> list[Finding]: """Scan a file for non-ASCII characters, returning findings.""" if allow is None: allow = AllowConfig() filepath = str(path) - try: - text = Path(path).read_text(encoding="utf-8") - except (UnicodeDecodeError, OSError) as exc: - # Graceful handling of binary / unreadable files - return [ - Finding( - file=filepath, - line=0, - col=0, - char="", - codepoint=0, - name=f"Could not read file: {exc}", - category="", - dangerous=False, - ) - ] + + if text is None: + try: + text = Path(path).read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError) as exc: + return [ + Finding( + file=filepath, + line=0, + col=0, + char="", + codepoint=0, + name=f"Could not read file: {exc}", + category="", + dangerous=False, + ) + ] findings: list[Finding] = [] - lines = text.splitlines() - for lineno, line in enumerate(lines, start=1): + for lineno, line in enumerate(text.splitlines(), start=1): for m in _ASCII_SAFE.finditer(line): col = m.start() + 1 # 1-indexed char = m.group() @@ -152,9 +161,7 @@ def _check_line_confusables( return [] # Count scripts to find dominant. - script_counts: dict[str, int] = {} - for _, _, script in letters: - script_counts[script] = script_counts.get(script, 0) + 1 + script_counts = Counter(script for _, _, script in letters) if len(script_counts) < 2: # noqa: PLR2004 return [] # single script, no confusable risk @@ -189,6 +196,8 @@ def _check_line_confusables( def check_confusables( path: str | Path, + *, + text: str | None = None, ) -> list[Finding]: """Detect mixed-script homoglyph/confusable characters in a file. @@ -201,10 +210,12 @@ def check_confusables( --allow-script does NOT suppress confusable warnings. """ filepath = str(path) - try: - text = Path(path).read_text(encoding="utf-8") - except (UnicodeDecodeError, OSError): - return [] + + if text is None: + try: + text = Path(path).read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError): + return [] findings: list[Finding] = [] for lineno, line in enumerate(text.splitlines(), start=1): diff --git a/src/check_unicode/fixer.py b/src/check_unicode/fixer.py index 49a7a8b..43c5dda 100644 --- a/src/check_unicode/fixer.py +++ b/src/check_unicode/fixer.py @@ -9,6 +9,11 @@ from check_unicode.categories import DANGEROUS_INVISIBLE, REPLACEMENT_TABLE +# Pre-built translation table: all REPLACEMENT_TABLE entries that are NOT dangerous. +_TRANSLATE_TABLE: dict[int, str] = { + cp: repl for cp, repl in REPLACEMENT_TABLE.items() if cp not in DANGEROUS_INVISIBLE +} + def fix_file(path: str | Path) -> bool: """Replace fixable Unicode characters in a file with ASCII equivalents. @@ -55,13 +60,4 @@ def _apply_replacements(text: str) -> str: Skips dangerous invisible characters -- those are never auto-fixed. """ - out: list[str] = [] - for ch in text: - cp = ord(ch) - if cp in DANGEROUS_INVISIBLE: - out.append(ch) - elif cp in REPLACEMENT_TABLE: - out.append(REPLACEMENT_TABLE[cp]) - else: - out.append(ch) - return "".join(out) + return text.translate(_TRANSLATE_TABLE) diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py index 474d440..8c0ac89 100644 --- a/src/check_unicode/main.py +++ b/src/check_unicode/main.py @@ -55,7 +55,7 @@ } -@dataclass(frozen=True) +@dataclass(frozen=True, slots=True) class Override: """Per-file override from [[tool.check-unicode.overrides]].""" @@ -72,10 +72,10 @@ class Override: def _parse_codepoint(s: str) -> int: """Parse 'U+XXXX' or '0xXXXX' into an integer codepoint.""" s = s.strip() - if s.upper().startswith("U+"): - return int(s.replace("U+", "0x", 1).replace("u+", "0x", 1), 0) - if s.lower().startswith("0x"): - return int(s, 0) + for prefix in ("U+", "u+", "0x", "0X"): + if s.startswith(prefix): + s = s[len(prefix) :] + break return int(s, 16) @@ -139,6 +139,54 @@ def _allow_from_config( return codepoints, ranges, categories, printable, scripts +_VALID_SEVERITIES: frozenset[str] = frozenset({"error", "warning"}) + +_KNOWN_CONFIG_KEYS: frozenset[str] = frozenset( + { + "allow-codepoints", + "allow-ranges", + "allow-categories", + "allow-printable", + "allow-scripts", + "check-confusables", + "severity", + "exclude-patterns", + "overrides", + } +) + + +def _warn_unknown_keys(config: dict[str, Any]) -> None: + """Print warnings for unrecognised top-level config keys.""" + unknown = set(config) - _KNOWN_CONFIG_KEYS + for key in sorted(unknown): + sys.stderr.write(f"warning: unknown config key {key!r}\n") + + +_VALID_CATEGORIES: frozenset[str] = frozenset(UNICODE_CATEGORIES) | frozenset( + k[0] for k in UNICODE_CATEGORIES +) + + +def _validate_allow_values( + categories: set[str], + scripts: set[str], +) -> None: + """Raise ``argparse.ArgumentTypeError`` for invalid categories or scripts.""" + for cat in categories: + if cat not in _VALID_CATEGORIES: + msg = ( + f"Unknown Unicode category {cat!r}; " + "use --list-categories to see valid values" + ) + raise argparse.ArgumentTypeError(msg) + + for script in scripts: + if script not in KNOWN_SCRIPTS: + msg = f"Unknown script {script!r}; use --list-scripts to see valid names" + raise argparse.ArgumentTypeError(msg) + + def _build_allow_config( args: argparse.Namespace, config: dict[str, Any], @@ -161,6 +209,8 @@ def _build_allow_config( if args.allow_script: scripts.update(s.title() for s in args.allow_script) + _validate_allow_values(categories, scripts) + return AllowConfig( codepoints=frozenset(codepoints), ranges=tuple(ranges), @@ -459,6 +509,11 @@ def _build_overrides(config: dict[str, Any]) -> tuple[Override, ...]: True if printable_val else None if "allow-printable" not in entry else False ) severity: str | None = entry.get("severity") + if severity is not None and severity not in {"error", "warning"}: + msg = ( + f"Invalid override severity {severity!r}; must be 'error' or 'warning'" + ) + raise ValueError(msg) check_confusables: bool | None = entry.get("check-confusables") overrides.append( Override( @@ -558,15 +613,54 @@ def _scan_files( global_confusables=do_confusables, overrides=overrides, ) - file_findings = check_file(filepath, file_allow) + try: + file_text = Path(filepath).read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError): + file_text = None + file_findings = check_file(filepath, file_allow, text=file_text) if file_confusables: - file_findings.extend(check_confusables(filepath)) + file_findings.extend(check_confusables(filepath, text=file_text)) if file_findings and file_severity == "error": has_errors = True findings.extend(file_findings) return findings, has_errors +def _load_and_validate_config( + parser: argparse.ArgumentParser, + args: argparse.Namespace, +) -> tuple[dict[str, Any], str, AllowConfig, bool, tuple[Override, ...]]: + """Load, validate and merge all configuration. + + Returns (config, severity, allow, do_confusables, overrides). + Calls ``parser.error`` on invalid input. + """ + try: + config = _load_config(args.config) + except (OSError, tomllib.TOMLDecodeError) as exc: + parser.error(f"Cannot load config: {exc}") + + _warn_unknown_keys(config) + + severity = args.severity or config.get("severity", "error") + if severity not in _VALID_SEVERITIES: + parser.error(f"Invalid severity {severity!r}; must be 'error' or 'warning'") + + try: + allow = _build_allow_config(args, config) + except argparse.ArgumentTypeError as exc: + parser.error(str(exc)) + + do_confusables = args.check_confusables or config.get("check-confusables", False) + + try: + overrides = _build_overrides(config) + except ValueError as exc: + parser.error(str(exc)) + + return config, severity, allow, do_confusables, overrides + + def main(argv: list[str] | None = None) -> int: """Run the check-unicode CLI.""" parser = _build_parser() @@ -583,11 +677,9 @@ def main(argv: list[str] | None = None) -> int: if not args.files: parser.error("No files specified.") - config = _load_config(args.config) - severity = args.severity or config.get("severity", "error") - allow = _build_allow_config(args, config) - do_confusables = args.check_confusables or config.get("check-confusables", False) - overrides = _build_overrides(config) + config, severity, allow, do_confusables, overrides = _load_and_validate_config( + parser, args + ) # Filter out excluded files exclude_patterns = _build_exclude_patterns(args, config) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0fb3a92..c0a9a12 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -874,3 +874,96 @@ def test_global_severity_warning_still_works(self, tmp_path: Path) -> None: # Global severity is warning -> exit 0 even for unmatched files assert main(["--config", str(config), str(py_file)]) == 0 + + +class TestSeverityValidation: + """Tests for severity value validation.""" + + def test_invalid_severity_in_config(self, tmp_path: Path) -> None: + """Invalid severity in config file causes exit code 2.""" + config = tmp_path / "config.toml" + config.write_text('severity = "warn"\n', encoding="utf-8") + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + with pytest.raises(SystemExit) as exc_info: + main(["--config", str(config), str(f)]) + assert exc_info.value.code == 2 + + def test_invalid_severity_in_override(self, tmp_path: Path) -> None: + """Invalid severity in override causes exit code 2.""" + config = tmp_path / "config.toml" + config.write_text( + '[[overrides]]\nfiles = ["*"]\nseverity = "warn"\n', + encoding="utf-8", + ) + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + with pytest.raises(SystemExit) as exc_info: + main(["--config", str(config), str(f)]) + assert exc_info.value.code == 2 + + +class TestConfigErrorHandling: + """Tests for config file error handling.""" + + def test_missing_config_file(self, tmp_path: Path) -> None: + """Missing config file causes exit 2 with friendly message.""" + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + with pytest.raises(SystemExit) as exc_info: + main(["--config", "/nonexistent/config.toml", str(f)]) + assert exc_info.value.code == 2 + + def test_invalid_toml(self, tmp_path: Path) -> None: + """Invalid TOML causes exit 2 with friendly message.""" + config = tmp_path / "bad.toml" + config.write_text("this is not valid toml [[[", encoding="utf-8") + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + with pytest.raises(SystemExit) as exc_info: + main(["--config", str(config), str(f)]) + assert exc_info.value.code == 2 + + +class TestUnknownConfigKeys: + """Tests for unknown config key warnings.""" + + def test_unknown_key_warns( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Unknown config keys produce a warning on stderr.""" + config = tmp_path / "config.toml" + config.write_text('alow-codepoints = ["U+00B0"]\n', encoding="utf-8") + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + main(["--config", str(config), str(f)]) + err = capsys.readouterr().err + assert "unknown config key" in err + assert "alow-codepoints" in err + + +class TestAllowValueValidation: + """Tests for --allow-category and --allow-script validation.""" + + def test_invalid_category(self, tmp_path: Path) -> None: + """Invalid category name causes exit 2.""" + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + with pytest.raises(SystemExit) as exc_info: + main(["--allow-category", "Foo", str(f)]) + assert exc_info.value.code == 2 + + def test_valid_major_category(self, tmp_path: Path) -> None: + """Major category prefix (single letter) is accepted.""" + f = tmp_path / "test.txt" + f.write_text("\u00a9\n", encoding="utf-8") # copyright sign, category So + # "S" should allow all Symbol categories + assert main(["--allow-category", "S", str(f)]) == 0 + + def test_invalid_script(self, tmp_path: Path) -> None: + """Invalid script name causes exit 2.""" + f = tmp_path / "test.txt" + f.write_text("hello\n", encoding="utf-8") + with pytest.raises(SystemExit) as exc_info: + main(["--allow-script", "Klingon", str(f)]) + assert exc_info.value.code == 2