Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,37 @@

## Unreleased

### Fixed

- Validate `severity` values from config files and overrides (invalid values
like `"warn"` now exit 2 instead of silently behaving as warning)
- Catch config file errors (missing file, invalid TOML) and exit 2 with a
friendly message instead of a raw traceback
- Validate `--allow-category` and `--allow-script` values; unknown names now
exit 2 with a hint to use `--list-categories` or `--list-scripts`
- Warn on unrecognised top-level config keys (e.g. typo `alow-codepoints`)
- Remove dead `U+FFFD` entry from `REPLACEMENT_TABLE` (unreachable because
U+FFFD is in `DANGEROUS_INVISIBLE`, which is checked first)
- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source
files)

### Changed

- Refactor `_apply_replacements` to use `str.translate()` for cleaner code and
better performance on large files
- Read each file once when `--check-confusables` is enabled (previously
`check_file` and `check_confusables` each read the file independently)
- Simplify `_parse_codepoint` to use prefix-stripping instead of fragile
double-replace chain
- Add `slots=True` to `Override` dataclass for consistency with `Finding` and
`AllowConfig`

### Docs

- Document per-file `[[tool.check-unicode.overrides]]` in README and man page
- Update man page version to 0.4.0 and fix stale pre-commit `rev`
- Add man page to `bump-my-version` files list

### Fixed

- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source
files)

## 0.4.0 - 2026-02-28

### Added
Expand Down
2 changes: 0 additions & 2 deletions src/check_unicode/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,4 @@
0x3000: " ", # IDEOGRAPHIC SPACE
# Ellipsis
0x2026: "...", # HORIZONTAL ELLIPSIS
# Replacement character
0xFFFD: "?", # REPLACEMENT CHARACTER
}
79 changes: 45 additions & 34 deletions src/check_unicode/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import re
import unicodedata
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

Expand Down Expand Up @@ -61,50 +62,58 @@ def _is_allowed(cp: int, cat: str, allow: AllowConfig) -> bool:
return True
if cp in DANGEROUS_INVISIBLE:
return False
if allow.printable and chr(cp).isprintable():
return True
if allow.scripts and script_of(cp) in allow.scripts:

ch = chr(cp)
if allow.printable and ch.isprintable():
return True
if any(lo <= cp <= hi for lo, hi in allow.ranges):

if allow.scripts:
script = script_of(cp)
if script in allow.scripts:
return True

if allow.ranges and any(lo <= cp <= hi for lo, hi in allow.ranges):
return True
return any(cat.startswith(prefix) for prefix in allow.categories)

return bool(allow.categories) and any(
cat.startswith(prefix) for prefix in allow.categories
)


def _char_name(cp: int) -> str:
try:
return unicodedata.name(chr(cp))
except ValueError:
return f"U+{cp:04X}"
return unicodedata.name(chr(cp), f"U+{cp:04X}")


def check_file(
path: str | Path,
allow: AllowConfig | None = None,
*,
text: str | None = None,
) -> list[Finding]:
"""Scan a file for non-ASCII characters, returning findings."""
if allow is None:
allow = AllowConfig()
filepath = str(path)
try:
text = Path(path).read_text(encoding="utf-8")
except (UnicodeDecodeError, OSError) as exc:
# Graceful handling of binary / unreadable files
return [
Finding(
file=filepath,
line=0,
col=0,
char="",
codepoint=0,
name=f"Could not read file: {exc}",
category="",
dangerous=False,
)
]

if text is None:
try:
text = Path(path).read_text(encoding="utf-8")
except (UnicodeDecodeError, OSError) as exc:
return [
Finding(
file=filepath,
line=0,
col=0,
char="",
codepoint=0,
name=f"Could not read file: {exc}",
category="",
dangerous=False,
)
]

findings: list[Finding] = []
lines = text.splitlines()
for lineno, line in enumerate(lines, start=1):
for lineno, line in enumerate(text.splitlines(), start=1):
for m in _ASCII_SAFE.finditer(line):
col = m.start() + 1 # 1-indexed
char = m.group()
Expand Down Expand Up @@ -152,9 +161,7 @@ def _check_line_confusables(
return []

# Count scripts to find dominant.
script_counts: dict[str, int] = {}
for _, _, script in letters:
script_counts[script] = script_counts.get(script, 0) + 1
script_counts = Counter(script for _, _, script in letters)

if len(script_counts) < 2: # noqa: PLR2004
return [] # single script, no confusable risk
Expand Down Expand Up @@ -189,6 +196,8 @@ def _check_line_confusables(

def check_confusables(
path: str | Path,
*,
text: str | None = None,
) -> list[Finding]:
"""Detect mixed-script homoglyph/confusable characters in a file.

Expand All @@ -201,10 +210,12 @@ def check_confusables(
--allow-script does NOT suppress confusable warnings.
"""
filepath = str(path)
try:
text = Path(path).read_text(encoding="utf-8")
except (UnicodeDecodeError, OSError):
return []

if text is None:
try:
text = Path(path).read_text(encoding="utf-8")
except (UnicodeDecodeError, OSError):
return []

findings: list[Finding] = []
for lineno, line in enumerate(text.splitlines(), start=1):
Expand Down
16 changes: 6 additions & 10 deletions src/check_unicode/fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@

from check_unicode.categories import DANGEROUS_INVISIBLE, REPLACEMENT_TABLE

# Table handed to str.translate(): the REPLACEMENT_TABLE entries whose
# codepoint is not listed in DANGEROUS_INVISIBLE. Built once at import time.
_TRANSLATE_TABLE: dict[int, str] = {
    codepoint: ascii_text
    for codepoint, ascii_text in REPLACEMENT_TABLE.items()
    if codepoint not in DANGEROUS_INVISIBLE
}


def fix_file(path: str | Path) -> bool:
"""Replace fixable Unicode characters in a file with ASCII equivalents.
Expand Down Expand Up @@ -55,13 +60,4 @@ def _apply_replacements(text: str) -> str:

Skips dangerous invisible characters -- those are never auto-fixed.
"""
out: list[str] = []
for ch in text:
cp = ord(ch)
if cp in DANGEROUS_INVISIBLE:
out.append(ch)
elif cp in REPLACEMENT_TABLE:
out.append(REPLACEMENT_TABLE[cp])
else:
out.append(ch)
return "".join(out)
return text.translate(_TRANSLATE_TABLE)
116 changes: 104 additions & 12 deletions src/check_unicode/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
}


@dataclass(frozen=True)
@dataclass(frozen=True, slots=True)
class Override:
"""Per-file override from [[tool.check-unicode.overrides]]."""

Expand All @@ -72,10 +72,10 @@ class Override:
def _parse_codepoint(s: str) -> int:
"""Parse 'U+XXXX' or '0xXXXX' into an integer codepoint."""
s = s.strip()
if s.upper().startswith("U+"):
return int(s.replace("U+", "0x", 1).replace("u+", "0x", 1), 0)
if s.lower().startswith("0x"):
return int(s, 0)
for prefix in ("U+", "u+", "0x", "0X"):
if s.startswith(prefix):
s = s[len(prefix) :]
break
return int(s, 16)


Expand Down Expand Up @@ -139,6 +139,54 @@ def _allow_from_config(
return codepoints, ranges, categories, printable, scripts


_VALID_SEVERITIES: frozenset[str] = frozenset({"error", "warning"})

_KNOWN_CONFIG_KEYS: frozenset[str] = frozenset(
{
"allow-codepoints",
"allow-ranges",
"allow-categories",
"allow-printable",
"allow-scripts",
"check-confusables",
"severity",
"exclude-patterns",
"overrides",
}
)


def _warn_unknown_keys(config: dict[str, Any]) -> None:
    """Write one stderr warning per top-level key not in ``_KNOWN_CONFIG_KEYS``.

    Keys are reported in sorted order. Nothing is raised and *config* is not
    modified.
    """
    unrecognised = sorted(name for name in config if name not in _KNOWN_CONFIG_KEYS)
    for name in unrecognised:
        sys.stderr.write(f"warning: unknown config key {name!r}\n")


# Accepted category names: every entry of UNICODE_CATEGORIES plus the first
# character of each entry.
_VALID_CATEGORIES: frozenset[str] = frozenset(UNICODE_CATEGORIES).union(
    name[0] for name in UNICODE_CATEGORIES
)


def _validate_allow_values(
    categories: set[str],
    scripts: set[str],
) -> None:
    """Raise ``argparse.ArgumentTypeError`` for invalid categories or scripts.

    Categories are checked before scripts, and each collection is walked in
    sorted order so that the value named in the error is the same on every
    run (plain set iteration order varies with string hash randomisation).

    Raises:
        argparse.ArgumentTypeError: On the first category not in
            ``_VALID_CATEGORIES`` or the first script not in ``KNOWN_SCRIPTS``.
    """
    # key=str keeps the ordering well-defined even if a config file supplied
    # a non-string value.
    for cat in sorted(categories, key=str):
        if cat not in _VALID_CATEGORIES:
            msg = (
                f"Unknown Unicode category {cat!r}; "
                "use --list-categories to see valid values"
            )
            raise argparse.ArgumentTypeError(msg)

    for script in sorted(scripts, key=str):
        if script not in KNOWN_SCRIPTS:
            msg = f"Unknown script {script!r}; use --list-scripts to see valid names"
            raise argparse.ArgumentTypeError(msg)


def _build_allow_config(
args: argparse.Namespace,
config: dict[str, Any],
Expand All @@ -161,6 +209,8 @@ def _build_allow_config(
if args.allow_script:
scripts.update(s.title() for s in args.allow_script)

_validate_allow_values(categories, scripts)

return AllowConfig(
codepoints=frozenset(codepoints),
ranges=tuple(ranges),
Expand Down Expand Up @@ -459,6 +509,11 @@ def _build_overrides(config: dict[str, Any]) -> tuple[Override, ...]:
True if printable_val else None if "allow-printable" not in entry else False
)
severity: str | None = entry.get("severity")
if severity is not None and severity not in {"error", "warning"}:
msg = (
f"Invalid override severity {severity!r}; must be 'error' or 'warning'"
)
raise ValueError(msg)
check_confusables: bool | None = entry.get("check-confusables")
overrides.append(
Override(
Expand Down Expand Up @@ -558,15 +613,54 @@ def _scan_files(
global_confusables=do_confusables,
overrides=overrides,
)
file_findings = check_file(filepath, file_allow)
try:
file_text = Path(filepath).read_text(encoding="utf-8")
except (UnicodeDecodeError, OSError):
file_text = None
file_findings = check_file(filepath, file_allow, text=file_text)
if file_confusables:
file_findings.extend(check_confusables(filepath))
file_findings.extend(check_confusables(filepath, text=file_text))
if file_findings and file_severity == "error":
has_errors = True
findings.extend(file_findings)
return findings, has_errors


def _load_and_validate_config(
    parser: argparse.ArgumentParser,
    args: argparse.Namespace,
) -> tuple[dict[str, Any], str, AllowConfig, bool, tuple[Override, ...]]:
    """Load, validate and merge all configuration.

    Command-line values take precedence over the config file for ``severity``
    and ``check-confusables``.

    Returns (config, severity, allow, do_confusables, overrides).
    Calls ``parser.error`` on invalid input: an unreadable or malformed config
    file, an invalid severity, an invalid allow value, or an invalid override.
    """
    try:
        config = _load_config(args.config)
    except (OSError, tomllib.TOMLDecodeError) as exc:
        parser.error(f"Cannot load config: {exc}")

    _warn_unknown_keys(config)

    severity = args.severity or config.get("severity", "error")
    # Check the type first: a TOML array or table is unhashable, so testing
    # it for membership in a frozenset would raise TypeError instead of
    # producing the friendly error below.
    if not isinstance(severity, str) or severity not in _VALID_SEVERITIES:
        parser.error(f"Invalid severity {severity!r}; must be 'error' or 'warning'")

    try:
        allow = _build_allow_config(args, config)
    except argparse.ArgumentTypeError as exc:
        parser.error(str(exc))

    do_confusables = args.check_confusables or config.get("check-confusables", False)

    try:
        overrides = _build_overrides(config)
    except ValueError as exc:
        parser.error(str(exc))

    return config, severity, allow, do_confusables, overrides


def main(argv: list[str] | None = None) -> int:
"""Run the check-unicode CLI."""
parser = _build_parser()
Expand All @@ -583,11 +677,9 @@ def main(argv: list[str] | None = None) -> int:
if not args.files:
parser.error("No files specified.")

config = _load_config(args.config)
severity = args.severity or config.get("severity", "error")
allow = _build_allow_config(args, config)
do_confusables = args.check_confusables or config.get("check-confusables", False)
overrides = _build_overrides(config)
config, severity, allow, do_confusables, overrides = _load_and_validate_config(
parser, args
)

# Filter out excluded files
exclude_patterns = _build_exclude_patterns(args, config)
Expand Down
Loading