diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a58455..2600dd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,23 @@ ## Unreleased +### Added + +- Pipe mode: `check-unicode -` reads stdin line-by-line and writes to stdout, + enabling use as a streaming Unix filter for log monitoring, CI pipelines, and + editor buffer filtering +- `--strip [dangerous|all]` flag to remove non-ASCII characters; `dangerous` + strips only invisible/bidi chars, `all` (default) strips any remaining + non-ASCII after allow-list processing +- `--halt [dangerous|all]` flag to stop immediately on first matching character; + `dangerous` (default) halts on invisible/bidi chars, `all` halts on any + non-ASCII +- `--fix`, `--strip`, and `--halt` are fully composable and work identically + across file and pipe modes + ### Changed +- Add `pytest-sugar` for improved test output - Replace mypy with [ty](https://github.com/astral-sh/ty) for type checking - Move dev dependencies from `optional-dependencies` to `dependency-groups` - Switch CI from pip to uv for faster, reproducible installs; check in `uv.lock` diff --git a/docs/check-unicode.1 b/docs/check-unicode.1 index 24d4da3..23aa6aa 100644 --- a/docs/check-unicode.1 +++ b/docs/check-unicode.1 @@ -33,8 +33,12 @@ hook but also works as a standalone CLI tool. .TP .I FILE ... One or more files to check. -At least one file is required; the program exits with code\ 2 if none are -provided. +Use +.B \- +to read from stdin and write to stdout (pipe mode). +At least one file or +.B \- +is required; the program exits with code\ 2 if none are provided. . .SH OPTIONS .SS Mode @@ -45,6 +49,32 @@ with their ASCII equivalents using an atomic write (temp file + rename). Exits\ 1 if any file was modified. Dangerous invisible characters are never auto\-fixed. .TP +.BI \-\-strip " [LEVEL]" +Remove non\-ASCII characters from output. +.I LEVEL +is +.B dangerous +(only invisible/bidi characters) or +.B all +(any remaining non\-ASCII after allow\-list processing). +Default: +.BR all . +Respects allow\-lists. +In file mode, modifies files in\-place; in pipe mode, writes stripped +output to stdout. +.TP +.BI \-\-halt " [LEVEL]" +Stop immediately on the first character matching the level. +.I LEVEL +is +.B dangerous +or +.BR all . +Default: +.BR dangerous . +Reports the triggering finding on stderr and exits\ 1. +The triggering file is never modified. +.TP .BR \-V ", " \-\-version Print the program version and exit. . @@ -243,13 +273,44 @@ severity is or per\-file via overrides). .TP .B 1 -Non\-ASCII findings were detected, or files were modified in -.B \-\-fix -mode. +Non\-ASCII findings were detected, files were modified by +.BR \-\-fix / \-\-strip , +or +.B \-\-halt +was triggered. .TP .B 2 Usage error (invalid arguments, no files specified, etc.). . +.SH PIPE MODE +When +.B \- +is given as the sole file argument, +.B check\-unicode +reads from stdin and writes to stdout, acting as a streaming Unix filter. +.PP +Input is processed line\-by\-line. +For each line, findings are emitted to stderr immediately with full +context display (rendered source line, caret markers, codepoint details). +The processed line is written to stdout and flushed so downstream +consumers see output in real time. +.PP +.BR \-\-fix ", " \-\-strip ", and " \-\-halt +all work in pipe mode. +The processing order per character is: +allow\-list check, then +.BR \-\-fix , +then +.BR \-\-halt , +then +.BR \-\-strip . +.PP +When +.B \-\-halt +triggers, the current line is not written to stdout and the program +exits immediately. +Lines already flushed remain in the output. +. .SH WHAT IT CATCHES .SS Copy\-paste artifacts (fixable with \-\-fix) .TP @@ -360,6 +421,36 @@ List all valid Unicode general categories: .B check\-unicode \-\-list\-categories .RE .PP +Read from stdin, write to stdout (pipe mode): +.PP +.RS +.B check\-unicode \- < file.txt +.RE +.PP +Fix smart quotes and strip dangerous chars from a pipe: +.PP +.RS +.B check\-unicode \-\-fix \-\-strip dangerous \- < file.txt +.RE +.PP +Halt on first dangerous character in stdin: +.PP +.RS +.B check\-unicode \-\-halt \- < input.txt +.RE +.PP +Stream\-filter logs, fixing and stripping bidi attacks: +.PP +.RS +.B tail \-f app.log | check\-unicode \-\-fix \-\-strip dangerous \- +.RE +.PP +Strip all non\-ASCII from files in\-place: +.PP +.RS +.B check\-unicode \-\-strip all src/ +.RE +.PP Use with pre\-commit: .PP .nf diff --git a/pyproject.toml b/pyproject.toml index a89fc06..32dd134 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dev = [ "bump-my-version", "pytest", "pytest-cov", + "pytest-sugar>=1.1.1", "ruff", "ty", ] diff --git a/src/check_unicode/checker.py b/src/check_unicode/checker.py index 4a392c9..f4aab17 100644 --- a/src/check_unicode/checker.py +++ b/src/check_unicode/checker.py @@ -163,8 +163,8 @@ def _check_line_confusables( # Count scripts to find dominant. script_counts = Counter(script for _, _, script in letters) - if len(script_counts) < 2: # noqa: PLR2004 - return [] # single script, no confusable risk + if len(script_counts) <= 1: + return [] # Dominant script: highest count, tie-break to Latin. max_count = max(script_counts.values()) diff --git a/src/check_unicode/fixer.py b/src/check_unicode/fixer.py index 43c5dda..fdce1e9 100644 --- a/src/check_unicode/fixer.py +++ b/src/check_unicode/fixer.py @@ -4,10 +4,19 @@ import contextlib import os +import re import tempfile +import unicodedata from pathlib import Path +from typing import TYPE_CHECKING from check_unicode.categories import DANGEROUS_INVISIBLE, REPLACEMENT_TABLE +from check_unicode.scripts import script_of + +if TYPE_CHECKING: + from check_unicode.checker import AllowConfig + +_NON_ASCII = re.compile(r"[^\t\r\n\x20-\x7E]") # Pre-built translation table: all REPLACEMENT_TABLE entries that are NOT dangerous. _TRANSLATE_TABLE: dict[int, str] = { @@ -15,26 +24,29 @@ } -def fix_file(path: str | Path) -> bool: - """Replace fixable Unicode characters in a file with ASCII equivalents. - - Dangerous invisible characters are never auto-fixed. - Uses atomic write (temp file + rename) to avoid data loss. +def _is_strip_allowed(cp: int, allow: AllowConfig) -> bool: + """Return True if codepoint is exempted from stripping by the allow-list. - Returns True if the file was modified. + Evaluation order matches checker._is_allowed: explicit codepoints are + checked first (can exempt even dangerous chars), then dangerous chars + are blocked, then printable/script/range/category checks. """ - filepath = Path(path) - try: - original = filepath.read_text(encoding="utf-8") - orig_mode = filepath.stat().st_mode - except (UnicodeDecodeError, OSError): + if cp in allow.codepoints: + return True + if cp in DANGEROUS_INVISIBLE: return False + ch = chr(cp) + cat = unicodedata.category(ch) + return ( + (allow.printable and ch.isprintable()) + or (bool(allow.scripts) and script_of(cp) in allow.scripts) + or (bool(allow.ranges) and any(lo <= cp <= hi for lo, hi in allow.ranges)) + or (bool(allow.categories) and any(cat.startswith(p) for p in allow.categories)) + ) - fixed = _apply_replacements(original) - if fixed == original: - return False - # Atomic write: write to temp file in same directory, then rename +def _atomic_write(filepath: Path, content: str, orig_mode: int) -> None: + """Write *content* to *filepath* atomically, preserving *orig_mode*.""" fd, tmp_path_str = tempfile.mkstemp( dir=filepath.parent, prefix=f".{filepath.name}.", @@ -43,21 +55,90 @@ def fix_file(path: str | Path) -> bool: tmp_path = Path(tmp_path_str) try: with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write(fixed) - # Preserve original file permissions + f.write(content) tmp_path.chmod(orig_mode) tmp_path.replace(filepath) except BaseException: - # Clean up temp file on any failure with contextlib.suppress(OSError): tmp_path.unlink() raise + + +def strip_text( + text: str, + *, + level: str = "all", + allow: AllowConfig | None = None, +) -> str: + """Remove non-ASCII characters from text based on strip level. + + level="all": remove all non-ASCII characters (except allowed). + level="dangerous": remove only DANGEROUS_INVISIBLE characters (except allowed). + """ + + def _should_strip(ch: str) -> bool: + cp = ord(ch) + if allow is not None and _is_strip_allowed(cp, allow): + return False + if level == "dangerous": + return cp in DANGEROUS_INVISIBLE + return True + + return _NON_ASCII.sub(lambda m: "" if _should_strip(m.group()) else m.group(), text) + + +def fix_file(path: str | Path) -> bool: + """Replace fixable Unicode characters in a file with ASCII equivalents. + + Dangerous invisible characters are never auto-fixed. + Uses atomic write (temp file + rename) to avoid data loss. + + Returns True if the file was modified. + """ + filepath = Path(path) + try: + original = filepath.read_text(encoding="utf-8") + orig_mode = filepath.stat().st_mode + except (UnicodeDecodeError, OSError): + return False + + fixed = fix_text(original) + if fixed == original: + return False + + _atomic_write(filepath, fixed, orig_mode) return True -def _apply_replacements(text: str) -> str: - """Replace characters that have entries in REPLACEMENT_TABLE. +def strip_file( + path: str | Path, + *, + level: str = "all", + allow: AllowConfig | None = None, +) -> bool: + """Remove non-ASCII characters from a file based on strip level. - Skips dangerous invisible characters -- those are never auto-fixed. + Uses atomic write (temp file + rename) to avoid data loss. + Returns True if the file was modified. + """ + filepath = Path(path) + try: + original = filepath.read_text(encoding="utf-8") + orig_mode = filepath.stat().st_mode + except (UnicodeDecodeError, OSError): + return False + + stripped = strip_text(original, level=level, allow=allow) + if stripped == original: + return False + + _atomic_write(filepath, stripped, orig_mode) + return True + + +def fix_text(text: str) -> str: + """Replace fixable Unicode characters with ASCII equivalents. + + Dangerous invisible characters are never auto-fixed. """ return text.translate(_TRANSLATE_TABLE) diff --git a/src/check_unicode/main.py b/src/check_unicode/main.py index 8aa5053..41066b0 100644 --- a/src/check_unicode/main.py +++ b/src/check_unicode/main.py @@ -9,12 +9,15 @@ import tomllib from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Sequence from check_unicode import __version__ from check_unicode.checker import AllowConfig, Finding, check_confusables, check_file -from check_unicode.fixer import fix_file -from check_unicode.output import print_findings +from check_unicode.fixer import _atomic_write, fix_text, strip_text +from check_unicode.output import print_findings, print_line_findings from check_unicode.parsing import parse_codepoint, parse_range from check_unicode.scripts import KNOWN_SCRIPTS @@ -238,7 +241,56 @@ def _print_categories() -> None: write(f"\nTotal: {len(UNICODE_CATEGORIES)} categories\n") -def _build_parser() -> argparse.ArgumentParser: +_OPTIONAL_LEVEL_FLAGS: tuple[tuple[str, frozenset[str], str], ...] = ( + ("--strip", frozenset({"dangerous", "all"}), "all"), + ("--halt", frozenset({"dangerous", "all"}), "dangerous"), +) + + +def _preprocess_argv(args: list[str]) -> list[str]: + """Rewrite optional-level flags before argparse sees them. + + ``nargs='?'`` with ``choices=`` causes argparse to greedily consume the + next positional token (e.g. a filename) as the flag value, then reject it + because it is not in *choices*. We work around this by scanning the arg + list ourselves: if a level flag is followed by a valid choice we rewrite it + as ``--flag=VALUE``; if not we rewrite it as ``--flag=CONST`` so argparse + sees no separate token to consume. + """ + result: list[str] = [] + i = 0 + while i < len(args): + matched = False + for flag, valid, const in _OPTIONAL_LEVEL_FLAGS: + if args[i] == flag: + if i + 1 < len(args) and args[i + 1] in valid: + result.append(f"{flag}={args[i + 1]}") + i += 2 + else: + result.append(f"{flag}={const}") + i += 1 + matched = True + break + if not matched: + result.append(args[i]) + i += 1 + return result + + +class _CheckUnicodeParser(argparse.ArgumentParser): + """ArgumentParser that preprocesses optional-level flags.""" + + def parse_args( # type: ignore[override] + self, + args: Sequence[str] | None = None, + namespace: argparse.Namespace | None = None, + ) -> argparse.Namespace: + if args is None: + args = sys.argv[1:] + return super().parse_args(_preprocess_argv(list(args)), namespace) # ty: ignore[invalid-return-type] + + +def _build_parser() -> _CheckUnicodeParser: """Build and return the CLI argument parser.""" epilog = textwrap.dedent("""\ examples: @@ -255,6 +307,14 @@ def _build_parser() -> argparse.ArgumentParser: Warn without failing CI check-unicode --list-scripts Show all valid script names check-unicode --list-categories Show all valid category abbreviations + check-unicode - < file.txt Read stdin, write to stdout + check-unicode --fix - < file.txt Fix and write to stdout + check-unicode --fix --strip dangerous - + Fix fixable, strip bidi attacks + check-unicode --strip all src/ Strip all non-ASCII in-place + check-unicode --halt - < input.txt Halt on first dangerous char + check-unicode --fix --halt dangerous src/ + Fix files, halt on dangerous configuration: Settings can be defined in .check-unicode.toml or pyproject.toml under @@ -283,7 +343,7 @@ def _build_parser() -> argparse.ArgumentParser: copy-paste artifacts. Use --fix to auto-replace known offenders with ASCII equivalents. Dangerous characters are always flagged and never auto-fixed.""") - parser = argparse.ArgumentParser( + parser = _CheckUnicodeParser( prog="check-unicode", description=description, epilog=epilog, @@ -293,7 +353,7 @@ def _build_parser() -> argparse.ArgumentParser: "files", nargs="*", metavar="FILE", - help="files to check (one or more paths required)", + help="files to check; use - to read stdin and write to stdout", ) # Allow-list options @@ -440,6 +500,34 @@ def _build_parser() -> argparse.ArgumentParser: "changed. dangerous characters are never auto-fixed" ), ) + mode_group.add_argument( + "--strip", + nargs="?", + const="all", + default=None, + choices=["dangerous", "all"], + metavar="LEVEL", + help=( + "remove non-ASCII characters from output. " + "'all' (default) strips any remaining non-ASCII; " + "'dangerous' strips only invisible/bidi characters. " + "respects allow-lists" + ), + ) + mode_group.add_argument( + "--halt", + nargs="?", + const="dangerous", + default=None, + choices=["dangerous", "all"], + metavar="LEVEL", + help=( + "stop immediately on first character matching the level. " + "'dangerous' (default) halts on invisible/bidi characters; " + "'all' halts on any non-ASCII. " + "exits 1 and reports the triggering finding" + ), + ) mode_group.add_argument( "-V", "--version", @@ -570,40 +658,142 @@ def _resolve_file_settings( return severity, do_confusables -def _scan_files( - files: list[str], +_STDIN_NAME = "" + + +def _check_line( + line: str, + lineno: int, + allow: AllowConfig, + *, + do_confusables: bool, +) -> list[Finding]: + """Check a single line and return findings with corrected line numbers.""" + findings = check_file(_STDIN_NAME, allow, text=line) + if do_confusables: + findings.extend(check_confusables(_STDIN_NAME, text=line)) + return [ + Finding( + file=f.file, + line=lineno, + col=f.col, + char=f.char, + codepoint=f.codepoint, + name=f.name, + category=f.category, + dangerous=f.dangerous, + confusable=f.confusable, + ) + if f.line != lineno + else f + for f in findings + ] + + +def _transform_line( + line: str, + args: argparse.Namespace, + allow: AllowConfig, +) -> tuple[str, bool]: + """Apply --fix and --strip transformations; return (output, was_modified).""" + output = line + modified = False + if args.fix: + fixed = fix_text(output) + if fixed != output: + modified = True + output = fixed + if args.strip: + stripped = strip_text(output, level=args.strip, allow=allow) + if stripped != output: + modified = True + output = stripped + return output, modified + + +@dataclass(slots=True) +class _PipeCounts: + """Running counters for pipe mode summary (avoids unbounded list).""" + + total: int = 0 + fixable: int = 0 + dangerous: int = 0 + confusable: int = 0 + files: int = 1 # always 1 for stdin + + def add(self, findings: list[Finding]) -> None: + self.total += len(findings) + for f in findings: + self.fixable += f.fixable + self.dangerous += f.dangerous + self.confusable += f.confusable is not None + + +def _run_pipe( + args: argparse.Namespace, allow: AllowConfig, - overrides: tuple[Override, ...], *, do_confusables: bool, severity: str, -) -> tuple[list[Finding], bool]: - """Scan files for non-ASCII and (optionally) confusable characters. +) -> int: + """Handle pipe mode: stream stdin line-by-line to stdout.""" + counts = _PipeCounts() + any_modified = False + halted = False + + for lineno, raw_line in enumerate(sys.stdin, start=1): + line = raw_line.rstrip("\n") + has_newline = raw_line.endswith("\n") + + line_findings = _check_line(line, lineno, allow, do_confusables=do_confusables) + + if line_findings and not args.quiet: + print_line_findings( + _STDIN_NAME, + lineno, + line, + line_findings, + no_color=args.no_color, + ) + counts.add(line_findings) - Returns (findings, has_errors) where has_errors is True if any finding - came from a file whose effective severity is "error". - """ - findings: list[Finding] = [] - has_errors = False - for filepath in files: - file_allow = _resolve_allow_for_file(filepath, allow, overrides) - file_severity, file_confusables = _resolve_file_settings( - filepath, - severity, - global_confusables=do_confusables, - overrides=overrides, - ) - try: - file_text = Path(filepath).read_text(encoding="utf-8") - except (UnicodeDecodeError, OSError): - file_text = None - file_findings = check_file(filepath, file_allow, text=file_text) - if file_confusables: - file_findings.extend(check_confusables(filepath, text=file_text)) - if file_findings and file_severity == "error": - has_errors = True - findings.extend(file_findings) - return findings, has_errors + if args.halt and _findings_match_level(line_findings, args.halt): + halted = True + break + + output_line, modified = _transform_line(line, args, allow) + if modified: + any_modified = True + + sys.stdout.write(output_line + ("\n" if has_newline else "")) + sys.stdout.flush() + + if counts.total: + _print_pipe_summary(counts) + + if halted or any_modified: + return 1 + if counts.total and severity == "error": + return 1 + return 0 + + +def _print_pipe_summary(counts: _PipeCounts) -> None: + """Print summary line for pipe mode from running counters.""" + parts = [ + f"Found {counts.total} non-ASCII character{'s' if counts.total != 1 else ''}" + ] + parts.append(f"in {counts.files} file{'s' if counts.files != 1 else ''}") + extras = [] + if counts.fixable: + extras.append(f"{counts.fixable} fixable") + if counts.dangerous: + extras.append(f"{counts.dangerous} dangerous") + if counts.confusable: + extras.append(f"{counts.confusable} confusable") + if extras: + parts.append(f"({', '.join(extras)})") + sys.stderr.write(" ".join(parts) + "\n") def _load_and_validate_config( @@ -641,17 +831,87 @@ def _load_and_validate_config( return config, severity, allow, do_confusables, overrides +@dataclass(frozen=True, slots=True) +class _ScanConfig: + """Bundled scan configuration passed to _process_files.""" + + allow: AllowConfig + overrides: tuple[Override, ...] + severity: str + do_confusables: bool + + +def _findings_match_level(findings: list[Finding], level: str) -> bool: + """Check if any finding matches the halt/strip level.""" + if level == "dangerous": + return any(f.dangerous for f in findings) + return bool(findings) + + +def _process_files( + files: list[str], + args: argparse.Namespace, + cfg: _ScanConfig, +) -> tuple[list[Finding], bool, bool, bool]: + """Process files one at a time, respecting --halt, --fix, and --strip. + + Returns (all_findings, has_errors, any_modified, halted) where + ``halted`` is True if processing stopped early due to --halt. + """ + all_findings: list[Finding] = [] + has_errors = False + any_modified = False + + for filepath in files: + file_allow = _resolve_allow_for_file(filepath, cfg.allow, cfg.overrides) + file_severity, file_confusables = _resolve_file_settings( + filepath, + cfg.severity, + global_confusables=cfg.do_confusables, + overrides=cfg.overrides, + ) + try: + file_text: str | None = Path(filepath).read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError): + file_text = None + + file_findings = check_file(filepath, file_allow, text=file_text) + if file_confusables: + file_findings.extend(check_confusables(filepath, text=file_text)) + + if args.halt and _findings_match_level(file_findings, args.halt): + all_findings.extend(file_findings) + return all_findings, has_errors, any_modified, True + + if file_text is not None and (args.fix or args.strip): + modified = file_text + if args.fix: + modified = fix_text(modified) + if args.strip: + modified = strip_text(modified, level=args.strip, allow=file_allow) + if modified != file_text: + any_modified = True + filepath_p = Path(filepath) + _atomic_write(filepath_p, modified, filepath_p.stat().st_mode) + + if file_findings and file_severity == "error": + has_errors = True + all_findings.extend(file_findings) + + return all_findings, has_errors, any_modified, False + + def main(argv: list[str] | None = None) -> int: """Run the check-unicode CLI.""" parser = _build_parser() args = parser.parse_args(argv) # Informational flags that exit immediately - if args.list_scripts: - _print_scripts() - return 0 - if args.list_categories: - _print_categories() + if args.list_scripts or args.list_categories: + if args.list_scripts: + _print_scripts() + else: + _print_categories() return 0 if not args.files: @@ -661,6 +921,9 @@ def main(argv: list[str] | None = None) -> int: parser, args ) + if args.files == ["-"]: + return _run_pipe(args, allow, do_confusables=do_confusables, severity=severity) + # Filter out excluded files exclude_patterns = _build_exclude_patterns(args, config) files = [f for f in args.files if not _is_excluded(f, exclude_patterns)] @@ -668,26 +931,22 @@ def main(argv: list[str] | None = None) -> int: if not files: return 0 - # Fix mode - if args.fix: - fixed = [fix_file(filepath) for filepath in files] - any_fixed = any(fixed) - all_findings, has_errors = _scan_files( - files, allow, overrides, do_confusables=do_confusables, severity=severity - ) - if all_findings: - print_findings(all_findings, no_color=args.no_color, quiet=args.quiet) - return 1 if any_fixed or all_findings else 0 - - # Check mode - all_findings, has_errors = _scan_files( - files, allow, overrides, do_confusables=do_confusables, severity=severity + scan_cfg = _ScanConfig( + allow=allow, + overrides=overrides, + severity=severity, + do_confusables=do_confusables, + ) + all_findings, has_errors, any_modified, halted = _process_files( + files, args, scan_cfg ) + if all_findings: print_findings(all_findings, no_color=args.no_color, quiet=args.quiet) - return 1 if has_errors else 0 - return 0 + if halted or any_modified: + return 1 + return 1 if has_errors else 0 if __name__ == "__main__": diff --git a/src/check_unicode/output.py b/src/check_unicode/output.py index ffd52a5..b61e72b 100644 --- a/src/check_unicode/output.py +++ b/src/check_unicode/output.py @@ -156,7 +156,7 @@ def _format_codepoint_entry( return f"{prefix}{cp_part} {finding.name} {cat_part}{count_str}" -def _print_summary(findings: list[Finding]) -> None: +def print_summary(findings: list[Finding]) -> None: """Print a summary line of finding counts to stderr.""" n_files = len({f.file for f in findings}) n_fixable = sum(1 for f in findings if f.fixable) @@ -210,11 +210,43 @@ def _collect_codepoints( ) +def _resolve_file_lines(filepath: str, text: str | None) -> list[str]: + """Return source lines from *text* if provided, else read *filepath* from disk.""" + if text is not None: + return text.splitlines() + try: + return Path(filepath).read_text(encoding="utf-8").splitlines() + except (OSError, UnicodeDecodeError): + return [] + + +def _print_line_context( + line: str, + line_findings: list[Finding], + *, + color: bool, + show_codepoints: bool = False, +) -> None: + """Print rendered context for a single source line.""" + rendered = _render_invisible(line) + caret = _build_caret_line(line, line_findings) + + sys.stderr.write(f" {rendered}\n") + if caret: + sys.stderr.write(f" {caret}\n") + + if show_codepoints: + for finding, count in _collect_codepoints(line_findings): + entry = _format_codepoint_entry(finding, count, color=color) + sys.stderr.write(f" {entry}\n") + + def _print_file_findings( filepath: str, file_findings: list[Finding], *, color: bool, + text: str | None = None, ) -> None: """Print grouped output for a single file.""" # Build compact line ranges for header @@ -225,12 +257,7 @@ def _print_file_findings( header = f"{filepath}:{ranges_str}:" if ranges_str else f"{filepath}:" sys.stderr.write(header + "\n") - # Read file for context display - try: - text = Path(filepath).read_text(encoding="utf-8") - file_lines = text.splitlines() - except (OSError, UnicodeDecodeError): - file_lines = [] + file_lines = _resolve_file_lines(filepath, text) # Group findings by line number by_line: dict[int, list[Finding]] = {} @@ -251,16 +278,14 @@ def _print_file_findings( continue seen_contexts.add(context_key) - sys.stderr.write(f" {rendered}\n") - if caret: - sys.stderr.write(f" {caret}\n") + _print_line_context(line, by_line[lineno], color=color) # Print error findings (line == 0, e.g. couldn't read file) for f in file_findings: if f.line == 0: sys.stderr.write(f" {f.name}\n") - # List unique codepoints with counts + # List unique codepoints with counts across all findings for this file for finding, count in _collect_codepoints(file_findings): entry = _format_codepoint_entry(finding, count, color=color) sys.stderr.write(f" {entry}\n") @@ -286,4 +311,25 @@ def print_findings( for filepath, file_findings in by_file.items(): _print_file_findings(filepath, file_findings, color=color) - _print_summary(findings) + print_summary(findings) + + +def print_line_findings( + filepath: str, + lineno: int, + line: str, + findings: list[Finding], + *, + no_color: bool = False, +) -> None: + """Print findings for a single line immediately. + + Used by streaming pipe mode to emit per-line diagnostics. + """ + color = _use_color(no_color=no_color) + + sys.stderr.write(f"{filepath}:{lineno}:\n") + + _print_line_context(line, findings, color=color, show_codepoints=True) + + sys.stderr.write("\n") diff --git a/tests/test_cli.py b/tests/test_cli.py index fe72578..ac186e9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,10 +3,15 @@ from __future__ import annotations import argparse +import io from pathlib import Path +from typing import TYPE_CHECKING import pytest +if TYPE_CHECKING: + from collections.abc import Callable + from check_unicode.checker import AllowConfig from check_unicode.main import ( Override, @@ -22,6 +27,51 @@ FIXTURES = Path(__file__).parent / "fixtures" +@pytest.fixture +def smart_quotes_file(tmp_path: Path) -> Path: + """File containing smart quotes (fixable characters).""" + f = tmp_path / "smart.txt" + f.write_text("He said \u201chello\u201d\n", encoding="utf-8") + return f + + +@pytest.fixture +def accented_file(tmp_path: Path) -> Path: + """File containing accented characters (non-fixable, non-dangerous).""" + f = tmp_path / "accented.txt" + f.write_text("caf\u00e9\n", encoding="utf-8") + return f + + +@pytest.fixture +def dangerous_file(tmp_path: Path) -> Path: + """File containing dangerous bidi characters.""" + f = tmp_path / "bidi.txt" + f.write_text("x\u202ey\n", encoding="utf-8") + return f + + +@pytest.fixture +def mixed_file(tmp_path: Path) -> Path: + """File with fixable, non-fixable, and dangerous characters.""" + f = tmp_path / "mixed.txt" + f.write_text("caf\u00e9 said \u201chi\u201d x\u202ey\n", encoding="utf-8") + return f + + +@pytest.fixture +def stdin_from(monkeypatch: pytest.MonkeyPatch) -> Callable[[str], None]: + """Fixture that sets sys.stdin to read from a string.""" + + def _set(text: str) -> None: + monkeypatch.setattr( + "sys.stdin", + io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))), + ) + + return _set + + class TestExitCodes: """Tests for CLI exit code behavior.""" @@ -498,6 +548,27 @@ def test_help_mentions_list_flags(self, capsys: pytest.CaptureFixture[str]) -> N assert "--list-scripts" in out assert "--list-categories" in out + def test_help_contains_strip_and_halt( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Help output documents --strip and --halt flags.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "--strip" in out + assert "--halt" in out + assert "dangerous" in out.lower() + + def test_help_contains_pipe_examples( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Help output includes pipe mode examples with new flags.""" + with pytest.raises(SystemExit): + main(["--help"]) + out = capsys.readouterr().out + assert "check-unicode -" in out + assert "--strip" in out + class TestListScripts: """Tests for the --list-scripts flag.""" @@ -944,3 +1015,518 @@ def test_invalid_script(self, tmp_path: Path) -> None: with pytest.raises(SystemExit) as exc_info: main(["--allow-script", "Klingon", str(f)]) assert exc_info.value.code == 2 + + +class TestPipeMode: + """Tests for pipe mode: reading from stdin via `-`.""" + + def test_dash_clean_input_passes_through( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Clean ASCII input is passed through to stdout unchanged, exit 0.""" + monkeypatch.setattr("sys.stdin", io.TextIOWrapper(io.BytesIO(b"hello world\n"))) + assert main(["-"]) == 0 + captured = capsys.readouterr() + assert captured.out == "hello world\n" + + def test_dash_dirty_input_passes_through_with_findings( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Non-ASCII input passes through to stdout, findings on stderr.""" + text = "He said \u201chello\u201d\n" + monkeypatch.setattr( + "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) + ) + assert main(["-"]) == 1 + captured = capsys.readouterr() + assert captured.out == text + assert "U+201C" in captured.err + + def test_dash_fix_mode_writes_fixed_to_stdout( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Fix mode replaces smart quotes and writes fixed text to stdout.""" + text = "He said \u201chello\u201d\n" + monkeypatch.setattr( + "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) + ) + assert main(["--fix", "-"]) == 1 + captured = capsys.readouterr() + assert captured.out == 'He said "hello"\n' + + def test_dash_fix_mode_clean_input( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Fix mode with clean input passes through unchanged, exit 0.""" + monkeypatch.setattr("sys.stdin", io.TextIOWrapper(io.BytesIO(b"clean\n"))) + assert main(["--fix", "-"]) == 0 + captured = capsys.readouterr() + assert captured.out == "clean\n" + + def test_dash_fix_mode_dangerous_still_reported( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Fix mode preserves dangerous chars in output and stderr.""" + text = "x\u202ey\n" + monkeypatch.setattr( + "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) + ) + result = main(["--fix", "-"]) + assert result == 1 + captured = capsys.readouterr() + # Dangerous char preserved in output + assert "\u202e" in captured.out + assert "DANGEROUS" in captured.err + + def test_dash_with_allow_flags( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Allow flags work with pipe mode.""" + text = "72\u00b0F\n" + monkeypatch.setattr( + "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) + ) + assert main(["--allow-codepoint", "U+00B0", "-"]) == 0 + captured = capsys.readouterr() + assert captured.out == text + + def test_dash_filename_in_findings( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Findings use '' as the filename.""" + text = "\u201chello\u201d\n" + monkeypatch.setattr( + "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) + ) + main(["-"]) + captured = capsys.readouterr() + assert "" in captured.err + + +class TestStripAndHaltFlags: + """Tests for --strip and --halt flag parsing.""" + + def test_strip_default_level(self) -> None: + """--strip with no argument defaults to 'all'.""" + parser = _build_parser() + args = parser.parse_args(["--strip", "test.txt"]) + assert args.strip == "all" + + def test_strip_explicit_dangerous(self) -> None: + """--strip dangerous sets level to 'dangerous'.""" + parser = _build_parser() + args = parser.parse_args(["--strip", "dangerous", "test.txt"]) + assert args.strip == "dangerous" + + def test_strip_explicit_all(self) -> None: + """--strip all sets level to 'all'.""" + parser = _build_parser() + args = parser.parse_args(["--strip", "all", "test.txt"]) + assert args.strip == "all" + + def test_halt_default_level(self) -> None: + """--halt with no argument defaults to 'dangerous'.""" + parser = _build_parser() + args = parser.parse_args(["--halt", "test.txt"]) + assert args.halt == "dangerous" + + def test_halt_explicit_all(self) -> None: + """--halt all sets level to 'all'.""" + parser = _build_parser() + args = parser.parse_args(["--halt", "all", "test.txt"]) + assert args.halt == "all" + + def test_halt_explicit_dangerous(self) -> None: + """--halt dangerous sets level to 'dangerous'.""" + parser = _build_parser() + args = parser.parse_args(["--halt", "dangerous", "test.txt"]) + assert args.halt == "dangerous" + + def test_strip_and_halt_together(self) -> None: + """--strip and --halt can be combined.""" + parser = _build_parser() + args = parser.parse_args(["--strip", "all", "--halt", "dangerous", "test.txt"]) + assert args.strip == "all" + assert args.halt == "dangerous" + + def test_fix_strip_halt_together(self) -> None: + """All three action flags can be combined.""" + parser = _build_parser() + args = parser.parse_args( + [ + "--fix", + "--strip", + "dangerous", + "--halt", + "dangerous", + "test.txt", + ] + ) + assert args.fix is True + assert args.strip == "dangerous" + assert args.halt == "dangerous" + + def test_no_strip_defaults_none(self) -> None: + """Without --strip, args.strip is None.""" + parser = _build_parser() + args = parser.parse_args(["test.txt"]) + assert args.strip is None + + def test_no_halt_defaults_none(self) -> None: + """Without --halt, args.halt is None.""" + parser = _build_parser() + args = parser.parse_args(["test.txt"]) + assert args.halt is None + + +class TestStripFileMode: + """Tests for --strip on files.""" + + def test_strip_all_removes_non_ascii(self, accented_file: Path) -> None: + """--strip all removes non-fixable non-ASCII from file.""" + assert main(["--strip", "all", str(accented_file)]) == 1 + assert accented_file.read_text(encoding="utf-8") == "caf\n" + + def test_strip_dangerous_only(self, dangerous_file: Path) -> None: + """--strip dangerous removes only dangerous chars.""" + assert main(["--strip", "dangerous", str(dangerous_file)]) == 1 + assert dangerous_file.read_text(encoding="utf-8") == "xy\n" + + def test_strip_dangerous_keeps_accented(self, accented_file: Path) -> None: + """--strip dangerous does not remove accented characters.""" + assert main(["--strip", "dangerous", str(accented_file)]) == 1 + content = accented_file.read_text(encoding="utf-8") + assert "\u00e9" in content + + def test_fix_strip_combined(self, mixed_file: Path) -> None: + """--fix --strip all fixes fixable, strips the rest.""" + assert main(["--fix", "--strip", "all", str(mixed_file)]) == 1 + content = mixed_file.read_text(encoding="utf-8") + assert content == 'caf said "hi" xy\n' + + def test_fix_strip_dangerous(self, mixed_file: Path) -> None: + """--fix --strip dangerous fixes fixable, strips dangerous.""" + assert main(["--fix", "--strip", "dangerous", str(mixed_file)]) == 1 + content = mixed_file.read_text(encoding="utf-8") + assert content == 'caf\u00e9 said "hi" xy\n' + + def test_strip_clean_file_exits_0(self, tmp_path: Path) -> None: + """--strip on a clean file exits 0 with no changes.""" + f = tmp_path / "clean.txt" + f.write_text("hello world\n", encoding="utf-8") + assert main(["--strip", str(f)]) == 0 + + def test_strip_respects_allow_codepoint(self, accented_file: Path) -> None: + """--strip all respects --allow-codepoint.""" + assert ( + main( + [ + "--strip", + "all", + "--allow-codepoint", + "U+00E9", + str(accented_file), + ] + ) + == 0 + ) + content = accented_file.read_text(encoding="utf-8") + assert content == "caf\u00e9\n" + + +class TestHaltFileMode: + """Tests for --halt on files.""" + + def test_halt_dangerous_stops_on_bidi(self, dangerous_file: Path) -> None: + """--halt dangerous exits 1 on dangerous character.""" + assert main(["--halt", str(dangerous_file)]) == 1 + + def test_halt_dangerous_ignores_accented(self, accented_file: Path) -> None: + """--halt dangerous does not trigger halt on accented chars.""" + # Still exits 1 because findings exist, but no halt behavior + assert main(["--halt", str(accented_file)]) == 1 + + def test_halt_all_stops_on_any_non_ascii(self, accented_file: Path) -> None: + """--halt all exits 1 on any non-ASCII.""" + assert main(["--halt", "all", str(accented_file)]) == 1 + + def test_halt_does_not_modify_file(self, dangerous_file: Path) -> None: + """--halt with --fix does not write the halting file.""" + original = dangerous_file.read_text(encoding="utf-8") + main(["--fix", "--halt", str(dangerous_file)]) + assert dangerous_file.read_text(encoding="utf-8") == original + + def test_halt_skips_remaining_files( + self, dangerous_file: Path, tmp_path: Path + ) -> None: + """--halt stops processing remaining files after trigger.""" + second = tmp_path / "second.txt" + second.write_text("\u201chello\u201d\n", encoding="utf-8") + original_second = second.read_text(encoding="utf-8") + main(["--fix", "--halt", str(dangerous_file), str(second)]) + assert second.read_text(encoding="utf-8") == original_second + + def test_halt_respects_allow_codepoint(self, dangerous_file: Path) -> None: + """--halt does not trigger on allowed codepoints.""" + result = main( + [ + "--halt", + "dangerous", + "--allow-codepoint", + "U+202E,U+202C", + str(dangerous_file), + ] + ) + assert result == 0 + + def test_halt_reports_finding( + self, + dangerous_file: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + """--halt reports the triggering finding on stderr.""" + main(["--halt", str(dangerous_file)]) + err = capsys.readouterr().err + assert "DANGEROUS" in err + + +class TestPipeModeStreaming: + """Tests for streaming pipe mode with --strip and --halt.""" + + def test_pipe_strip_all( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--strip all removes all non-ASCII from pipe output.""" + stdin_from("caf\u00e9\n") + assert main(["--strip", "all", "-"]) == 1 + assert capsys.readouterr().out == "caf\n" + + def test_pipe_strip_dangerous( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--strip dangerous removes only dangerous chars.""" + stdin_from("caf\u00e9 x\u202ey\n") + assert main(["--strip", "dangerous", "-"]) == 1 + captured = capsys.readouterr() + assert "\u00e9" in captured.out + assert "\u202e" not in captured.out + + def test_pipe_fix_strip_combined( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--fix --strip all: fix fixable, strip the rest.""" + stdin_from("caf\u00e9 said \u201chi\u201d\n") + assert main(["--fix", "--strip", "all", "-"]) == 1 + assert capsys.readouterr().out == 'caf said "hi"\n' + + def test_pipe_halt_dangerous( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--halt dangerous stops on dangerous char, no stdout for that line.""" + stdin_from("line1\nx\u202ey\nline3\n") + assert main(["--halt", "-"]) == 1 + captured = capsys.readouterr() + assert "line1\n" in captured.out + assert "\u202e" not in captured.out + assert "line3" not in captured.out + assert "DANGEROUS" in captured.err + + def test_pipe_halt_all( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--halt all stops on any non-ASCII.""" + stdin_from("clean\ncaf\u00e9\nmore\n") + assert main(["--halt", "all", "-"]) == 1 + captured = capsys.readouterr() + assert "clean\n" in captured.out + assert "\u00e9" not in captured.out + + def test_pipe_fix_halt_dangerous( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--fix --halt: fix fixable, halt on dangerous.""" + stdin_from("\u201chi\u201d\nx\u202ey\n") + assert main(["--fix", "--halt", "-"]) == 1 + captured = capsys.readouterr() + assert '"hi"' in captured.out + assert "\u202e" not in captured.out + + def test_pipe_context_display( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Pipe mode shows context lines with caret markers.""" + stdin_from("x\u202ey\n") + main(["-"]) + err = capsys.readouterr().err + assert "" in err + assert "!" in err + + def test_pipe_summary_after_stream( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Pipe mode prints summary line after stdin exhausted.""" + stdin_from("\u201chello\u201d\n") + main(["-"]) + err = capsys.readouterr().err + assert "Found" in err + assert "non-ASCII" in err + + def test_pipe_strip_respects_allow( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--strip all with --allow-codepoint preserves allowed.""" + stdin_from("caf\u00e9\n") + assert main(["--strip", "all", "--allow-codepoint", "U+00E9", "-"]) == 0 + assert capsys.readouterr().out == "caf\u00e9\n" + + def test_pipe_halt_respects_allow( + self, + stdin_from: Callable[[str], None], + ) -> None: + """--halt dangerous with allowed bidi char does not halt.""" + stdin_from("x\u202ey\n") + result = main( + [ + "--halt", + "dangerous", + "--allow-codepoint", + "U+202E,U+202C", + "-", + ] + ) + assert result == 0 + + +class TestFlagInteractionsWithConfig: + """Tests for action flag interactions with config/overrides.""" + + def test_strip_with_config_allow_codepoints(self, tmp_path: Path) -> None: + """Config allow-codepoints are respected by --strip.""" + config = tmp_path / "config.toml" + config.write_text('allow-codepoints = ["U+00E9"]\n', encoding="utf-8") + f = tmp_path / "test.txt" + f.write_text("caf\u00e9 na\u00efve\n", encoding="utf-8") + main(["--config", str(config), "--strip", "all", str(f)]) + content = f.read_text(encoding="utf-8") + assert "\u00e9" in content + assert "\u00ef" not in content + + def test_strip_with_override_allow(self, tmp_path: Path) -> None: + """Per-file override allow-lists are respected by --strip.""" + config = tmp_path / "config.toml" + config.write_text( + '[[overrides]]\nfiles = ["*.md"]\nallow-printable = true\n', + encoding="utf-8", + ) + md = tmp_path / "doc.md" + md.write_text("caf\u00e9\n", encoding="utf-8") + py = tmp_path / "code.py" + py.write_text("caf\u00e9\n", encoding="utf-8") + main(["--config", str(config), "--strip", "all", str(md), str(py)]) + assert "\u00e9" in md.read_text(encoding="utf-8") + assert "\u00e9" not in py.read_text(encoding="utf-8") + + def test_halt_with_severity_warning(self, dangerous_file: Path) -> None: + """--halt still exits 1 regardless of --severity warning.""" + result = main(["--severity", "warning", "--halt", str(dangerous_file)]) + assert result == 1 + + def test_strip_with_severity_warning(self, accented_file: Path) -> None: + """--strip modifies file; exit 1 even with --severity warning.""" + result = main(["--severity", "warning", "--strip", "all", str(accented_file)]) + assert result == 1 + assert "\u00e9" not in accented_file.read_text(encoding="utf-8") + + @pytest.mark.parametrize( + ("flags", "input_text", "expected_out"), + [ + (["--fix"], "\u201chi\u201d\n", '"hi"\n'), + (["--strip", "all"], "caf\u00e9\n", "caf\n"), + (["--fix", "--strip", "all"], "caf\u00e9 \u201chi\u201d\n", 'caf "hi"\n'), + ( + ["--fix", "--strip", "dangerous"], + "\u201chi\u201d x\u202ey\n", + '"hi" xy\n', + ), + ( + ["--strip", "dangerous"], + "caf\u00e9 x\u202ey\n", + "caf\u00e9 xy\n", + ), + ], + ids=[ + "fix-only", + "strip-all", + "fix-strip-all", + "fix-strip-dangerous", + "strip-dangerous-only", + ], + ) + def test_pipe_flag_combinations( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + flags: list[str], + input_text: str, + expected_out: str, + ) -> None: + """Parametrized test of flag combinations in pipe mode.""" + stdin_from(input_text) + main([*flags, "-"]) + assert capsys.readouterr().out == expected_out + + def test_quiet_flag_with_strip( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--quiet suppresses per-finding output in pipe mode.""" + stdin_from("caf\u00e9\n") + main(["--quiet", "--strip", "all", "-"]) + err = capsys.readouterr().err + assert "Found" in err + assert "U+00E9" not in err.split("Found")[0] + + def test_no_color_with_halt( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--no-color works with --halt.""" + stdin_from("x\u202ey\n") + main(["--no-color", "--halt", "-"]) + err = capsys.readouterr().err + assert "\033[" not in err + assert "DANGEROUS" in err diff --git a/tests/test_fixer.py b/tests/test_fixer.py index cc4048a..0e25047 100644 --- a/tests/test_fixer.py +++ b/tests/test_fixer.py @@ -8,12 +8,58 @@ import pytest -from check_unicode.fixer import fix_file +from check_unicode.checker import AllowConfig +from check_unicode.fixer import fix_file, strip_text if TYPE_CHECKING: from pathlib import Path +class TestStripText: + """Tests for strip_text() character removal.""" + + @pytest.mark.parametrize( + ("level", "input_text", "expected"), + [ + ("all", "caf\u00e9\n", "caf\n"), + ("all", "He said \u201chello\u201d\n", "He said hello\n"), + ("dangerous", "caf\u00e9\n", "caf\u00e9\n"), + ("dangerous", "x\u202ey\n", "xy\n"), + ("dangerous", "a\u200bb\n", "ab\n"), + ("all", "hello world\n", "hello world\n"), + ("dangerous", "hello world\n", "hello world\n"), + ("all", "", ""), + ], + ids=[ + "all-accented", + "all-smart-quotes", + "dangerous-keeps-accented", + "dangerous-strips-bidi", + "dangerous-strips-zwsp", + "all-clean-passthrough", + "dangerous-clean-passthrough", + "all-empty", + ], + ) + def test_strip_text(self, level: str, input_text: str, expected: str) -> None: + """strip_text removes characters based on level.""" + assert strip_text(input_text, level=level) == expected + + def test_strip_text_respects_allowed(self) -> None: + """Allowed codepoints are never stripped.""" + text = "caf\u00e9 x\u202ey\n" + allow = AllowConfig(codepoints=frozenset({0x00E9})) + result = strip_text(text, level="all", allow=allow) + assert result == "caf\u00e9 xy\n" + + def test_strip_dangerous_respects_allowed(self) -> None: + """Explicitly allowed dangerous codepoints are not stripped.""" + text = "x\u202ey\n" + allow = AllowConfig(codepoints=frozenset({0x202E})) + result = strip_text(text, level="dangerous", allow=allow) + assert result == "x\u202ey\n" + + class TestSmartQuoteReplacement: """Tests for smart quote to ASCII replacement.""" diff --git a/tests/test_output.py b/tests/test_output.py index f02dee2..e5db5c7 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -14,8 +14,10 @@ _build_caret_line, _compact_ranges, _format_codepoint_entry, + _print_file_findings, _use_color, print_findings, + print_line_findings, ) FIXTURES = Path(__file__).parent / "fixtures" @@ -343,3 +345,104 @@ def test_count_for_repeated_codepoints( print_findings(findings, no_color=True) err = capsys.readouterr().err assert "(x10)" in err + + +class TestPrintFileFindingsWithText: + """Tests for _print_file_findings with pre-supplied text.""" + + def test_stdin_context_display(self, capsys: pytest.CaptureFixture[str]) -> None: + """Findings for show context when text is provided.""" + text = "x\u202ey\n" + findings = [ + Finding( + file="", + line=1, + col=2, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ), + ] + _print_file_findings("", findings, color=False, text=text) + err = capsys.readouterr().err + assert "" in err + assert "!" in err + + def test_stdin_no_text_no_context(self, capsys: pytest.CaptureFixture[str]) -> None: + """Without text param, findings lack context.""" + findings = [ + Finding( + file="", + line=1, + col=2, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ), + ] + _print_file_findings("", findings, color=False) + err = capsys.readouterr().err + assert "U+202E" in err + assert "" not in err + + +class TestPrintLineFindings: + """Tests for per-line finding output in pipe mode.""" + + def test_single_line_output(self, capsys: pytest.CaptureFixture[str]) -> None: + """print_line_findings emits context for one line.""" + line = "x\u202ey" + findings = [ + Finding( + file="", + line=5, + col=2, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ), + ] + print_line_findings("", 5, line, findings, no_color=True) + err = capsys.readouterr().err + assert ":5:" in err + assert "" in err + assert "!" in err + assert "U+202E" in err + + def test_multiple_findings_same_line( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Multiple findings on one line all appear.""" + line = "\u201chello\u201d" + findings = [ + Finding( + file="", + line=1, + col=1, + char="\u201c", + codepoint=0x201C, + name="LEFT DOUBLE QUOTATION MARK", + category="Pi", + dangerous=False, + ), + Finding( + file="", + line=1, + col=8, + char="\u201d", + codepoint=0x201D, + name="RIGHT DOUBLE QUOTATION MARK", + category="Pf", + dangerous=False, + ), + ] + print_line_findings("", 1, line, findings, no_color=True) + err = capsys.readouterr().err + assert "U+201C" in err + assert "U+201D" in err diff --git a/uv.lock b/uv.lock index 81dc0c5..8bd84c1 100644 --- a/uv.lock +++ b/uv.lock @@ -71,6 +71,7 @@ dev = [ { name = "bump-my-version" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-sugar" }, { name = "ruff" }, { name = "ty" }, ] @@ -82,6 +83,7 @@ dev = [ { name = "bump-my-version" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "pytest-sugar", specifier = ">=1.1.1" }, { name = "ruff" }, { name = "ty" }, ] @@ -482,6 +484,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "pytest-sugar" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, + { name = "termcolor" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0b/4e/60fed105549297ba1a700e1ea7b828044842ea27d72c898990510b79b0e2/pytest-sugar-1.1.1.tar.gz", hash = "sha256:73b8b65163ebf10f9f671efab9eed3d56f20d2ca68bda83fa64740a92c08f65d", size = 16533, upload-time = "2025-08-23T12:19:35.737Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/d5/81d38a91c1fdafb6711f053f5a9b92ff788013b19821257c2c38c1e132df/pytest_sugar-1.1.1-py3-none-any.whl", hash = "sha256:2f8319b907548d5b9d03a171515c1d43d2e38e32bd8182a1781eb20b43344cc8", size = 11440, upload-time = "2025-08-23T12:19:34.894Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.1" @@ -555,6 +570,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/78/097c0798b1dab9f8affe73da9642bb4500e098cb27fd8dc9724816ac747b/ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e", size = 10941649, upload-time = "2026-02-19T22:32:18.108Z" }, ] +[[package]] +name = "termcolor" +version = "3.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/46/79/cf31d7a93a8fdc6aa0fbb665be84426a8c5a557d9240b6239e9e11e35fc5/termcolor-3.3.0.tar.gz", hash = "sha256:348871ca648ec6a9a983a13ab626c0acce02f515b9e1983332b17af7979521c5", size = 14434, upload-time = "2025-12-29T12:55:21.882Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/d1/8bb87d21e9aeb323cc03034f5eaf2c8f69841e40e4853c2627edf8111ed3/termcolor-3.3.0-py3-none-any.whl", hash = "sha256:cf642efadaf0a8ebbbf4bc7a31cec2f9b5f21a9f726f4ccbb08192c9c26f43a5", size = 7734, upload-time = "2025-12-29T12:55:20.718Z" }, +] + [[package]] name = "tomli" version = "2.4.0"