From c99e2ebbe74805448bb3be2f91e7b59d2a6cdb4d Mon Sep 17 00:00:00 2001 From: mit-d Date: Tue, 10 Mar 2026 19:14:09 -0600 Subject: [PATCH 1/3] feat: group output by file with compact line ranges to reduce noise Instead of printing one block per character (300+ lines for 100 box-drawing chars), findings are now grouped per file with: - Compact line range header (e.g. file.txt:1,4-80,90:) - Context lines shown once with multi-caret markers (^, ! dangerous, ? confusable) - Deduplicated identical context lines - Collapsed codepoint listing with (xN) counts --- src/check_unicode/output.py | 265 ++++++++++++++++++++++++------ tests/test_confusables.py | 8 +- tests/test_output.py | 316 +++++++++++++++++++++++++++++++++--- 3 files changed, 505 insertions(+), 84 deletions(-) diff --git a/src/check_unicode/output.py b/src/check_unicode/output.py index b12ed37..9447e5f 100644 --- a/src/check_unicode/output.py +++ b/src/check_unicode/output.py @@ -42,49 +42,121 @@ def _render_invisible(line: str) -> str: return "".join(out) -def _format_finding(f: Finding, *, color: bool) -> str: - """Format a single finding as file:line:col: U+XXXX NAME [Cat].""" - cp_str = f"U+{f.codepoint:04X}" - if color: - if f.dangerous: - prefix = f"{_BOLD_RED}[DANGEROUS]{_RESET} " +def _compact_ranges(lines: list[int]) -> str: + """Convert sorted line numbers to compact range string like '1,4-80,90'.""" + if not lines: + return "" + + sorted_lines = sorted(set(lines)) + ranges: list[str] = [] + start = sorted_lines[0] + end = sorted_lines[0] + + for line in sorted_lines[1:]: + if line == end + 1: + end = line + else: + ranges.append(str(start) if start == end else f"{start}-{end}") + start = line + end = line + + ranges.append(str(start) if start == end else f"{start}-{end}") + return ",".join(ranges) + + +def _rendered_width(ch: str) -> int: + """Return the display width of a character after invisible-char expansion.""" + cp = ord(ch) + return len(f"") if cp > _MAX_ASCII and not ch.isprintable() else 1 
+ + +def _build_caret_line(line: str, line_findings: list[Finding]) -> str: + """Build a caret line with ^ for normal, ! for dangerous, ? for confusable.""" + # Map column (1-indexed) to most severe finding at that column + col_map: dict[int, Finding] = {} + for f in line_findings: + existing = col_map.get(f.col) + if ( + existing is None + or (f.dangerous and not existing.dangerous) + or ( + f.confusable is not None + and not existing.dangerous + and existing.confusable is None + ) + ): + col_map[f.col] = f + + # Walk through the line, tracking rendered position + markers: list[tuple[int, str]] = [] + pos = 0 + for i, ch in enumerate(line): + col = i + 1 + if col in col_map: + mf = col_map[col] + match (mf.dangerous, mf.confusable): + case (True, _): + marker = "!" + case (_, str()): + marker = "?" + case _: + marker = "^" + markers.append((pos, marker)) + + pos += _rendered_width(ch) + + if not markers: + return "" + + # Build caret string + result: list[str] = [] + last_pos = 0 + for rpos, marker in markers: + result.append(" " * (rpos - last_pos)) + result.append(marker) + last_pos = rpos + 1 + + return "".join(result) + + +def _format_codepoint_entry( + finding: Finding, + count: int, + *, + color: bool, +) -> str: + """Format a unique codepoint listing entry.""" + cp_str = f"U+{finding.codepoint:04X}" + count_str = f" (x{count})" if count > 1 else "" + + match (finding.dangerous, finding.confusable, color): + case (True, _, True): + prefix = f"{_BOLD_RED}!{_RESET} {_BOLD_RED}[DANGEROUS]{_RESET} " cp_part = f"{_BOLD_RED}{cp_str}{_RESET}" - elif f.confusable is not None: - prefix = f"{_YELLOW}[CONFUSABLE]{_RESET} " + case (True, _, False): + prefix = "! [DANGEROUS] " + cp_part = cp_str + case (_, str() as lookalike, True): + prefix = ( + f"{_YELLOW}?{_RESET} " + f"{_YELLOW}[CONFUSABLE: looks like '{lookalike}']{_RESET} " + ) cp_part = f"{_YELLOW}{cp_str}{_RESET}" - else: + case (_, str() as lookalike, False): + prefix = f"? 
[CONFUSABLE: looks like '{lookalike}'] " + cp_part = cp_str + case (_, _, True): prefix = "" cp_part = f"{_RED}{cp_str}{_RESET}" - cat_part = f"{_DIM}[{f.category}]{_RESET}" - else: - if f.dangerous: - prefix = "[DANGEROUS] " - elif f.confusable is not None: - prefix = f"[CONFUSABLE: looks like '{f.confusable}'] " - else: + case _: prefix = "" - cp_part = cp_str - cat_part = f"[{f.category}]" - return f"{f.file}:{f.line}:{f.col}: {prefix}{cp_part} {f.name} {cat_part}" + cp_part = cp_str + cat_part = ( + f"{_DIM}[{finding.category}]{_RESET}" if color else f"[{finding.category}]" + ) -def _context_line(finding: Finding, file_lines: list[str]) -> str: - """Show the source line with a caret pointing at the character.""" - if finding.line < 1 or finding.line > len(file_lines): - return "" - line = file_lines[finding.line - 1] - rendered = _render_invisible(line) - # Compute caret position accounting for invisible char expansion - caret_pos = 0 - for i, ch in enumerate(line): - if i == finding.col - 1: - break - cp = ord(ch) - if cp > _MAX_ASCII and not ch.isprintable(): - caret_pos += len(f"") - else: - caret_pos += 1 - return f" {rendered}\n {' ' * caret_pos}^" + return f"{prefix}{cp_part} {finding.name} {cat_part}{count_str}" def _print_summary(findings: list[Finding]) -> None: @@ -109,33 +181,118 @@ def _print_summary(findings: list[Finding]) -> None: sys.stderr.write(" ".join(parts) + "\n") +def _collect_codepoints( + file_findings: list[Finding], +) -> list[tuple[Finding, int]]: + """Collect unique codepoints with counts, preferring the most informative. + + When the same codepoint appears as both a normal finding and a confusable + (or dangerous), the more informative classification wins. + Returns a sorted list of (finding, count) tuples. 
+ """ + cp_counts: dict[int, tuple[Finding, int]] = {} + for f in file_findings: + if f.line == 0: + # Error finding (e.g., couldn't read file) -- printed separately + sys.stderr.write(f" {f.name}\n") + continue + existing = cp_counts.get(f.codepoint) + if existing is None: + cp_counts[f.codepoint] = (f, 1) + else: + existing_f, n = existing + # Prefer dangerous > confusable > normal + best = ( + f + if (f.dangerous and not existing_f.dangerous) + or ( + f.confusable is not None + and not existing_f.dangerous + and existing_f.confusable is None + ) + else existing_f + ) + cp_counts[f.codepoint] = (best, n + 1) + + return sorted( + cp_counts.values(), + key=lambda x: ( + not x[0].dangerous, + x[0].confusable is None, + x[0].codepoint, + ), + ) + + +def _print_file_findings( + filepath: str, + file_findings: list[Finding], + *, + color: bool, +) -> None: + """Print grouped output for a single file.""" + # Build compact line ranges for header + lines_with_findings = sorted({f.line for f in file_findings if f.line > 0}) + ranges_str = _compact_ranges(lines_with_findings) + + # Print header + header = f"{filepath}:{ranges_str}:" if ranges_str else f"{filepath}:" + sys.stderr.write(header + "\n") + + # Read file for context display + try: + text = Path(filepath).read_text(encoding="utf-8") + file_lines = text.splitlines() + except (OSError, UnicodeDecodeError): + file_lines = [] + + # Group findings by line number + by_line: dict[int, list[Finding]] = {} + for f in file_findings: + by_line.setdefault(f.line, []).append(f) + + # Show context lines with carets, deduplicating identical blocks + seen_contexts: set[tuple[str, str]] = set() + for lineno in sorted(by_line): + if lineno < 1 or lineno > len(file_lines): + continue + line = file_lines[lineno - 1] + rendered = _render_invisible(line) + caret = _build_caret_line(line, by_line[lineno]) + + context_key = (rendered, caret) + if context_key in seen_contexts: + continue + seen_contexts.add(context_key) + + 
sys.stderr.write(f" {rendered}\n") + if caret: + sys.stderr.write(f" {caret}\n") + + # List unique codepoints with counts + for finding, count in _collect_codepoints(file_findings): + entry = _format_codepoint_entry(finding, count, color=color) + sys.stderr.write(f" {entry}\n") + + sys.stderr.write("\n") + + def print_findings( findings: list[Finding], *, no_color: bool = False, quiet: bool = False, ) -> None: - """Print findings to stderr.""" + """Print findings to stderr, grouped by file with compact line ranges.""" color = _use_color(no_color=no_color) - # Group by file for context lines - files_cache: dict[str, list[str]] = {} - if not quiet: + # Group by file, preserving first-seen order + by_file: dict[str, list[Finding]] = {} for f in findings: - line = _format_finding(f, color=color) - sys.stderr.write(line + "\n") - - # Show context if the finding has valid line info - if f.line > 0: - if f.file not in files_cache: - try: - text = Path(f.file).read_text(encoding="utf-8") - files_cache[f.file] = text.splitlines() - except (OSError, UnicodeDecodeError): - files_cache[f.file] = [] - ctx = _context_line(f, files_cache[f.file]) - if ctx: - sys.stderr.write(ctx + "\n") + by_file.setdefault(f.file, []).append(f) + + for filepath, file_findings in by_file.items(): + _print_file_findings(filepath, file_findings, color=color) _print_summary(findings) diff --git a/tests/test_confusables.py b/tests/test_confusables.py index 04fcfff..c593945 100644 --- a/tests/test_confusables.py +++ b/tests/test_confusables.py @@ -9,7 +9,7 @@ from check_unicode.checker import Finding, check_confusables, check_file from check_unicode.confusables import CONFUSABLES from check_unicode.main import main -from check_unicode.output import _format_finding, print_findings +from check_unicode.output import _format_codepoint_entry, print_findings FIXTURES = Path(__file__).parent / "fixtures" @@ -160,7 +160,7 @@ def test_confusable_format_no_color(self) -> None: dangerous=False, confusable="a", ) 
- result = _format_finding(finding, color=False) + result = _format_codepoint_entry(finding, 1, color=False) assert "[CONFUSABLE: looks like 'a']" in result def test_confusable_format_with_color(self) -> None: @@ -176,8 +176,8 @@ def test_confusable_format_with_color(self) -> None: dangerous=False, confusable="a", ) - result = _format_finding(finding, color=True) - assert "[CONFUSABLE]" in result + result = _format_codepoint_entry(finding, 1, color=True) + assert "[CONFUSABLE: looks like 'a']" in result assert "\033[33m" in result # yellow def test_confusable_summary_count(self) -> None: diff --git a/tests/test_output.py b/tests/test_output.py index 3773024..f02dee2 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -3,12 +3,17 @@ from __future__ import annotations from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import patch +if TYPE_CHECKING: + import pytest + from check_unicode.checker import Finding, check_file from check_unicode.output import ( - _context_line, - _format_finding, + _build_caret_line, + _compact_ranges, + _format_codepoint_entry, _use_color, print_findings, ) @@ -25,33 +30,154 @@ def test_no_color_env_var(self) -> None: assert _use_color(no_color=False) is False -class TestFormatFinding: - """Tests for finding formatting with and without color.""" +class TestCompactRanges: + """Tests for compact line range formatting.""" - def test_dangerous_with_color(self) -> None: - """Dangerous findings include bold red [DANGEROUS] prefix with color.""" - findings = check_file(FIXTURES / "bidi_attack.txt") - dangerous = [f for f in findings if f.dangerous] - result = _format_finding(dangerous[0], color=True) - assert "[DANGEROUS]" in result - assert "\033[1;31m" in result + def test_empty(self) -> None: + """Empty input returns empty string.""" + assert _compact_ranges([]) == "" + + def test_single_line(self) -> None: + """Single line number returned as-is.""" + assert _compact_ranges([5]) == "5" + + def 
test_consecutive_lines(self) -> None: + """Consecutive lines collapsed into a range.""" + assert _compact_ranges([1, 2, 3, 4]) == "1-4" + + def test_mixed(self) -> None: + """Mix of singles and ranges formatted correctly.""" + assert _compact_ranges([1, 4, 5, 6, 7, 80, 90]) == "1,4-7,80,90" + + def test_unsorted_input(self) -> None: + """Unsorted input is sorted before formatting.""" + assert _compact_ranges([90, 1, 5, 4, 80, 7, 6]) == "1,4-7,80,90" + + def test_duplicates(self) -> None: + """Duplicate line numbers are deduplicated.""" + assert _compact_ranges([1, 1, 2, 2, 3]) == "1-3" + + def test_two_separate(self) -> None: + """Two non-consecutive lines shown comma-separated.""" + assert _compact_ranges([3, 7]) == "3,7" + + +class TestBuildCaretLine: + """Tests for caret line construction.""" - def test_non_dangerous_with_color(self) -> None: - """Non-dangerous findings use red codepoint with color.""" - findings = check_file(FIXTURES / "smart_quotes.txt") - result = _format_finding(findings[0], color=True) - assert "\033[31m" in result - assert "[DANGEROUS]" not in result + def test_single_finding(self) -> None: + """Single finding produces one caret at correct position.""" + line = "He said \u201chello\u201d" + findings = [ + Finding( + file="t.txt", + line=1, + col=9, + char="\u201c", + codepoint=0x201C, + name="LEFT DOUBLE QUOTATION MARK", + category="Ps", + dangerous=False, + ), + ] + caret = _build_caret_line(line, findings) + assert caret == " ^" + def test_dangerous_uses_exclamation(self) -> None: + """Dangerous findings marked with ! instead of ^.""" + line = "x\u202ey" + findings = [ + Finding( + file="t.txt", + line=1, + col=2, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ), + ] + caret = _build_caret_line(line, findings) + assert "!" 
in caret + assert "^" not in caret -class TestContextLine: - """Tests for source context line display.""" + def test_confusable_uses_question(self) -> None: + """Confusable findings marked with ? instead of ^.""" + line = "p\u0430ssword" + findings = [ + Finding( + file="t.txt", + line=1, + col=2, + char="\u0430", + codepoint=0x0430, + name="CYRILLIC SMALL LETTER A", + category="Ll", + dangerous=False, + confusable="a", + ), + ] + caret = _build_caret_line(line, findings) + assert "?" in caret + assert "^" not in caret - def test_out_of_range_line(self) -> None: - """Out-of-range line numbers return empty string.""" + def test_multiple_findings_on_line(self) -> None: + """Multiple findings produce multiple carets.""" + line = "\u201chello\u201d" + findings = [ + Finding( + file="t.txt", + line=1, + col=1, + char="\u201c", + codepoint=0x201C, + name="LEFT DOUBLE QUOTATION MARK", + category="Ps", + dangerous=False, + ), + Finding( + file="t.txt", + line=1, + col=7, + char="\u201d", + codepoint=0x201D, + name="RIGHT DOUBLE QUOTATION MARK", + category="Pe", + dangerous=False, + ), + ] + caret = _build_caret_line(line, findings) + assert caret.count("^") == 2 + + def test_invisible_char_expansion(self) -> None: + """Caret position accounts for expansion of invisible chars.""" + line = "a\u200bb" # ZWS between a and b + findings = [ + Finding( + file="t.txt", + line=1, + col=2, + char="\u200b", + codepoint=0x200B, + name="ZERO WIDTH SPACE", + category="Cf", + dangerous=True, + ), + ] + caret = _build_caret_line(line, findings) + # 'a' is at position 0, ZWS renders as <U+200B> starting at position 1 + assert caret == " !" 
+ + +class TestFormatCodepointEntry: + """Tests for codepoint listing entry formatting.""" + + def test_normal_no_color(self) -> None: + """Normal finding formatted with codepoint, name, and category.""" finding = Finding( - file="test.txt", - line=999, + file="t.txt", + line=1, col=1, char="\u201c", codepoint=0x201C, @@ -59,14 +185,80 @@ def test_out_of_range_line(self) -> None: category="Ps", dangerous=False, ) - assert _context_line(finding, ["only one line"]) == "" + result = _format_codepoint_entry(finding, 1, color=False) + assert "U+201C" in result + assert "LEFT DOUBLE QUOTATION MARK" in result + assert "[Ps]" in result + assert "(x" not in result + + def test_count_shown(self) -> None: + """Count > 1 shows (xN) suffix.""" + finding = Finding( + file="t.txt", + line=1, + col=1, + char="\u2500", + codepoint=0x2500, + name="BOX DRAWINGS LIGHT HORIZONTAL", + category="So", + dangerous=False, + ) + result = _format_codepoint_entry(finding, 98, color=False) + assert "(x98)" in result + + def test_dangerous_prefix(self) -> None: + """Dangerous findings prefixed with ! [DANGEROUS].""" + finding = Finding( + file="t.txt", + line=1, + col=1, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ) + result = _format_codepoint_entry(finding, 1, color=False) + assert result.startswith("! [DANGEROUS]") + + def test_confusable_prefix(self) -> None: + """Confusable findings prefixed with ? [CONFUSABLE].""" + finding = Finding( + file="t.txt", + line=1, + col=1, + char="\u0430", + codepoint=0x0430, + name="CYRILLIC SMALL LETTER A", + category="Ll", + dangerous=False, + confusable="a", + ) + result = _format_codepoint_entry(finding, 1, color=False) + assert result.startswith("? 
[CONFUSABLE: looks like 'a']") + + def test_dangerous_with_color(self) -> None: + """Dangerous findings use bold red ANSI codes.""" + finding = Finding( + file="t.txt", + line=1, + col=1, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ) + result = _format_codepoint_entry(finding, 1, color=True) + assert "[DANGEROUS]" in result + assert "\033[1;31m" in result class TestPrintFindings: - """Tests for full finding output.""" + """Tests for full grouped output.""" def test_context_file_read_failure(self) -> None: - """Findings referencing nonexistent files don't crash context display.""" + """Findings referencing nonexistent files don't crash.""" finding = Finding( file="/nonexistent/file.txt", line=1, @@ -79,3 +271,75 @@ def test_context_file_read_failure(self) -> None: ) # Should not raise print_findings([finding], no_color=True) + + def test_grouped_header_format( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Output shows filepath:ranges: header.""" + f = tmp_path / "test.txt" + f.write_text("He said \u201chello\u201d\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True) + err = capsys.readouterr().err + assert f"{f}:1:" in err + + def test_grouped_caret_line( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Output shows carets under non-ASCII characters.""" + f = tmp_path / "test.txt" + f.write_text("He said \u201chello\u201d\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True) + err = capsys.readouterr().err + # Should have caret markers + assert "^" in err + + def test_grouped_codepoint_listing( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Output lists unique codepoints.""" + f = tmp_path / "test.txt" + f.write_text("He said \u201chello\u201d\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, 
no_color=True) + err = capsys.readouterr().err + assert "U+201C" in err + assert "U+201D" in err + + def test_quiet_suppresses_detail( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Quiet mode shows only summary.""" + f = tmp_path / "test.txt" + f.write_text("He said \u201chello\u201d\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True, quiet=True) + err = capsys.readouterr().err + assert "Found" in err + assert "U+201C" not in err + + def test_deduplicates_identical_context( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Identical context lines are shown only once.""" + f = tmp_path / "test.txt" + # Write 5 identical lines with same non-ASCII char + f.write_text("\u2500\u2500\u2500\n" * 5, encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True) + err = capsys.readouterr().err + # The context line should appear only once despite 5 source lines + rendered_line = "\u2500\u2500\u2500" + assert err.count(f" {rendered_line}") == 1 + + def test_count_for_repeated_codepoints( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Repeated codepoints show (xN) count.""" + f = tmp_path / "test.txt" + f.write_text("\u2500" * 10 + "\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True) + err = capsys.readouterr().err + assert "(x10)" in err From 90b50cd227a45700a201da744c651111d6a7de1b Mon Sep 17 00:00:00 2001 From: mit-d Date: Tue, 10 Mar 2026 19:18:52 -0600 Subject: [PATCH 2/3] docs: add grouped output to changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c631bc..fe9d9d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,13 @@ - Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source files) +### Added + +- Grouped output: findings are now grouped per file with a compact line range + 
header (e.g. `file.txt:1,4-80,90:`), context lines shown once with multi-caret + markers (`^` normal, `!` dangerous, `?` confusable), deduplicated identical + context lines, and collapsed codepoint listing with `(xN)` counts + ### Changed - Refactor `_apply_replacements` to use `str.translate()` for cleaner code and From 99e0628ea06954489f9066b1d2579dd4fc2cb58f Mon Sep 17 00:00:00 2001 From: mit-d Date: Tue, 10 Mar 2026 19:25:55 -0600 Subject: [PATCH 3/3] refactor: extract severity helper and remove side effects from _collect_codepoints - Extract _is_more_severe() to deduplicate severity-priority logic used in both _build_caret_line and _collect_codepoints - Replace _rendered_width() with inline len(_render_invisible(ch)) - Move stderr side effect out of _collect_codepoints into _print_file_findings for better separation of concerns --- src/check_unicode/output.py | 45 +++++++++++++++---------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/src/check_unicode/output.py b/src/check_unicode/output.py index 9447e5f..ffd52a5 100644 --- a/src/check_unicode/output.py +++ b/src/check_unicode/output.py @@ -64,10 +64,15 @@ def _compact_ranges(lines: list[int]) -> str: return ",".join(ranges) -def _rendered_width(ch: str) -> int: - """Return the display width of a character after invisible-char expansion.""" - cp = ord(ch) - return len(f"") if cp > _MAX_ASCII and not ch.isprintable() else 1 +def _is_more_severe(candidate: Finding, existing: Finding) -> bool: + """Return True if *candidate* should replace *existing*.""" + if candidate.dangerous and not existing.dangerous: + return True + return ( + candidate.confusable is not None + and not existing.dangerous + and existing.confusable is None + ) def _build_caret_line(line: str, line_findings: list[Finding]) -> str: @@ -76,15 +81,7 @@ def _build_caret_line(line: str, line_findings: list[Finding]) -> str: col_map: dict[int, Finding] = {} for f in line_findings: existing = col_map.get(f.col) - if ( - 
existing is None - or (f.dangerous and not existing.dangerous) - or ( - f.confusable is not None - and not existing.dangerous - and existing.confusable is None - ) - ): + if existing is None or _is_more_severe(f, existing): col_map[f.col] = f # Walk through the line, tracking rendered position @@ -103,7 +100,7 @@ def _build_caret_line(line: str, line_findings: list[Finding]) -> str: marker = "^" markers.append((pos, marker)) - pos += _rendered_width(ch) + pos += len(_render_invisible(ch)) if not markers: return "" @@ -188,30 +185,19 @@ def _collect_codepoints( When the same codepoint appears as both a normal finding and a confusable (or dangerous), the more informative classification wins. + Findings with line == 0 (read errors) are skipped. Returns a sorted list of (finding, count) tuples. """ cp_counts: dict[int, tuple[Finding, int]] = {} for f in file_findings: if f.line == 0: - # Error finding (e.g., couldn't read file) -- printed separately - sys.stderr.write(f" {f.name}\n") continue existing = cp_counts.get(f.codepoint) if existing is None: cp_counts[f.codepoint] = (f, 1) else: existing_f, n = existing - # Prefer dangerous > confusable > normal - best = ( - f - if (f.dangerous and not existing_f.dangerous) - or ( - f.confusable is not None - and not existing_f.dangerous - and existing_f.confusable is None - ) - else existing_f - ) + best = f if _is_more_severe(f, existing_f) else existing_f cp_counts[f.codepoint] = (best, n + 1) return sorted( @@ -269,6 +255,11 @@ def _print_file_findings( if caret: sys.stderr.write(f" {caret}\n") + # Print error findings (line == 0, e.g. couldn't read file) + for f in file_findings: + if f.line == 0: + sys.stderr.write(f" {f.name}\n") + # List unique codepoints with counts for finding, count in _collect_codepoints(file_findings): entry = _format_codepoint_entry(finding, count, color=color)