Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@
- Exclude `tests/fixtures/` from mypy (intentionally malformed Trojan Source
files)

### Added

- Grouped output: findings are now grouped per file with a compact line range
header (e.g. `file.txt:1,4-80,90:`), context lines shown once with multi-caret
markers (`^` normal, `!` dangerous, `?` confusable), deduplicated identical
context lines, and collapsed codepoint listing with `(xN)` counts

### Changed

- Refactor `_apply_replacements` to use `str.translate()` for cleaner code and
Expand Down
256 changes: 202 additions & 54 deletions src/check_unicode/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,49 +42,118 @@ def _render_invisible(line: str) -> str:
return "".join(out)


def _format_finding(f: Finding, *, color: bool) -> str:
"""Format a single finding as file:line:col: U+XXXX NAME [Cat]."""
cp_str = f"U+{f.codepoint:04X}"
if color:
if f.dangerous:
prefix = f"{_BOLD_RED}[DANGEROUS]{_RESET} "
def _compact_ranges(lines: list[int]) -> str:
"""Convert sorted line numbers to compact range string like '1,4-80,90'."""
if not lines:
return ""

sorted_lines = sorted(set(lines))
ranges: list[str] = []
start = sorted_lines[0]
end = sorted_lines[0]

for line in sorted_lines[1:]:
if line == end + 1:
end = line
else:
ranges.append(str(start) if start == end else f"{start}-{end}")
start = line
end = line

ranges.append(str(start) if start == end else f"{start}-{end}")
return ",".join(ranges)


def _is_more_severe(candidate: Finding, existing: Finding) -> bool:
    """Return True if *candidate* should replace *existing*."""
    # A dangerous candidate outranks anything not already dangerous.
    if candidate.dangerous:
        return not existing.dangerous
    # Nothing outranks a dangerous or already-confusable holder.
    if existing.dangerous or existing.confusable is not None:
        return False
    # A confusable candidate outranks a plain finding.
    return candidate.confusable is not None


def _build_caret_line(line: str, line_findings: list[Finding]) -> str:
    """Build a caret line with ^ for normal, ! for dangerous, ? for confusable."""
    # Keep only the most severe finding per 1-indexed column.
    severest: dict[int, Finding] = {}
    for finding in line_findings:
        current = severest.get(finding.col)
        if current is None or _is_more_severe(finding, current):
            severest[finding.col] = finding

    # Translate marked source columns to positions in the *rendered* line,
    # where invisible characters expand to multi-character escapes.
    marks: list[tuple[int, str]] = []
    rendered_pos = 0
    for idx, ch in enumerate(line):
        finding = severest.get(idx + 1)
        if finding is not None:
            if finding.dangerous:
                symbol = "!"
            elif finding.confusable is not None:
                symbol = "?"
            else:
                symbol = "^"
            marks.append((rendered_pos, symbol))
        rendered_pos += len(_render_invisible(ch))

    if not marks:
        return ""

    # Assemble the caret string left to right, padding between markers.
    pieces: list[str] = []
    cursor = 0
    for position, symbol in marks:
        pieces.append(" " * (position - cursor))
        pieces.append(symbol)
        cursor = position + 1

    return "".join(pieces)


def _format_codepoint_entry(
finding: Finding,
count: int,
*,
color: bool,
) -> str:
"""Format a unique codepoint listing entry."""
cp_str = f"U+{finding.codepoint:04X}"
count_str = f" (x{count})" if count > 1 else ""

match (finding.dangerous, finding.confusable, color):
case (True, _, True):
prefix = f"{_BOLD_RED}!{_RESET} {_BOLD_RED}[DANGEROUS]{_RESET} "
cp_part = f"{_BOLD_RED}{cp_str}{_RESET}"
elif f.confusable is not None:
prefix = f"{_YELLOW}[CONFUSABLE]{_RESET} "
case (True, _, False):
prefix = "! [DANGEROUS] "
cp_part = cp_str
case (_, str() as lookalike, True):
prefix = (
f"{_YELLOW}?{_RESET} "
f"{_YELLOW}[CONFUSABLE: looks like '{lookalike}']{_RESET} "
)
cp_part = f"{_YELLOW}{cp_str}{_RESET}"
else:
case (_, str() as lookalike, False):
prefix = f"? [CONFUSABLE: looks like '{lookalike}'] "
cp_part = cp_str
case (_, _, True):
prefix = ""
cp_part = f"{_RED}{cp_str}{_RESET}"
cat_part = f"{_DIM}[{f.category}]{_RESET}"
else:
if f.dangerous:
prefix = "[DANGEROUS] "
elif f.confusable is not None:
prefix = f"[CONFUSABLE: looks like '{f.confusable}'] "
else:
case _:
prefix = ""
cp_part = cp_str
cat_part = f"[{f.category}]"
return f"{f.file}:{f.line}:{f.col}: {prefix}{cp_part} {f.name} {cat_part}"
cp_part = cp_str

cat_part = (
f"{_DIM}[{finding.category}]{_RESET}" if color else f"[{finding.category}]"
)

def _context_line(finding: Finding, file_lines: list[str]) -> str:
"""Show the source line with a caret pointing at the character."""
if finding.line < 1 or finding.line > len(file_lines):
return ""
line = file_lines[finding.line - 1]
rendered = _render_invisible(line)
# Compute caret position accounting for invisible char expansion
caret_pos = 0
for i, ch in enumerate(line):
if i == finding.col - 1:
break
cp = ord(ch)
if cp > _MAX_ASCII and not ch.isprintable():
caret_pos += len(f"<U+{cp:04X}>")
else:
caret_pos += 1
return f" {rendered}\n {' ' * caret_pos}^"
return f"{prefix}{cp_part} {finding.name} {cat_part}{count_str}"


def _print_summary(findings: list[Finding]) -> None:
Expand All @@ -109,33 +178,112 @@ def _print_summary(findings: list[Finding]) -> None:
sys.stderr.write(" ".join(parts) + "\n")


def _collect_codepoints(
    file_findings: list[Finding],
) -> list[tuple[Finding, int]]:
    """Collect unique codepoints with counts, preferring the most informative.

    When the same codepoint appears under several classifications (normal,
    confusable, dangerous), the more informative one represents it.
    Findings with line == 0 (read errors) are skipped.

    Returns:
        (finding, count) tuples sorted by severity, then codepoint.
    """
    tally: dict[int, tuple[Finding, int]] = {}
    for finding in file_findings:
        if finding.line == 0:
            continue
        if finding.codepoint not in tally:
            tally[finding.codepoint] = (finding, 1)
            continue
        representative, seen = tally[finding.codepoint]
        # Promote the representative if this occurrence is more severe.
        if _is_more_severe(finding, representative):
            representative = finding
        tally[finding.codepoint] = (representative, seen + 1)

    def severity_key(entry: tuple[Finding, int]) -> tuple[bool, bool, int]:
        # Sort dangerous first, then confusable, then by codepoint value.
        representative, _ = entry
        return (
            not representative.dangerous,
            representative.confusable is None,
            representative.codepoint,
        )

    return sorted(tally.values(), key=severity_key)


def _print_file_findings(
    filepath: str,
    file_findings: list[Finding],
    *,
    color: bool,
) -> None:
    """Print grouped output for a single file.

    Emits a ``file:ranges:`` header, deduplicated context lines with caret
    markers, read-error findings (line == 0), and a per-codepoint listing.
    """
    # Header with compact line ranges, e.g. "file.txt:1,4-80,90:".
    finding_lines = sorted({f.line for f in file_findings if f.line > 0})
    ranges_str = _compact_ranges(finding_lines)
    header = f"{filepath}:{ranges_str}:" if ranges_str else f"{filepath}:"
    sys.stderr.write(header + "\n")

    # Load the file once for context display; on failure context is skipped.
    try:
        file_lines = Path(filepath).read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError):
        file_lines = []

    # Bucket findings by line number.
    by_line: dict[int, list[Finding]] = {}
    for finding in file_findings:
        by_line.setdefault(finding.line, []).append(finding)

    # Emit each context line plus its caret row, skipping exact repeats.
    emitted: set[tuple[str, str]] = set()
    for lineno in sorted(by_line):
        if not 1 <= lineno <= len(file_lines):
            continue
        source_line = file_lines[lineno - 1]
        rendered = _render_invisible(source_line)
        caret = _build_caret_line(source_line, by_line[lineno])

        block = (rendered, caret)
        if block in emitted:
            continue
        emitted.add(block)

        sys.stderr.write(f" {rendered}\n")
        if caret:
            sys.stderr.write(f" {caret}\n")

    # Findings with line == 0 signal read errors; show their message only.
    for finding in file_findings:
        if finding.line == 0:
            sys.stderr.write(f" {finding.name}\n")

    # Unique codepoints with (xN) counts, most severe first.
    for finding, count in _collect_codepoints(file_findings):
        entry = _format_codepoint_entry(finding, count, color=color)
        sys.stderr.write(f" {entry}\n")

    sys.stderr.write("\n")


def print_findings(
    findings: list[Finding],
    *,
    no_color: bool = False,
    quiet: bool = False,
) -> None:
    """Print findings to stderr, grouped by file with compact line ranges.

    Args:
        findings: All findings, in discovery order.
        no_color: Force-disable ANSI colors.
        quiet: Suppress per-file detail; only the summary is printed.
    """
    color = _use_color(no_color=no_color)

    if not quiet:
        # Group by file, preserving first-seen order (dicts keep insertion
        # order), so output follows the scan order.
        by_file: dict[str, list[Finding]] = {}
        for f in findings:
            by_file.setdefault(f.file, []).append(f)

        for filepath, file_findings in by_file.items():
            _print_file_findings(filepath, file_findings, color=color)

    _print_summary(findings)
8 changes: 4 additions & 4 deletions tests/test_confusables.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from check_unicode.checker import Finding, check_confusables, check_file
from check_unicode.confusables import CONFUSABLES
from check_unicode.main import main
from check_unicode.output import _format_finding, print_findings
from check_unicode.output import _format_codepoint_entry, print_findings

FIXTURES = Path(__file__).parent / "fixtures"

Expand Down Expand Up @@ -160,7 +160,7 @@ def test_confusable_format_no_color(self) -> None:
dangerous=False,
confusable="a",
)
result = _format_finding(finding, color=False)
result = _format_codepoint_entry(finding, 1, color=False)
assert "[CONFUSABLE: looks like 'a']" in result

def test_confusable_format_with_color(self) -> None:
Expand All @@ -176,8 +176,8 @@ def test_confusable_format_with_color(self) -> None:
dangerous=False,
confusable="a",
)
result = _format_finding(finding, color=True)
assert "[CONFUSABLE]" in result
result = _format_codepoint_entry(finding, 1, color=True)
assert "[CONFUSABLE: looks like 'a']" in result
assert "\033[33m" in result # yellow

def test_confusable_summary_count(self) -> None:
Expand Down
Loading