From e3459920c90e69b2c41b158dc3bfed7e6ce6cbc2 Mon Sep 17 00:00:00 2001 From: mit-d Date: Sun, 29 Mar 2026 13:19:51 -0600 Subject: [PATCH 1/2] refactor: improve test coverage with fixtures, parametrize, and edge cases Grow test suite from 333 to 436 tests while improving structure: - checker: add text= param tests, allow priority order, confusable edge cases, BOM boundaries, Finding.fixable property tests - fixer: consolidate 4 replacement classes into parametrized table, add fix_text/strip_file direct tests, strip allow-list edge cases - cli: parametrize exit codes, convert pipe tests to stdin_from fixture, add edge cases (empty stdin, no newline, halt on line 1), add _preprocess_argv unit tests - output: add _make_finding helper, parametrize caret marker types, add empty/singular summary edge cases --- pyproject.toml | 1 + tests/test_checker.py | 321 ++++++++++++++++++++++++++++++++++++++++-- tests/test_cli.py | 201 ++++++++++++++++++++------ tests/test_fixer.py | 245 ++++++++++++++++++++++++-------- tests/test_output.py | 276 +++++++++++++++++++----------------- 5 files changed, 799 insertions(+), 245 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 32dd134..da70cce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ lint.ignore = [ "ISC001", # single-line implicit string concat -- conflicts with formatter ] lint.per-file-ignores."tests/**" = [ + "PLR0913", # test factory helpers mirror dataclass fields "PLR2004", # magic values in test assertions are readable "S101", # assert is fine in tests ] diff --git a/tests/test_checker.py b/tests/test_checker.py index d3fc582..f8f07e9 100644 --- a/tests/test_checker.py +++ b/tests/test_checker.py @@ -4,7 +4,10 @@ from pathlib import Path -from check_unicode.checker import AllowConfig, check_file +import pytest + +from check_unicode.categories import REPLACEMENT_TABLE +from check_unicode.checker import AllowConfig, Finding, check_confusables, check_file FIXTURES = Path(__file__).parent / "fixtures" @@ -17,6 +20,28 @@ def test_clean_ascii_returns_empty(self) -> None: findings = check_file(FIXTURES / "clean_ascii.txt") assert findings == [] + def test_empty_string(self) -> None: + """Empty text produces no findings.""" + findings = check_file("virtual.txt", text="") + assert findings == [] + + def test_only_newlines(self) -> None: + """Text with only newlines produces no findings.""" + findings = check_file("virtual.txt", text="\n\n\n") + assert findings == [] + + def test_tabs_and_spaces(self) -> None: + """Text with tabs and spaces produces no findings.""" + findings = check_file("virtual.txt", text="\t \t hello\tworld \n") + assert findings == [] + + def test_empty_file(self, tmp_path: Path) -> None: + """An empty file on disk produces no findings.""" + f = tmp_path / "empty.txt" + f.write_text("", encoding="utf-8") + findings = check_file(f) + assert findings == [] + class TestSmartQuotes: """Tests for smart/curly quote detection.""" @@ -48,6 +73,58 @@ def test_smart_quotes_not_dangerous(self) -> None: assert not any(f.dangerous for f in findings) +class TestTextParameter: + """Tests for check_file with text= parameter (no disk I/O).""" + + def test_empty_text(self) -> None: + """Empty text produces no findings.""" + findings = check_file("virtual.txt", text="") + assert findings == [] + + def test_clean_text(self) -> None: + """Clean ASCII text produces no findings.""" + findings = check_file("virtual.txt", text="Hello, world!\n") + assert findings == [] + + def test_multiple_lines(self) -> None: + """Findings span multiple lines with correct line numbers.""" + text = "line one \u201c\nline two \u201d\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 2 + assert findings[0].line == 1 + assert findings[1].line == 2 + + def test_multiple_findings_same_line(self) -> None: + """Multiple non-ASCII chars on the same line are all reported.""" + text = "\u201chello\u201d \u2013 world\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 3 + assert all(f.line == 1 for f in findings) + cols = [f.col for f in findings] + assert cols == sorted(cols) + + def test_respects_allow_config(self) -> None: + """text= mode respects allow config.""" + text = "\u201chello\u201d\n" + allow = AllowConfig(codepoints=frozenset([0x201C, 0x201D])) + findings = check_file("virtual.txt", allow, text=text) + assert findings == [] + + def test_file_field_matches_path_argument(self) -> None: + """Finding.file reflects the path argument, not a real file.""" + text = "caf\u00e9\n" + findings = check_file("my/virtual/path.txt", text=text) + assert len(findings) == 1 + assert findings[0].file == "my/virtual/path.txt" + + def test_col_is_one_indexed(self) -> None: + """Column numbers are 1-indexed.""" + text = "abc\u00e9\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 1 + assert findings[0].col == 4 + + class TestDangerousChars: """Tests for dangerous invisible character detection.""" @@ -57,23 +134,24 @@ def test_bidi_always_flagged(self) -> None: dangerous = [f for f in findings if f.dangerous] assert len(dangerous) > 0 - def test_bidi_not_suppressed_by_broad_range(self) -> None: - """Bidi characters are not suppressed by broad allow ranges.""" - allow = AllowConfig(ranges=((0x0000, 0xFFFF),)) - findings = check_file(FIXTURES / "bidi_attack.txt", allow) - dangerous = [f for f in findings if f.dangerous] - assert len(dangerous) > 0 - - def test_bidi_not_suppressed_by_category(self) -> None: - """Bidi characters are not suppressed by category allow-lists.""" - allow = AllowConfig(categories=frozenset(["Cf"])) + @pytest.mark.parametrize( + "allow", + [ + AllowConfig(ranges=((0x0000, 0xFFFF),)), + AllowConfig(categories=frozenset(["Cf"])), + AllowConfig(printable=True), + AllowConfig(scripts=frozenset(["Latin", "Common"])), + ], + ids=["range", "category", "printable", "script"], + ) + def test_dangerous_not_suppressed(self, allow: AllowConfig) -> None: + """Dangerous characters are not suppressed by non-codepoint allows.""" findings = check_file(FIXTURES / "bidi_attack.txt", allow) dangerous = [f for f in findings if f.dangerous] assert len(dangerous) > 0 def test_bidi_suppressed_by_explicit_codepoint(self) -> None: """Bidi characters are suppressed only by explicit codepoint allow.""" - # Get the dangerous codepoints first findings = check_file(FIXTURES / "bidi_attack.txt") dangerous_cps = frozenset(f.codepoint for f in findings if f.dangerous) allow = AllowConfig(codepoints=dangerous_cps) @@ -93,6 +171,21 @@ def test_zero_width_not_fixable(self) -> None: dangerous = [f for f in findings if f.dangerous] assert not any(f.fixable for f in dangerous) + @pytest.mark.parametrize( + "allow", + [ + AllowConfig(ranges=((0x0000, 0xFFFF),)), + AllowConfig(categories=frozenset(["Cf"])), + AllowConfig(printable=True), + ], + ids=["range", "category", "printable"], + ) + def test_zero_width_not_suppressed(self, allow: AllowConfig) -> None: + """Zero-width chars are not suppressed by non-codepoint allows.""" + findings = check_file(FIXTURES / "zero_width.txt", allow) + dangerous = [f for f in findings if f.dangerous] + assert len(dangerous) > 0 + class TestAllowList: """Tests for allow-list filtering of findings.""" @@ -111,7 +204,6 @@ def test_allow_range(self) -> None: def test_allow_category(self) -> None: """Codepoints in an allowed Unicode category are excluded.""" - # Sc = Symbol, currency (covers euro sign U+20AC) allow = AllowConfig(categories=frozenset(["Sc"])) findings = check_file(FIXTURES / "mixed_allowed.txt", allow) assert not any(f.codepoint == 0x20AC for f in findings) @@ -152,7 +244,6 @@ def test_allow_latin_suppresses_accented(self) -> None: """Allowing Latin script suppresses accented Latin characters.""" allow = AllowConfig(scripts=frozenset(["Latin"])) findings = check_file(FIXTURES / "printable_i18n.txt", allow) - # Accented chars suppressed, but CJK/Arabic still flagged assert not any(f.name.startswith("LATIN") for f in findings) assert len(findings) > 0 @@ -164,6 +255,115 @@ def test_allow_script_still_flags_dangerous(self) -> None: assert len(dangerous) > 0 +class TestAllowPriority: + """Tests for _is_allowed evaluation order and combined allow types.""" + + def test_explicit_codepoint_overrides_dangerous(self) -> None: + """Explicit codepoint allow overrides DANGEROUS_INVISIBLE block.""" + text = "hello\u200bworld\n" + allow = AllowConfig(codepoints=frozenset([0x200B])) + findings = check_file("virtual.txt", allow, text=text) + assert not any(f.codepoint == 0x200B for f in findings) + + def test_printable_checked_before_script(self) -> None: + """Printable allows a char even without script match.""" + text = "caf\u00e9\n" + allow_printable = AllowConfig(printable=True) + findings = check_file("virtual.txt", allow_printable, text=text) + assert findings == [] + + def test_script_checked_before_range(self) -> None: + """Script allows a char even without range match.""" + text = "caf\u00e9\n" + allow_script = AllowConfig(scripts=frozenset(["Latin"])) + findings = check_file("virtual.txt", allow_script, text=text) + assert findings == [] + + def test_range_checked_before_category(self) -> None: + """Range allows a char even without category match.""" + text = "\u00a9 copyright\n" + allow_range = AllowConfig(ranges=((0x00A0, 0x00FF),)) + findings = check_file("virtual.txt", allow_range, text=text) + assert not any(f.codepoint == 0x00A9 for f in findings) + + def test_category_is_last_resort(self) -> None: + """Category alone can allow a char.""" + text = "\u20ac100\n" + allow_cat = AllowConfig(categories=frozenset(["Sc"])) + findings = check_file("virtual.txt", allow_cat, text=text) + assert findings == [] + + def test_printable_plus_category_covers_all(self) -> None: + """Combining printable + category covers all non-dangerous chars.""" + text = "caf\u00e9 \u20ac100 \u00a9 \u201chello\u201d\n" + allow = AllowConfig(printable=True, categories=frozenset(["Sc"])) + findings = check_file("virtual.txt", allow, text=text) + assert findings == [] + + def test_dangerous_blocked_even_with_all_other_allows(self) -> None: + """Dangerous chars blocked even with printable + script + range + category.""" + text = "hello\u202eworld\n" + allow = AllowConfig( + printable=True, + scripts=frozenset(["Latin", "Common"]), + ranges=((0x0000, 0xFFFF),), + categories=frozenset(["Cf"]), + ) + findings = check_file("virtual.txt", allow, text=text) + dangerous = [f for f in findings if f.dangerous] + assert len(dangerous) > 0 + + +class TestConfusableEdgeCases: + """Tests for check_confusables edge cases.""" + + def test_empty_text(self) -> None: + """Empty text produces no confusable findings.""" + findings = check_confusables("virtual.txt", text="") + assert findings == [] + + def test_single_script_no_findings(self) -> None: + """A line with only one script produces no confusable findings.""" + findings = check_confusables("virtual.txt", text="hello world\n") + assert findings == [] + + def test_latin_wins_tie(self) -> None: + """When Latin and another script tie, Latin is dominant.""" + # 3 Latin + 3 Cyrillic confusables (U+0430, U+0441, U+043E) + text = "abc\u0430\u0441\u043e\n" + findings = check_confusables("virtual.txt", text=text) + assert len(findings) == 3 + assert all(f.confusable is not None for f in findings) + confusable_cps = {f.codepoint for f in findings} + assert confusable_cps == {0x0430, 0x0441, 0x043E} + + def test_minority_not_in_table_no_finding(self) -> None: + """Minority-script char not in CONFUSABLES table is not flagged.""" + # Mix Latin with a Cyrillic char NOT in CONFUSABLES (U+0436) + text = "abcdef\u0436\n" + findings = check_confusables("virtual.txt", text=text) + assert findings == [] + + def test_confusable_finding_has_replacement(self) -> None: + """Confusable findings include the Latin lookalike.""" + text = "abc\u0430\n" + findings = check_confusables("virtual.txt", text=text) + assert len(findings) == 1 + assert findings[0].confusable == "a" + + def test_pure_cyrillic_no_findings(self) -> None: + """Pure Cyrillic text (single script) produces no findings.""" + findings = check_confusables(FIXTURES / "pure_cyrillic.txt") + assert findings == [] + + def test_confusable_line_numbers(self) -> None: + """Confusable findings report correct line numbers.""" + text = "hello world\nabc\u0430def\n" + findings = check_confusables("virtual.txt", text=text) + assert len(findings) == 1 + assert findings[0].line == 2 + + class TestBOM: """Tests for byte-order mark handling.""" @@ -181,6 +381,27 @@ def test_bom_midfile_flagged(self, tmp_path: Path) -> None: findings = check_file(f) assert any(f_.codepoint == 0xFEFF for f_ in findings) + def test_bom_line2_col1_flagged(self) -> None: + """BOM at line 2 col 1 is flagged (not at file start).""" + text = "hello\n\ufeffworld\n" + findings = check_file("virtual.txt", text=text) + bom_findings = [f for f in findings if f.codepoint == 0xFEFF] + assert len(bom_findings) == 1 + assert bom_findings[0].line == 2 + assert bom_findings[0].col == 1 + + def test_bom_only_file(self) -> None: + """A file containing only a BOM produces no findings (BOM at start).""" + text = "\ufeff" + findings = check_file("virtual.txt", text=text) + assert not any(f.codepoint == 0xFEFF for f in findings) + + def test_bom_at_start_via_text(self) -> None: + """BOM at start of text= input is also ignored.""" + text = "\ufeffhello world\n" + findings = check_file("virtual.txt", text=text) + assert not any(f.codepoint == 0xFEFF for f in findings) + class TestInvalidUTF8: """Tests for invalid UTF-8 and binary file handling.""" @@ -192,3 +413,75 @@ def test_binary_file_handled_gracefully(self, tmp_path: Path) -> None: findings = check_file(f) assert len(findings) == 1 assert "Could not read file" in findings[0].name + + +class TestFindingProperties: + """Tests for Finding.fixable and other computed properties.""" + + @pytest.mark.parametrize( + ("codepoint", "char"), + [ + (0x201C, "\u201c"), + (0x201D, "\u201d"), + (0x2018, "\u2018"), + (0x2013, "\u2013"), + (0x00A0, "\u00a0"), + ], + ids=["left-dquote", "right-dquote", "left-squote", "en-dash", "nbsp"], + ) + def test_replacement_table_chars_are_fixable( + self, codepoint: int, char: str + ) -> None: + """Characters in REPLACEMENT_TABLE are marked fixable.""" + text = f"abc{char}def\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 1 + assert findings[0].fixable + assert findings[0].codepoint == codepoint + + def test_accented_char_not_fixable(self) -> None: + """Accented characters not in REPLACEMENT_TABLE are not fixable.""" + text = "caf\u00e9\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 1 + assert not findings[0].fixable + + def test_dangerous_never_fixable_even_if_in_replacement_table(self) -> None: + """Dangerous findings are never fixable, even for REPLACEMENT_TABLE chars.""" + for cp in (0x201C, 0x201D, 0x00A0): + assert cp in REPLACEMENT_TABLE + f = Finding( + file="virtual.txt", + line=1, + col=1, + char=chr(cp), + codepoint=cp, + name="TEST", + category="Cf", + dangerous=True, + ) + assert not f.fixable + + def test_dangerous_zero_width_not_fixable(self) -> None: + """Zero-width dangerous characters are not fixable.""" + text = "hello\u200bworld\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 1 + assert findings[0].dangerous + assert not findings[0].fixable + + def test_finding_fields_populated(self) -> None: + """All Finding fields are correctly populated.""" + text = "caf\u00e9\n" + findings = check_file("virtual.txt", text=text) + assert len(findings) == 1 + f = findings[0] + assert f.file == "virtual.txt" + assert f.line == 1 + assert f.col == 4 + assert f.char == "\u00e9" + assert f.codepoint == 0x00E9 + assert "LATIN" in f.name + assert f.category == "Ll" + assert not f.dangerous + assert f.confusable is None diff --git a/tests/test_cli.py b/tests/test_cli.py index ac186e9..ff48206 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -19,6 +19,7 @@ _build_parser, _file_matches_override, _is_excluded, + _preprocess_argv, _resolve_allow_for_file, _resolve_file_settings, main, @@ -75,21 +76,19 @@ def _set(text: str) -> None: class TestExitCodes: """Tests for CLI exit code behavior.""" - def test_clean_file_exits_0(self) -> None: - """Clean files produce exit code 0.""" - assert main([str(FIXTURES / "clean_ascii.txt")]) == 0 - - def test_dirty_file_exits_1(self) -> None: - """Files with non-ASCII characters produce exit code 1.""" - assert main([str(FIXTURES / "smart_quotes.txt")]) == 1 - - def test_warning_severity_exits_0(self) -> None: - """Warning severity mode exits 0 even with findings.""" - assert main(["--severity", "warning", str(FIXTURES / "smart_quotes.txt")]) == 0 - - def test_dangerous_file_exits_1(self) -> None: - """Files with dangerous characters produce exit code 1.""" - assert main([str(FIXTURES / "bidi_attack.txt")]) == 1 + @pytest.mark.parametrize( + ("args", "expected_code"), + [ + ([str(FIXTURES / "clean_ascii.txt")], 0), + ([str(FIXTURES / "smart_quotes.txt")], 1), + (["--severity", "warning", str(FIXTURES / "smart_quotes.txt")], 0), + ([str(FIXTURES / "bidi_attack.txt")], 1), + ], + ids=["clean-exits-0", "dirty-exits-1", "warning-exits-0", "dangerous-exits-1"], + ) + def test_exit_code(self, args: list[str], expected_code: int) -> None: + """Exit codes match expected values for different inputs.""" + assert main(args) == expected_code def test_no_files_exits_error(self) -> None: """Providing no files causes argparse to exit with code 2.""" @@ -1022,25 +1021,23 @@ class TestPipeMode: def test_dash_clean_input_passes_through( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Clean ASCII input is passed through to stdout unchanged, exit 0.""" - monkeypatch.setattr("sys.stdin", io.TextIOWrapper(io.BytesIO(b"hello world\n"))) + stdin_from("hello world\n") assert main(["-"]) == 0 captured = capsys.readouterr() assert captured.out == "hello world\n" def test_dash_dirty_input_passes_through_with_findings( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Non-ASCII input passes through to stdout, findings on stderr.""" text = "He said \u201chello\u201d\n" - monkeypatch.setattr( - "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) - ) + stdin_from(text) assert main(["-"]) == 1 captured = capsys.readouterr() assert captured.out == text @@ -1048,39 +1045,34 @@ def test_dash_dirty_input_passes_through_with_findings( def test_dash_fix_mode_writes_fixed_to_stdout( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Fix mode replaces smart quotes and writes fixed text to stdout.""" text = "He said \u201chello\u201d\n" - monkeypatch.setattr( - "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) - ) + stdin_from(text) assert main(["--fix", "-"]) == 1 captured = capsys.readouterr() assert captured.out == 'He said "hello"\n' def test_dash_fix_mode_clean_input( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Fix mode with clean input passes through unchanged, exit 0.""" - monkeypatch.setattr("sys.stdin", io.TextIOWrapper(io.BytesIO(b"clean\n"))) + stdin_from("clean\n") assert main(["--fix", "-"]) == 0 captured = capsys.readouterr() assert captured.out == "clean\n" def test_dash_fix_mode_dangerous_still_reported( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Fix mode preserves dangerous chars in output and stderr.""" - text = "x\u202ey\n" - monkeypatch.setattr( - "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) - ) + stdin_from("x\u202ey\n") result = main(["--fix", "-"]) assert result == 1 captured = capsys.readouterr() @@ -1090,28 +1082,23 @@ def test_dash_fix_mode_dangerous_still_reported( def test_dash_with_allow_flags( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Allow flags work with pipe mode.""" text = "72\u00b0F\n" - monkeypatch.setattr( - "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) - ) + stdin_from(text) assert main(["--allow-codepoint", "U+00B0", "-"]) == 0 captured = capsys.readouterr() assert captured.out == text def test_dash_filename_in_findings( self, - monkeypatch: pytest.MonkeyPatch, + stdin_from: Callable[[str], None], capsys: pytest.CaptureFixture[str], ) -> None: """Findings use '' as the filename.""" - text = "\u201chello\u201d\n" - monkeypatch.setattr( - "sys.stdin", io.TextIOWrapper(io.BytesIO(text.encode("utf-8"))) - ) + stdin_from("\u201chello\u201d\n") main(["-"]) captured = capsys.readouterr() assert "" in captured.err @@ -1429,6 +1416,138 @@ def test_pipe_halt_respects_allow( assert result == 0 +class TestPipeModeEdgeCases: + """Edge case tests for pipe mode.""" + + def test_pipe_empty_stdin( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Empty input should exit 0 and produce no stdout/stderr.""" + stdin_from("") + assert main(["-"]) == 0 + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_pipe_no_trailing_newline( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Input without trailing newline passes through without adding one.""" + stdin_from("hello") + assert main(["-"]) == 0 + captured = capsys.readouterr() + assert captured.out == "hello" + + def test_pipe_halt_on_first_line( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--halt with dangerous char on line 1 should NOT write line 1 to stdout.""" + stdin_from("x\u202ey\nline2\n") + assert main(["--halt", "-"]) == 1 + captured = capsys.readouterr() + assert "\u202e" not in captured.out + assert "line2" not in captured.out + + def test_pipe_multiline_summary_counts( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Multi-line input with findings on multiple lines; verify summary count.""" + stdin_from("\u201ca\u201d\n\u201cb\u201d\n") + main(["-"]) + err = capsys.readouterr().err + assert "Found" in err + # 4 smart quote chars total (2 per line) + assert "4" in err + + def test_pipe_preserves_blank_lines( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Blank lines in input are preserved in output.""" + stdin_from("a\n\nb\n") + assert main(["-"]) == 0 + assert capsys.readouterr().out == "a\n\nb\n" + + def test_pipe_strip_equals_syntax( + self, + stdin_from: Callable[[str], None], + capsys: pytest.CaptureFixture[str], + ) -> None: + """--strip=all works (equals syntax doesn't break preprocessing).""" + stdin_from("caf\u00e9\n") + assert main(["--strip=all", "-"]) == 1 + assert capsys.readouterr().out == "caf\n" + + def test_pipe_halt_clean_input_exits_0( + self, + stdin_from: Callable[[str], None], + ) -> None: + """--halt with clean input exits 0.""" + stdin_from("hello world\n") + assert main(["--halt", "-"]) == 0 + + +class TestPreprocessArgv: + """Tests for _preprocess_argv optional-level flag rewriting.""" + + @pytest.mark.parametrize( + ("argv", "expected"), + [ + (["--strip", "test.txt"], ["--strip=all", "test.txt"]), + (["--strip", "dangerous", "test.txt"], ["--strip=dangerous", "test.txt"]), + (["--strip", "all", "test.txt"], ["--strip=all", "test.txt"]), + (["--halt", "test.txt"], ["--halt=dangerous", "test.txt"]), + (["--halt", "all", "test.txt"], ["--halt=all", "test.txt"]), + (["--halt", "dangerous", "test.txt"], ["--halt=dangerous", "test.txt"]), + ( + ["--strip", "--halt", "test.txt"], + ["--strip=all", "--halt=dangerous", "test.txt"], + ), + ( + ["--fix", "--strip", "test.txt"], + ["--fix", "--strip=all", "test.txt"], + ), + ( + ["--strip", "all", "--halt", "dangerous", "test.txt"], + ["--strip=all", "--halt=dangerous", "test.txt"], + ), + (["test.txt"], ["test.txt"]), + ([], []), + (["--fix", "test.txt"], ["--fix", "test.txt"]), + (["-"], ["-"]), + (["--strip", "-"], ["--strip=all", "-"]), + ], + ids=[ + "strip-no-level-defaults-all", + "strip-dangerous", + "strip-all-explicit", + "halt-no-level-defaults-dangerous", + "halt-all", + "halt-dangerous-explicit", + "strip-then-halt-no-levels", + "fix-then-strip", + "strip-all-halt-dangerous", + "no-flags", + "empty-args", + "unrelated-flag", + "dash-alone", + "strip-with-dash", + ], + ) + def test_preprocess_argv(self, argv: list[str], expected: list[str]) -> None: + """_preprocess_argv correctly rewrites optional-level flags.""" + assert _preprocess_argv(argv) == expected + + class TestFlagInteractionsWithConfig: """Tests for action flag interactions with config/overrides.""" diff --git a/tests/test_fixer.py b/tests/test_fixer.py index 0e25047..f03d9a8 100644 --- a/tests/test_fixer.py +++ b/tests/test_fixer.py @@ -9,7 +9,7 @@ import pytest from check_unicode.checker import AllowConfig -from check_unicode.fixer import fix_file, strip_text +from check_unicode.fixer import fix_file, fix_text, strip_file, strip_text if TYPE_CHECKING: from pathlib import Path @@ -59,77 +59,156 @@ def test_strip_dangerous_respects_allowed(self) -> None: result = strip_text(text, level="dangerous", allow=allow) assert result == "x\u202ey\n" + def test_multiline_strips_across_lines(self) -> None: + """Non-ASCII chars on different lines are all stripped.""" + text = "caf\u00e9\nhello\u2026\nworld\u2014end\n" + result = strip_text(text, level="all") + assert result == "caf\nhello\nworldend\n" + + def test_multiple_dangerous_chars_stripped(self) -> None: + """Multiple different dangerous chars are all stripped in dangerous mode.""" + # ZWSP + bidi override + zero-width non-joiner + text = "a\u200bb\u202ec\u200cd\n" + result = strip_text(text, level="dangerous") + assert result == "abcd\n" + + def test_all_level_strips_dangerous_chars(self) -> None: + """Level 'all' strips dangerous chars as well as non-dangerous non-ASCII.""" + text = "x\u200by\u202ez\n" + result = strip_text(text, level="all") + assert result == "xyz\n" + + def test_allow_printable_preserves_printable(self) -> None: + """Allow printable=True keeps printable non-ASCII but still strips dangerous.""" + text = "caf\u00e9 x\u200by\n" + allow = AllowConfig(printable=True) + result = strip_text(text, level="all", allow=allow) + # e-acute is printable -> kept; ZWSP is dangerous -> stripped + assert result == "caf\u00e9 xy\n" -class TestSmartQuoteReplacement: - """Tests for smart quote to ASCII replacement.""" - - def test_replaces_smart_double_quotes(self, tmp_path: Path) -> None: - """Smart double quotes are replaced with straight double quotes.""" - f = tmp_path / "quotes.txt" - f.write_text("He said \u201chello\u201d\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == 'He said "hello"\n' - - def test_replaces_smart_single_quotes(self, tmp_path: Path) -> None: - """Smart single quotes are replaced with straight apostrophes.""" - f = tmp_path / "quotes.txt" - f.write_text("It\u2019s fine\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "It's fine\n" - + def test_allow_range_preserves_chars_in_range(self) -> None: + """Chars within an allowed range are not stripped.""" + # Allow Latin-1 Supplement range (U+00C0 to U+00FF) + text = "caf\u00e9 na\u00efve\n" + allow = AllowConfig(ranges=((0x00C0, 0x00FF),)) + result = strip_text(text, level="all", allow=allow) + assert result == "caf\u00e9 na\u00efve\n" -class TestDashReplacement: - """Tests for dash and minus sign replacement.""" + def test_allow_script_preserves_chars_in_script(self) -> None: + """Chars belonging to an allowed script are not stripped.""" + # Greek capital letter sigma + text = "sum=\u03a3\n" + allow = AllowConfig(scripts=frozenset({"Greek"})) + result = strip_text(text, level="all", allow=allow) + assert result == "sum=\u03a3\n" - def test_replaces_em_dash(self, tmp_path: Path) -> None: - """Em dashes are replaced with double hyphens.""" - f = tmp_path / "dashes.txt" - f.write_text("word\u2014word\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "word--word\n" - def test_replaces_en_dash(self, tmp_path: Path) -> None: - """En dashes are replaced with double hyphens.""" - f = tmp_path / "dashes.txt" - f.write_text("1\u20132\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "1--2\n" +class TestFixText: + """Tests for fix_text() pure string replacement.""" - def test_replaces_minus_sign(self, tmp_path: Path) -> None: - """Unicode minus signs are replaced with ASCII hyphens.""" - f = tmp_path / "minus.txt" - f.write_text("x \u2212 y\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "x - y\n" + @pytest.mark.parametrize( + ("input_text", "expected"), + [ + ("\u201chello\u201d", '"hello"'), + ("It\u2019s", "It's"), + ("\u2018word\u2019", "'word'"), + ("\u201aquote\u201b", "'quote'"), + ("\u201equote\u201f", '"quote"'), + ("\u00abguillemet\u00bb", '"guillemet"'), + ("\u2039angle\u203a", "'angle'"), + ("word\u2014word", "word--word"), + ("1\u20132", "1--2"), + ("x \u2212 y", "x - y"), + ("hello\u00a0world", "hello world"), + ("a\u2003b", "a b"), + ("a\u2009b", "a b"), + ("a\u200ab", "a b"), + ("a\u3000b", "a b"), + ("wait\u2026", "wait..."), + ], + ids=[ + "smart-double-quotes", + "right-single-quote", + "left-right-single-quotes", + "low9-highrev9-single-quotes", + "low9-highrev9-double-quotes", + "guillemets", + "angle-quotes", + "em-dash", + "en-dash", + "minus-sign", + "nbsp", + "em-space", + "thin-space", + "hair-space", + "ideographic-space", + "ellipsis", + ], + ) + def test_fix_replaces_character(self, input_text: str, expected: str) -> None: + """fix_text replaces known non-ASCII chars with ASCII equivalents.""" + assert fix_text(input_text) == expected + def test_clean_text_unchanged(self) -> None: + """Plain ASCII text passes through unchanged.""" + text = "hello world 123 !@#$%\n" + assert fix_text(text) == text -class TestSpaceReplacement: - """Tests for non-breaking and special space replacement.""" + def test_dangerous_chars_unchanged(self) -> None: + """Dangerous invisible chars are never replaced by fix_text.""" + text = "a\u200bb\u202ec\n" + assert fix_text(text) == text - def test_replaces_nbsp(self, tmp_path: Path) -> None: - """Non-breaking spaces are replaced with regular spaces.""" - f = tmp_path / "spaces.txt" - f.write_text("hello\u00a0world\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "hello world\n" + def test_mixed_fixable_nonfixable_dangerous(self) -> None: + """Only fixable chars are replaced; non-fixable and dangerous are kept.""" + # e-acute (non-fixable), smart quote (fixable), ZWSP (dangerous) + text = "caf\u00e9 \u201chi\u201d a\u200bb\n" + result = fix_text(text) + assert result == 'caf\u00e9 "hi" a\u200bb\n' - def test_replaces_em_space(self, tmp_path: Path) -> None: - """Em spaces are replaced with regular spaces.""" - f = tmp_path / "spaces.txt" - f.write_text("a\u2003b\n", encoding="utf-8") - assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "a b\n" + def test_multiline_text(self) -> None: + """fix_text handles multi-line strings correctly.""" + text = "line1 \u201chi\u201d\nline2 word\u2014word\nline3 wait\u2026\n" + expected = 'line1 "hi"\nline2 word--word\nline3 wait...\n' + assert fix_text(text) == expected -class TestEllipsis: - """Tests for ellipsis character replacement.""" +class TestFixFileReplacements: + """Tests for fix_file() character replacements via atomic write.""" - def test_replaces_ellipsis(self, tmp_path: Path) -> None: - """Unicode ellipsis is replaced with three dots.""" - f = tmp_path / "ellipsis.txt" - f.write_text("wait\u2026\n", encoding="utf-8") + @pytest.mark.parametrize( + ("input_text", "expected"), + [ + ("\u201chello\u201d", '"hello"'), + ("It\u2019s", "It's"), + ("\u2018word\u2019", "'word'"), + ("word\u2014word", "word--word"), + ("1\u20132", "1--2"), + ("x \u2212 y", "x - y"), + ("hello\u00a0world", "hello world"), + ("a\u2003b", "a b"), + ("wait\u2026", "wait..."), + ], + ids=[ + "smart-double-quotes", + "right-single-quote", + "left-right-single-quotes", + "em-dash", + "en-dash", + "minus-sign", + "nbsp", + "em-space", + "ellipsis", + ], + ) + def test_fix_replaces_character( + self, tmp_path: Path, input_text: str, expected: str + ) -> None: + """fix_file replaces known non-ASCII chars and returns True.""" + f = tmp_path / "test.txt" + f.write_text(input_text + "\n", encoding="utf-8") assert fix_file(f) is True - assert f.read_text(encoding="utf-8") == "wait...\n" + assert f.read_text(encoding="utf-8") == expected + "\n" class TestDangerousCharsNotFixed: @@ -161,12 +240,58 @@ def test_clean_file_unchanged(self, tmp_path: Path) -> None: def test_no_replacement_chars_unchanged(self, tmp_path: Path) -> None: """Characters without replacement mappings are left untouched.""" - # Characters with no entry in REPLACEMENT_TABLE f = tmp_path / "unknown.txt" f.write_text("caf\u00e9\n", encoding="utf-8") # e-acute assert fix_file(f) is False +class TestStripFile: + """Tests for strip_file() with atomic writes.""" + + def test_strip_file_removes_non_ascii(self, tmp_path: Path) -> None: + """strip_file removes non-ASCII characters and returns True.""" + f = tmp_path / "strip.txt" + f.write_text("caf\u00e9\n", encoding="utf-8") + assert strip_file(f) is True + assert f.read_text(encoding="utf-8") == "caf\n" + + def test_strip_file_clean_returns_false(self, tmp_path: Path) -> None: + """strip_file on a clean ASCII file returns False.""" + f = tmp_path / "clean.txt" + f.write_text("hello world\n", encoding="utf-8") + assert strip_file(f) is False + + def test_strip_file_preserves_permissions(self, tmp_path: Path) -> None: + """File permissions are preserved after stripping.""" + f = tmp_path / "perms.txt" + f.write_text("caf\u00e9\n", encoding="utf-8") + f.chmod(0o755) + strip_file(f) + mode = stat.S_IMODE(f.stat().st_mode) + assert mode == 0o755 + + def test_strip_file_with_allow_config(self, tmp_path: Path) -> None: + """strip_file respects AllowConfig, keeping allowed codepoints.""" + f = tmp_path / "allow.txt" + f.write_text("caf\u00e9 \u201chi\u201d\n", encoding="utf-8") + allow = AllowConfig(codepoints=frozenset({0x00E9})) + assert strip_file(f, allow=allow) is True + assert f.read_text(encoding="utf-8") == "caf\u00e9 hi\n" + + def test_strip_file_dangerous_level(self, tmp_path: Path) -> None: + """strip_file with level='dangerous' only removes dangerous chars.""" + f = tmp_path / "danger.txt" + f.write_text("caf\u00e9 a\u200bb\n", encoding="utf-8") + assert strip_file(f, level="dangerous") is True + assert f.read_text(encoding="utf-8") == "caf\u00e9 ab\n" + + def test_strip_file_binary_returns_false(self, tmp_path: Path) -> None: + """Binary files that fail UTF-8 decode return False.""" + f = tmp_path / "binary.bin" + f.write_bytes(b"\x80\x81\xff") + assert strip_file(f) is False + + class TestAtomicWrite: """Tests for atomic file writing behavior.""" diff --git a/tests/test_output.py b/tests/test_output.py index e5db5c7..7f36709 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -3,11 +3,9 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING from unittest.mock import patch -if TYPE_CHECKING: - import pytest +import pytest from check_unicode.checker import Finding, check_file from check_unicode.output import ( @@ -23,6 +21,31 @@ FIXTURES = Path(__file__).parent / "fixtures" +def _make_finding( + *, + col: int = 1, + char: str = "\u201c", + codepoint: int = 0x201C, + name: str = "LEFT DOUBLE QUOTATION MARK", + category: str = "Ps", + dangerous: bool = False, + confusable: str | None = None, + file: str = "t.txt", + line: int = 1, +) -> Finding: + return Finding( + file=file, + line=line, + col=col, + char=char, + codepoint=codepoint, + name=name, + category=category, + dangerous=dangerous, + confusable=confusable, + ) + + class TestUseColor: """Tests for color detection logic.""" @@ -63,90 +86,84 @@ def test_two_separate(self) -> None: """Two non-consecutive lines shown comma-separated.""" assert _compact_ranges([3, 7]) == "3,7" + def test_two_consecutive(self) -> None: + """Two consecutive lines collapsed into a range.""" + assert _compact_ranges([5, 6]) == "5-6" + + def test_large_gap(self) -> None: + """Large gap between lines shown comma-separated.""" + assert _compact_ranges([1, 1000]) == "1,1000" + + def test_single_element_list(self) -> None: + """Single element list returns that element as string.""" + assert _compact_ranges([42]) == "42" + class TestBuildCaretLine: """Tests for caret line construction.""" - def test_single_finding(self) -> None: - """Single finding produces one caret at correct position.""" - line = "He said \u201chello\u201d" - findings = [ - Finding( - file="t.txt", - line=1, - col=9, - char="\u201c", - codepoint=0x201C, - name="LEFT DOUBLE QUOTATION MARK", - category="Ps", - dangerous=False, + @pytest.mark.parametrize( + ("line_text", "finding", "expected_marker", "absent_marker"), + [ + ( + "He said \u201chello\u201d", + _make_finding(col=9), + "^", + None, ), - ] - caret = _build_caret_line(line, findings) - assert caret == " ^" - - def test_dangerous_uses_exclamation(self) -> None: - """Dangerous findings marked with ! instead of ^.""" - line = "x\u202ey" - findings = [ - Finding( - file="t.txt", - line=1, - col=2, - char="\u202e", - codepoint=0x202E, - name="RIGHT-TO-LEFT OVERRIDE", - category="Cf", - dangerous=True, + ( + "x\u202ey", + _make_finding( + col=2, + char="\u202e", + codepoint=0x202E, + name="RIGHT-TO-LEFT OVERRIDE", + category="Cf", + dangerous=True, + ), + "!", + "^", ), - ] - caret = _build_caret_line(line, findings) - assert "!" in caret - assert "^" not in caret - - def test_confusable_uses_question(self) -> None: - """Confusable findings marked with ? instead of ^.""" - line = "p\u0430ssword" - findings = [ - Finding( - file="t.txt", - line=1, - col=2, - char="\u0430", - codepoint=0x0430, - name="CYRILLIC SMALL LETTER A", - category="Ll", - dangerous=False, - confusable="a", + ( + "p\u0430ssword", + _make_finding( + col=2, + char="\u0430", + codepoint=0x0430, + name="CYRILLIC SMALL LETTER A", + category="Ll", + confusable="a", + ), + "?", + "^", ), - ] - caret = _build_caret_line(line, findings) - assert "?" in caret - assert "^" not in caret + ], + ids=["normal-caret", "dangerous-exclamation", "confusable-question"], + ) + def test_marker_type( + self, + line_text: str, + finding: Finding, + expected_marker: str, + absent_marker: str | None, + ) -> None: + """Correct marker character used for each finding severity.""" + caret = _build_caret_line(line_text, [finding]) + assert expected_marker in caret + if absent_marker is not None: + assert absent_marker not in caret def test_multiple_findings_on_line(self) -> None: """Multiple findings produce multiple carets.""" line = "\u201chello\u201d" findings = [ - Finding( - file="t.txt", - line=1, - col=1, - char="\u201c", - codepoint=0x201C, - name="LEFT DOUBLE QUOTATION MARK", - category="Ps", - dangerous=False, - ), - Finding( - file="t.txt", - line=1, + _make_finding(col=1), + _make_finding( col=7, char="\u201d", codepoint=0x201D, name="RIGHT DOUBLE QUOTATION MARK", category="Pe", - dangerous=False, ), ] caret = _build_caret_line(line, findings) @@ -156,9 +173,7 @@ def test_invisible_char_expansion(self) -> None: """Caret position accounts for expansion of invisible chars.""" line = "a\u200bb" # ZWS between a and b findings = [ - Finding( - file="t.txt", - line=1, + _make_finding( col=2, char="\u200b", codepoint=0x200B, @@ -171,23 +186,21 @@ def test_invisible_char_expansion(self) -> None: # 'a' is at position 0, ZWS renders as starting at position 1 assert caret == " !" + def test_finding_at_column_one(self) -> None: + """Finding at column 1 produces marker at start of caret line.""" + line = "\u201chello" + findings = [_make_finding(col=1)] + caret = _build_caret_line(line, findings) + assert caret.startswith("^") + assert caret == "^" + class TestFormatCodepointEntry: """Tests for codepoint listing entry formatting.""" def test_normal_no_color(self) -> None: """Normal finding formatted with codepoint, name, and category.""" - finding = Finding( - file="t.txt", - line=1, - col=1, - char="\u201c", - codepoint=0x201C, - name="LEFT DOUBLE QUOTATION MARK", - category="Ps", - dangerous=False, - ) - result = _format_codepoint_entry(finding, 1, color=False) + result = _format_codepoint_entry(_make_finding(), 1, color=False) assert "U+201C" in result assert "LEFT DOUBLE QUOTATION MARK" in result assert "[Ps]" in result @@ -195,25 +208,18 @@ def test_normal_no_color(self) -> None: def test_count_shown(self) -> None: """Count > 1 shows (xN) suffix.""" - finding = Finding( - file="t.txt", - line=1, - col=1, + finding = _make_finding( char="\u2500", codepoint=0x2500, name="BOX DRAWINGS LIGHT HORIZONTAL", category="So", - dangerous=False, ) result = _format_codepoint_entry(finding, 98, color=False) assert "(x98)" in result def test_dangerous_prefix(self) -> None: """Dangerous findings prefixed with ! [DANGEROUS].""" - finding = Finding( - file="t.txt", - line=1, - col=1, + finding = _make_finding( char="\u202e", codepoint=0x202E, name="RIGHT-TO-LEFT OVERRIDE", @@ -225,15 +231,11 @@ def test_dangerous_prefix(self) -> None: def test_confusable_prefix(self) -> None: """Confusable findings prefixed with ? [CONFUSABLE].""" - finding = Finding( - file="t.txt", - line=1, - col=1, + finding = _make_finding( char="\u0430", codepoint=0x0430, name="CYRILLIC SMALL LETTER A", category="Ll", - dangerous=False, confusable="a", ) result = _format_codepoint_entry(finding, 1, color=False) @@ -241,10 +243,7 @@ def test_confusable_prefix(self) -> None: def test_dangerous_with_color(self) -> None: """Dangerous findings use bold red ANSI codes.""" - finding = Finding( - file="t.txt", - line=1, - col=1, + finding = _make_finding( char="\u202e", codepoint=0x202E, name="RIGHT-TO-LEFT OVERRIDE", @@ -261,16 +260,7 @@ class TestPrintFindings: def test_context_file_read_failure(self) -> None: """Findings referencing nonexistent files don't crash.""" - finding = Finding( - file="/nonexistent/file.txt", - line=1, - col=1, - char="\u201c", - codepoint=0x201C, - name="LEFT DOUBLE QUOTATION MARK", - category="Ps", - dangerous=False, - ) + finding = _make_finding(file="/nonexistent/file.txt") # Should not raise print_findings([finding], no_color=True) @@ -294,7 +284,6 @@ def test_grouped_caret_line( findings = check_file(str(f)) print_findings(findings, no_color=True) err = capsys.readouterr().err - # Should have caret markers assert "^" in err def test_grouped_codepoint_listing( @@ -326,12 +315,10 @@ def test_deduplicates_identical_context( ) -> None: """Identical context lines are shown only once.""" f = tmp_path / "test.txt" - # Write 5 identical lines with same non-ASCII char f.write_text("\u2500\u2500\u2500\n" * 5, encoding="utf-8") findings = check_file(str(f)) print_findings(findings, no_color=True) err = capsys.readouterr().err - # The context line should appear only once despite 5 source lines rendered_line = "\u2500\u2500\u2500" assert err.count(f" {rendered_line}") == 1 @@ -347,6 +334,45 @@ def test_count_for_repeated_codepoints( assert "(x10)" in err +class TestPrintFindingsEdgeCases: + """Edge case tests for print_findings.""" + + def test_empty_findings_only_summary( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Empty findings list produces only a zero-count summary.""" + print_findings([], no_color=True) + err = capsys.readouterr().err + assert "Found 0 non-ASCII characters in 0 files" in err + # No file headers or codepoint listings + assert "U+" not in err + + def test_summary_line_counts( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Summary line shows correct character, file, and fixable counts.""" + f = tmp_path / "test.txt" + f.write_text("He said \u201chello\u201d\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True) + err = capsys.readouterr().err + assert "Found 2 non-ASCII characters" in err + assert "in 1 file" in err + assert "2 fixable" in err + + def test_summary_singular_forms( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Single finding uses singular 'character' and 'file'.""" + f = tmp_path / "test.txt" + f.write_text("He said \u201chello\n", encoding="utf-8") + findings = check_file(str(f)) + print_findings(findings, no_color=True) + err = capsys.readouterr().err + assert "Found 1 non-ASCII character " in err + assert "in 1 file " in err + + class TestPrintFileFindingsWithText: """Tests for _print_file_findings with pre-supplied text.""" @@ -354,9 +380,8 @@ def test_stdin_context_display(self, capsys: pytest.CaptureFixture[str]) -> None """Findings for show context when text is provided.""" text = "x\u202ey\n" findings = [ - Finding( + _make_finding( file="", - line=1, col=2, char="\u202e", codepoint=0x202E, @@ -373,9 +398,8 @@ def test_stdin_context_display(self, capsys: pytest.CaptureFixture[str]) -> None def test_stdin_no_text_no_context(self, capsys: pytest.CaptureFixture[str]) -> None: """Without text param, findings lack context.""" findings = [ - Finding( + _make_finding( file="", - line=1, col=2, char="\u202e", codepoint=0x202E, @@ -397,7 +421,7 @@ def test_single_line_output(self, capsys: pytest.CaptureFixture[str]) -> None: """print_line_findings emits context for one line.""" line = "x\u202ey" findings = [ - Finding( + _make_finding( file="", line=5, col=2, @@ -421,25 +445,17 @@ def test_multiple_findings_same_line( """Multiple findings on one line all appear.""" line = "\u201chello\u201d" findings = [ - Finding( + _make_finding( file="", - line=1, - col=1, - char="\u201c", - codepoint=0x201C, - name="LEFT DOUBLE QUOTATION MARK", category="Pi", - dangerous=False, ), - Finding( + _make_finding( file="", - line=1, col=8, char="\u201d", codepoint=0x201D, name="RIGHT DOUBLE QUOTATION MARK", category="Pf", - dangerous=False, ), ] print_line_findings("", 1, line, findings, no_color=True) From bd5a904ce66b2b8890f0246694fdcb9b5d1d25d3 Mon Sep 17 00:00:00 2001 From: mit-d Date: Sun, 29 Mar 2026 13:31:13 -0600 Subject: [PATCH 2/2] feat: expand replacement table and change dash mapping to single hyphen - Change en/em dash replacement from '--' to '-' - Add hyphen variants: U+2010 HYPHEN, U+2011 NON-BREAKING HYPHEN, U+2012 FIGURE DASH, U+2015 HORIZONTAL BAR, U+FE58 SMALL EM DASH - Add soft hyphen (U+00AD) removal (invisible layout hint) - Add bullets: U+2022 BULLET, U+2023 TRIANGULAR BULLET -> *, U+2043 HYPHEN BULLET -> - - Add dot leaders: U+2024 -> ., U+2025 -> .. - Add arrows: U+2190 -> <-, U+2192 -> ->, U+2191 -> ^, U+2193 -> v - Add math operators: U+00D7 -> x, U+00F7 -> /, U+2044 -> / --- CHANGELOG.md | 4 +++ docs/check-unicode.1 | 38 ++++++++++++++++++++++++--- src/check_unicode/categories.py | 30 ++++++++++++++++++--- tests/test_cli.py | 2 +- tests/test_fixer.py | 46 +++++++++++++++++++++++++++++---- 5 files changed, 106 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2600dd7..a08f759 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,10 @@ ### Changed +- En dash and em dash now replace to `-` instead of `--` +- Expanded `--fix` replacement table with: hyphen variants (U+2010-2012, U+2015, + U+FE58), soft hyphen (removed), bullets, dot leaders, arrows (`->`, `<-`, `^`, + `v`), and math operators (`x`, `/`) - Add `pytest-sugar` for improved test output - Replace mypy with [ty](https://github.com/astral-sh/ty) for type checking - Move dev dependencies from `optional-dependencies` to `dependency-groups` diff --git a/docs/check-unicode.1 b/docs/check-unicode.1 index 23aa6aa..ad483a6 100644 --- a/docs/check-unicode.1 +++ b/docs/check-unicode.1 @@ -317,12 +317,14 @@ Lines already flushed remain in the output. .B Smart quotes \(lq\(rq \(oq\(cq and variants \(-> replaced with ASCII quotes .TP -.B Dashes -Em dash (U+2014), en dash (U+2013), minus sign (U+2212) \(-> replaced with -.B \-\- -or +.B Dashes and hyphens +Em dash, en dash, figure dash, horizontal bar, minus sign, and other +dash\-like characters \(-> replaced with .BR \- . .TP +.B Soft hyphen +U+00AD \(-> removed (invisible layout hint, not content). +.TP .B Fancy spaces Non\-breaking space, em space, thin space, and 14 other Unicode space characters \(-> replaced with a regular space. @@ -330,6 +332,34 @@ Non\-breaking space, em space, thin space, and 14 other Unicode space characters .B Ellipsis Horizontal ellipsis (U+2026) \(-> replaced with .BR ... . +.TP +.B Bullets +Bullet (U+2022), triangular bullet, hyphen bullet \(-> replaced with +.B * +or +.BR \- . +.TP +.B Dot leaders +One dot leader, two dot leader \(-> replaced with +.B . +or +.BR .. . +.TP +.B Arrows +\(-> and \(<- \(-> replaced with +.B \-> +and +.BR <\- ; +\(ua and \(da \(-> replaced with +.B ^ +and +.BR v . +.TP +.B Math operators +Multiplication sign (\(mu) \(-> replaced with +.BR x ; +division sign (\(di) and fraction slash \(-> replaced with +.BR / . . .SS Dangerous invisible characters (never auto\-fixed) .TP diff --git a/src/check_unicode/categories.py b/src/check_unicode/categories.py index 8b7d257..24408d3 100644 --- a/src/check_unicode/categories.py +++ b/src/check_unicode/categories.py @@ -36,11 +36,17 @@ 0x201F: '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK 0x00AB: '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 0x00BB: '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - # Dashes - 0x2013: "--", # EN DASH - 0x2014: "--", # EM DASH - # Minus + # Dashes and hyphens + 0x2010: "-", # HYPHEN + 0x2011: "-", # NON-BREAKING HYPHEN + 0x2012: "-", # FIGURE DASH + 0x2013: "-", # EN DASH + 0x2014: "-", # EM DASH + 0x2015: "-", # HORIZONTAL BAR 0x2212: "-", # MINUS SIGN + 0xFE58: "-", # SMALL EM DASH + # Soft hyphen (invisible layout hint, not content) + 0x00AD: "", # SOFT HYPHEN # Fancy spaces -> regular space 0x00A0: " ", # NO-BREAK SPACE 0x2000: " ", # EN QUAD @@ -57,4 +63,20 @@ 0x3000: " ", # IDEOGRAPHIC SPACE # Ellipsis 0x2026: "...", # HORIZONTAL ELLIPSIS + # Bullets + 0x2022: "*", # BULLET + 0x2023: "*", # TRIANGULAR BULLET + 0x2043: "-", # HYPHEN BULLET + # Dot leaders + 0x2024: ".", # ONE DOT LEADER + 0x2025: "..", # TWO DOT LEADER + # Arrows + 0x2190: "<-", # LEFTWARDS ARROW + 0x2192: "->", # RIGHTWARDS ARROW + 0x2191: "^", # UPWARDS ARROW + 0x2193: "v", # DOWNWARDS ARROW + # Math operators + 0x00D7: "x", # MULTIPLICATION SIGN + 0x00F7: "/", # DIVISION SIGN + 0x2044: "/", # FRACTION SLASH } diff --git a/tests/test_cli.py b/tests/test_cli.py index ff48206..1bcf667 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -121,7 +121,7 @@ def test_fix_multiple_files_all_fixed(self, tmp_path: Path) -> None: f2.write_text("word\u2014word\n", encoding="utf-8") assert main(["--fix", str(f1), str(f2)]) == 1 assert f1.read_text(encoding="utf-8") == 'He said "hello"\n' - assert f2.read_text(encoding="utf-8") == "word--word\n" + assert f2.read_text(encoding="utf-8") == "word-word\n" def test_fix_dangerous_still_reported(self, tmp_path: Path) -> None: """Fix mode does not remove dangerous characters.""" diff --git a/tests/test_fixer.py b/tests/test_fixer.py index f03d9a8..0946f10 100644 --- a/tests/test_fixer.py +++ b/tests/test_fixer.py @@ -116,8 +116,8 @@ class TestFixText: ("\u201equote\u201f", '"quote"'), ("\u00abguillemet\u00bb", '"guillemet"'), ("\u2039angle\u203a", "'angle'"), - ("word\u2014word", "word--word"), - ("1\u20132", "1--2"), + ("word\u2014word", "word-word"), + ("1\u20132", "1-2"), ("x \u2212 y", "x - y"), ("hello\u00a0world", "hello world"), ("a\u2003b", "a b"), @@ -125,6 +125,24 @@ class TestFixText: ("a\u200ab", "a b"), ("a\u3000b", "a b"), ("wait\u2026", "wait..."), + ("a\u2010b", "a-b"), + ("a\u2011b", "a-b"), + ("a\u2012b", "a-b"), + ("a\u2015b", "a-b"), + ("a\ufe58b", "a-b"), + ("soft\u00adhyphen", "softhyphen"), + ("\u2022 item", "* item"), + ("\u2023 item", "* item"), + ("\u2043 item", "- item"), + ("ch\u20241", "ch.1"), + ("ch\u20251", "ch..1"), + ("a \u2192 b", "a -> b"), + ("b \u2190 a", "b <- a"), + ("\u2191up", "^up"), + ("\u2193down", "vdown"), + ("2 \u00d7 3", "2 x 3"), + ("6 \u00f7 2", "6 / 2"), + ("1\u20442", "1/2"), ], ids=[ "smart-double-quotes", @@ -143,6 +161,24 @@ class TestFixText: "hair-space", "ideographic-space", "ellipsis", + "hyphen", + "non-breaking-hyphen", + "figure-dash", + "horizontal-bar", + "small-em-dash", + "soft-hyphen", + "bullet", + "triangular-bullet", + "hyphen-bullet", + "one-dot-leader", + "two-dot-leader", + "right-arrow", + "left-arrow", + "up-arrow", + "down-arrow", + "multiplication-sign", + "division-sign", + "fraction-slash", ], ) def test_fix_replaces_character(self, input_text: str, expected: str) -> None: @@ -169,7 +205,7 @@ def test_mixed_fixable_nonfixable_dangerous(self) -> None: def test_multiline_text(self) -> None: """fix_text handles multi-line strings correctly.""" text = "line1 \u201chi\u201d\nline2 word\u2014word\nline3 wait\u2026\n" - expected = 'line1 "hi"\nline2 word--word\nline3 wait...\n' + expected = 'line1 "hi"\nline2 word-word\nline3 wait...\n' assert fix_text(text) == expected @@ -182,8 +218,8 @@ class TestFixFileReplacements: ("\u201chello\u201d", '"hello"'), ("It\u2019s", "It's"), ("\u2018word\u2019", "'word'"), - ("word\u2014word", "word--word"), - ("1\u20132", "1--2"), + ("word\u2014word", "word-word"), + ("1\u20132", "1-2"), ("x \u2212 y", "x - y"), ("hello\u00a0world", "hello world"), ("a\u2003b", "a b"),