From 944476b82c58679ece3a615deaf7ab396120d7ab Mon Sep 17 00:00:00 2001
From: Claude
Date: Wed, 15 Apr 2026 11:25:41 +0000
Subject: [PATCH] unicode_show: escape all non-ASCII in describe_char output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

describe_char used repr(c) to render suspicious characters in the
description line. In Python 3, repr() only escapes characters that are
not printable, so printable non-ASCII characters — letters (including
Cyrillic/Greek/etc. homoglyphs), CJK, emoji, symbols, and combining
marks — are passed through literally. This lets a suspicious character
slip into unicode-show's own terminal output, defeating the tool's core
purpose: a combining acute accent merges with the adjacent quote, a
Cyrillic 'а' still reads as Latin 'a', etc.

Use ascii(), which always returns an ASCII-only escaped representation,
and add a regression test covering letters, homoglyphs, combining marks,
CJK, emoji, and currency symbols.
---
 .../unicode_show/tests/unicode_show.py        | 50 +++++++++++++++++++
 .../unicode_show/unicode_show.py              |  9 +++-
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py b/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py
index 893858df..8237697d 100644
--- a/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py
+++ b/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py
@@ -308,6 +308,56 @@ def test_unicode_format_controls(self) -> None:
                 file_contents=input_str,
             )
 
+    def test_printable_non_ascii_chars_are_escaped(self) -> None:
+        """
+        Tests that suspicious printable non-ASCII characters (letters,
+        homoglyphs, combining marks, CJK, emoji, symbols) are escaped in
+        the description line rather than passed through literally. Python's
+        repr() does NOT escape printable non-ASCII characters, so using it
+        would allow such characters to slip into the terminal output of a
+        tool whose whole purpose is to safely show them.
+        """
+
+        test_cases: list[tuple[str, str, str, str, str]] = [
+            ("é", "'\\xe9'", "U+00E9", "LATIN SMALL LETTER E WITH ACUTE", "Ll"),
+            ("\u0430", "'\\u0430'", "U+0430", "CYRILLIC SMALL LETTER A", "Ll"),
+            ("\u0301", "'\\u0301'", "U+0301", "COMBINING ACUTE ACCENT", "Mn"),
+            ("漢", "'\\u6f22'", "U+6F22", "CJK UNIFIED IDEOGRAPH-6F22", "Lo"),
+            ("\U0001f600", "'\\U0001f600'", "U+1F600", "GRINNING FACE", "So"),
+            ("\u20ac", "'\\u20ac'", "U+20AC", "EURO SIGN", "Sc"),
+        ]
+        for test_case in test_cases:
+            input_str: str = f"pre{test_case[0]}post\n"
+            expect_str: str = f"""\
+FILENAME_PLACEHOLDER:1: pre[{test_case[2]}]post
+ -> {test_case[1]} ({test_case[2]}, {test_case[3]}, {test_case[4]})
+"""
+            ## Verify the escaped form contains only ASCII so suspicious
+            ## characters never make it into the terminal output.
+            self.assertTrue(
+                expect_str.isascii(),
+                f"expected output is not ASCII-only: {expect_str!r}",
+            )
+            self._test_stdin(
+                main_func=unicode_show_main,
+                argv0=self.argv0,
+                stdout_string=expect_str.replace(
+                    "FILENAME_PLACEHOLDER", ""
+                ),
+                stderr_string="",
+                args=[],
+                exit_code=1,
+                stdin_string=input_str,
+            )
+            self._test_file(
+                main_func=unicode_show_main,
+                argv0=self.argv0,
+                stdout_string=expect_str,
+                stderr_string="",
+                exit_code=1,
+                file_contents=input_str,
+            )
+
     def test_trailing_whitespace(self) -> None:
         """
         Tests the detection of trailing whitespace characters (tabs, newlines,
diff --git a/usr/lib/python3/dist-packages/unicode_show/unicode_show.py b/usr/lib/python3/dist-packages/unicode_show/unicode_show.py
index 91f7e144..50e323a9 100644
--- a/usr/lib/python3/dist-packages/unicode_show/unicode_show.py
+++ b/usr/lib/python3/dist-packages/unicode_show/unicode_show.py
@@ -75,11 +75,18 @@ def describe_char(c: str) -> str:
     semantically_allowed: bool = c in SAFE_ASCII_SEMANTIC
 
     ## Purposeful redundancy for extra safety in character display.
+    ## Use ascii() rather than repr() because repr() in Python 3 only
+    ## escapes non-printable characters: printable non-ASCII characters
+    ## (letters, homoglyphs, combining marks, CJK, emoji, symbols, ...)
+    ## are passed through literally by repr(), which would let a
+    ## suspicious character slip into the terminal output of a tool whose
+    ## whole purpose is to safely display such characters. ascii() always
+    ## returns an ASCII-only escaped representation.
     display: str
     if codepoint_allowed and semantically_allowed and not c.isspace():
         display = c
     else:
-        display = repr(c)
+        display = ascii(c)
 
     desc: str = f"{display} (U+{code:04X}, {name}, {cat})"
     return colorize(desc, CYAN)