From 944476b82c58679ece3a615deaf7ab396120d7ab Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 15 Apr 2026 11:25:41 +0000
Subject: [PATCH] unicode_show: escape all non-ASCII in describe_char output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

describe_char used repr(c) to render suspicious characters in the
description line. In Python 3, repr() only escapes characters that are
not printable, so printable non-ASCII characters — letters (including
Cyrillic/Greek/etc. homoglyphs), CJK, emoji, symbols, and combining
marks — are passed through literally. This lets a suspicious character
slip into unicode-show's own terminal output, defeating the tool's core
purpose: a combining acute accent merges with the adjacent quote, a
Cyrillic 'а' still reads as Latin 'a', etc.

Use ascii(), which always returns an ASCII-only escaped representation,
and add a regression test covering letters, homoglyphs, combining marks,
CJK, emoji, and currency symbols.
---
 .../unicode_show/tests/unicode_show.py        | 50 +++++++++++++++++++
 .../unicode_show/unicode_show.py              |  9 +++-
 2 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py b/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py
index 893858df..8237697d 100644
--- a/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py
+++ b/usr/lib/python3/dist-packages/unicode_show/tests/unicode_show.py
@@ -308,6 +308,56 @@ def test_unicode_format_controls(self) -> None:
                 file_contents=input_str,
             )
 
+    def test_printable_non_ascii_chars_are_escaped(self) -> None:
+        """
+        Tests that suspicious printable non-ASCII characters (letters,
+        homoglyphs, combining marks, CJK, emoji, symbols) are escaped in
+        the description line, since repr() would pass them through as-is.
+        Inputs are written as escape sequences so that each code point is
+        unambiguous (e.g. precomposed U+00E9, never 'e' + U+0301) and this
+        test's own source stays ASCII-only.
+        """
+
+        test_cases: list[tuple[str, str, str, str, str]] = [
+            ("\u00e9", "'\\xe9'", "U+00E9", "LATIN SMALL LETTER E WITH ACUTE", "Ll"),
+            ("\u0430", "'\\u0430'", "U+0430", "CYRILLIC SMALL LETTER A", "Ll"),
+            ("\u0301", "'\\u0301'", "U+0301", "COMBINING ACUTE ACCENT", "Mn"),
+            ("\u6f22", "'\\u6f22'", "U+6F22", "CJK UNIFIED IDEOGRAPH-6F22", "Lo"),
+            ("\U0001f600", "'\\U0001f600'", "U+1F600", "GRINNING FACE", "So"),
+            ("\u20ac", "'\\u20ac'", "U+20AC", "EURO SIGN", "Sc"),
+        ]
+        for test_case in test_cases:
+            input_str: str = f"pre{test_case[0]}post\n"
+            expect_str: str = f"""\
+FILENAME_PLACEHOLDER:1: pre[{test_case[2]}]post
+   -> {test_case[1]} ({test_case[2]}, {test_case[3]}, {test_case[4]})
+"""
+            ## Verify the escaped form contains only ASCII so suspicious
+            ## characters never make it into the terminal output.
+            self.assertTrue(
+                expect_str.isascii(),
+                f"expected output is not ASCII-only: {expect_str!r}",
+            )
+            self._test_stdin(
+                main_func=unicode_show_main,
+                argv0=self.argv0,
+                stdout_string=expect_str.replace(
+                    "FILENAME_PLACEHOLDER", "<stdin>"
+                ),
+                stderr_string="",
+                args=[],
+                exit_code=1,
+                stdin_string=input_str,
+            )
+            self._test_file(
+                main_func=unicode_show_main,
+                argv0=self.argv0,
+                stdout_string=expect_str,
+                stderr_string="",
+                exit_code=1,
+                file_contents=input_str,
+            )
+
     def test_trailing_whitespace(self) -> None:
         """
         Tests the detection of trailing whitespace characters (tabs, newlines,
diff --git a/usr/lib/python3/dist-packages/unicode_show/unicode_show.py b/usr/lib/python3/dist-packages/unicode_show/unicode_show.py
index 91f7e144..50e323a9 100644
--- a/usr/lib/python3/dist-packages/unicode_show/unicode_show.py
+++ b/usr/lib/python3/dist-packages/unicode_show/unicode_show.py
@@ -75,11 +75,18 @@ def describe_char(c: str) -> str:
     semantically_allowed: bool = c in SAFE_ASCII_SEMANTIC
 
     ## Purposeful redundancy for extra safety in character display.
+    ## Use ascii() rather than repr() because repr() in Python 3 only
+    ## escapes non-printable characters: printable non-ASCII characters
+    ## (letters, homoglyphs, combining marks, CJK, emoji, symbols, ...)
+    ## are passed through literally by repr(), which would let a
+    ## suspicious character slip into the terminal output of a tool whose
+    ## whole purpose is to safely display such characters. ascii() always
+    ## returns an ASCII-only escaped representation.
     display: str
     if codepoint_allowed and semantically_allowed and not c.isspace():
         display = c
     else:
-        display = repr(c)
+        display = ascii(c)
 
     desc: str = f"{display} (U+{code:04X}, {name}, {cat})"
     return colorize(desc, CYAN)