mit-d · mit-d · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,8 +2,23 @@
 
 ## Unreleased
 
+### Added
+
+- Pipe mode: `check-unicode -` reads stdin line-by-line and writes to stdout,
+  enabling use as a streaming Unix filter for log monitoring, CI pipelines, and
+  editor buffer filtering
+- `--strip [dangerous|all]` flag to remove non-ASCII characters; `dangerous`
+  strips only invisible/bidi chars, `all` (default) strips any remaining
+  non-ASCII after allow-list processing
+- `--halt [dangerous|all]` flag to stop immediately on first matching character;
+  `dangerous` (default) halts on invisible/bidi chars, `all` halts on any
+  non-ASCII
+- `--fix`, `--strip`, and `--halt` are fully composable and work identically
+  across file and pipe modes
+
 ### Changed
 
+- Add `pytest-sugar` for improved test output
 - Replace mypy with [ty](https://github.com/astral-sh/ty) for type checking
 - Move dev dependencies from `optional-dependencies` to `dependency-groups`
 - Switch CI from pip to uv for faster, reproducible installs; check in `uv.lock`

diff --git a/docs/check-unicode.1 b/docs/check-unicode.1
@@ -33,8 +33,12 @@ hook but also works as a standalone CLI tool.
 .TP
 .I FILE ...
 One or more files to check.
-At least one file is required; the program exits with code\ 2 if none are
-provided.
+Use
+.B \-
+to read from stdin and write to stdout (pipe mode).
+At least one file or
+.B \-
+is required; the program exits with code\ 2 if none are provided.
 .
 .SH OPTIONS
 .SS Mode
@@ -45,6 +49,32 @@ with their ASCII equivalents using an atomic write (temp file + rename).
 Exits\ 1 if any file was modified.
 Dangerous invisible characters are never auto\-fixed.
 .TP
+.BI \-\-strip " [LEVEL]"
+Remove non\-ASCII characters from output.
+.I LEVEL
+is
+.B dangerous
+(only invisible/bidi characters) or
+.B all
+(any remaining non\-ASCII after allow\-list processing).
+Default:
+.BR all .
+Respects allow\-lists.
+In file mode, modifies files in\-place; in pipe mode, writes stripped
+output to stdout.
+.TP
+.BI \-\-halt " [LEVEL]"
+Stop immediately on the first character matching the level.
+.I LEVEL
+is
+.B dangerous
+(invisible/bidi characters) or
+.BR all " (any non\-ASCII)."
+Default:
+.BR dangerous .
+Reports the triggering finding on stderr and exits\ 1.
+The triggering file is never modified.
+.TP
 .BR \-V ", " \-\-version
 Print the program version and exit.
 .
@@ -243,13 +273,44 @@ severity is
 or per\-file via overrides).
 .TP
 .B 1
-Non\-ASCII findings were detected, or files were modified in
-.B \-\-fix
-mode.
+Non\-ASCII findings were detected, files were modified by
+.BR \-\-fix / \-\-strip ,
+or
+.B \-\-halt
+was triggered.
 .TP
 .B 2
 Usage error (invalid arguments, no files specified, etc.).
 .
+.SH PIPE MODE
+When
+.B \-
+is given as the sole file argument,
+.B check\-unicode
+reads from stdin and writes to stdout, acting as a streaming Unix filter.
+.PP
+Input is processed line\-by\-line.
+For each line, findings are emitted to stderr immediately with full
+context display (rendered source line, caret markers, codepoint details).
+The processed line is written to stdout and flushed so downstream
+consumers see output in real time.
+.PP
+.BR \-\-fix ", " \-\-strip ", and " \-\-halt
+all work in pipe mode.
+The processing order per character is:
+allow\-list check, then
+.BR \-\-fix ,
+then
+.BR \-\-halt ,
+then
+.BR \-\-strip .
+.PP
+When
+.B \-\-halt
+triggers, the current line is not written to stdout and the program
+exits immediately.
+Lines already flushed remain in the output.
+.
 .SH WHAT IT CATCHES
 .SS Copy\-paste artifacts (fixable with \-\-fix)
 .TP
@@ -360,6 +421,36 @@ List all valid Unicode general categories:
 .B check\-unicode \-\-list\-categories
 .RE
 .PP
+Read from stdin, write to stdout (pipe mode):
+.PP
+.RS
+.B check\-unicode \- < file.txt
+.RE
+.PP
+Fix smart quotes and strip dangerous chars from a pipe:
+.PP
+.RS
+.B check\-unicode \-\-fix \-\-strip dangerous \- < file.txt
+.RE
+.PP
+Halt on first dangerous character in stdin:
+.PP
+.RS
+.B check\-unicode \-\-halt \- < input.txt
+.RE
+.PP
+Stream\-filter logs, fixing and stripping bidi attacks:
+.PP
+.RS
+.B tail \-f app.log | check\-unicode \-\-fix \-\-strip dangerous \-
+.RE
+.PP
+Strip all non\-ASCII from files in\-place:
+.PP
+.RS
+.B check\-unicode \-\-strip all src/
+.RE
+.PP
 Use with pre\-commit:
 .PP
 .nf

diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ dev = [
   "bump-my-version",
   "pytest",
   "pytest-cov",
+  "pytest-sugar>=1.1.1",
   "ruff",
   "ty",
 ]

diff --git a/src/check_unicode/checker.py b/src/check_unicode/checker.py
@@ -163,8 +163,8 @@ def _check_line_confusables(
     # Count scripts to find dominant.
     script_counts = Counter(script for _, _, script in letters)
 
-    if len(script_counts) < 2:  # noqa: PLR2004
-        return []  # single script, no confusable risk
+    if len(script_counts) <= 1:
+        return []
 
     # Dominant script: highest count, tie-break to Latin.
     max_count = max(script_counts.values())

diff --git a/src/check_unicode/fixer.py b/src/check_unicode/fixer.py
@@ -4,37 +4,49 @@
 
 import contextlib
 import os
+import re
 import tempfile
+import unicodedata
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 from check_unicode.categories import DANGEROUS_INVISIBLE, REPLACEMENT_TABLE
+from check_unicode.scripts import script_of
+
+if TYPE_CHECKING:
+    from check_unicode.checker import AllowConfig
+
+_NON_ASCII = re.compile(r"[^\t\r\n\x20-\x7E]")  # also hits ASCII controls but tab/CR/LF
 
 # Pre-built translation table: all REPLACEMENT_TABLE entries that are NOT dangerous.
 _TRANSLATE_TABLE: dict[int, str] = {
     cp: repl for cp, repl in REPLACEMENT_TABLE.items() if cp not in DANGEROUS_INVISIBLE
 }
 
 
-def fix_file(path: str | Path) -> bool:
-    """Replace fixable Unicode characters in a file with ASCII equivalents.
-
-    Dangerous invisible characters are never auto-fixed.
-    Uses atomic write (temp file + rename) to avoid data loss.
+def _is_strip_allowed(cp: int, allow: AllowConfig) -> bool:
+    """Return True if codepoint is exempted from stripping by the allow-list.
 
-    Returns True if the file was modified.
+    Evaluation order matches checker._is_allowed: explicit codepoints are
+    checked first (can exempt even dangerous chars), then dangerous chars
+    are blocked, then printable/script/range/category checks.
     """
-    filepath = Path(path)
-    try:
-        original = filepath.read_text(encoding="utf-8")
-        orig_mode = filepath.stat().st_mode
-    except (UnicodeDecodeError, OSError):
+    if cp in allow.codepoints:
+        return True
+    if cp in DANGEROUS_INVISIBLE:
         return False
+    ch = chr(cp)
+    cat = unicodedata.category(ch)
+    return (
+        (allow.printable and ch.isprintable())
+        or (bool(allow.scripts) and script_of(cp) in allow.scripts)
+        or (bool(allow.ranges) and any(lo <= cp <= hi for lo, hi in allow.ranges))
+        or (bool(allow.categories) and any(cat.startswith(p) for p in allow.categories))
+    )
 
-    fixed = _apply_replacements(original)
-    if fixed == original:
-        return False
 
-    # Atomic write: write to temp file in same directory, then rename
+def _atomic_write(filepath: Path, content: str, orig_mode: int) -> None:
+    """Write *content* to *filepath* atomically, preserving *orig_mode*."""
     fd, tmp_path_str = tempfile.mkstemp(
         dir=filepath.parent,
         prefix=f".{filepath.name}.",
@@ -43,21 +55,90 @@ def fix_file(path: str | Path) -> bool:
     tmp_path = Path(tmp_path_str)
     try:
         with os.fdopen(fd, "w", encoding="utf-8") as f:
-            f.write(fixed)
-        # Preserve original file permissions
+            f.write(content)
         tmp_path.chmod(orig_mode)
         tmp_path.replace(filepath)
     except BaseException:
-        # Clean up temp file on any failure
         with contextlib.suppress(OSError):
             tmp_path.unlink()
         raise
+
+
+def strip_text(
+    text: str,
+    *,
+    level: str = "all",
+    allow: AllowConfig | None = None,
+) -> str:
+    """Remove non-ASCII characters from text based on strip level.
+
+    level="all": remove every char outside tab/CR/LF/printable ASCII (except allowed).
+    level="dangerous": remove only DANGEROUS_INVISIBLE characters (except allowed).
+    """
+
+    def _should_strip(ch: str) -> bool:
+        cp = ord(ch)
+        if allow is not None and _is_strip_allowed(cp, allow):
+            return False  # allow-list exemption is checked before the level
+        if level == "dangerous":
+            return cp in DANGEROUS_INVISIBLE
+        return True  # "all", and any other level value, strips every match
+
+    return _NON_ASCII.sub(lambda m: "" if _should_strip(m.group()) else m.group(), text)
+
+
+def fix_file(path: str | Path) -> bool:
+    """Replace fixable Unicode characters in a file with ASCII equivalents.
+
+    Dangerous invisible characters are never auto-fixed.
+    Uses atomic write (temp file + rename) to avoid data loss.
+
+    Returns True if the file was modified; False if unreadable as UTF-8 or unchanged.
+    """
+    filepath = Path(path)
+    try:
+        original = filepath.read_text(encoding="utf-8")
+        orig_mode = filepath.stat().st_mode
+    except (UnicodeDecodeError, OSError):
+        return False  # not UTF-8 or not accessible: leave the file untouched
+
+    fixed = fix_text(original)
+    if fixed == original:
+        return False  # nothing to replace, so skip the rewrite
+
+    _atomic_write(filepath, fixed, orig_mode)
     return True
 
 
-def _apply_replacements(text: str) -> str:
-    """Replace characters that have entries in REPLACEMENT_TABLE.
+def strip_file(
+    path: str | Path,
+    *,
+    level: str = "all",
+    allow: AllowConfig | None = None,
+) -> bool:
+    """Remove non-ASCII characters from a file based on strip level.
 
-    Skips dangerous invisible characters -- those are never auto-fixed.
+    Uses atomic write (temp file + rename) to avoid data loss.
+    Returns True if the file was modified; False if unreadable as UTF-8 or unchanged.
+    """
+    filepath = Path(path)
+    try:
+        original = filepath.read_text(encoding="utf-8")
+        orig_mode = filepath.stat().st_mode
+    except (UnicodeDecodeError, OSError):
+        return False  # not UTF-8 or not accessible: leave the file untouched
+
+    stripped = strip_text(original, level=level, allow=allow)
+    if stripped == original:
+        return False  # nothing was stripped, so skip the rewrite
+
+    _atomic_write(filepath, stripped, orig_mode)
+    return True
+
+
+def fix_text(text: str) -> str:
+    """Replace fixable Unicode characters with ASCII equivalents.
+
+    Dangerous invisible characters are excluded from _TRANSLATE_TABLE, so never fixed.
     """
     return text.translate(_TRANSLATE_TABLE)