# colorgpt/visualize.py — from the jschoemaker/colorgpt repository on GitHub (main branch)

"""
ColorGPT — visualize text as a strip of colors.

Usage:
    python visualize.py "the quick brown fox"
    python visualize.py --file some.txt
    python visualize.py            # reads stdin

Writes out.html and tries to open it. Each token shows up as a colored cell;
hover to see the token text + id. Useful for sanity-checking the LUT before
wiring up LEDs or a generator loop.
"""

import argparse
import html
import json
import sys
import webbrowser
from pathlib import Path

import numpy as np
from transformers import AutoTokenizer

# All paths are anchored to this script's directory rather than the current
# working directory.
ROOT = Path(__file__).parent
# Binary lookup table; load_lut() reads it as uint8 and reshapes it to (vocab_size, 3).
LUT_PATH = ROOT / "lut.bin"
# JSON metadata; this script reads its "vocab_size" and "model_id" keys.
META_PATH = ROOT / "lut_meta.json"
# HTML page that main() writes and, unless --no-open is given, opens in a browser.
OUT_HTML = ROOT / "out.html"


def load_lut():
    """Load the token-id → RGB lookup table and its metadata from disk.

    Reads ``META_PATH`` (JSON) for ``vocab_size``, reads ``LUT_PATH`` as a flat
    buffer of ``uint8`` values, and views that buffer as one ``(r, g, b)`` row
    per token id.

    Returns:
        A ``(lut, meta)`` tuple: ``lut`` is a ``uint8`` array of shape
        ``(vocab_size, 3)`` and ``meta`` is the parsed metadata dict.

    Raises:
        SystemExit: if either file is missing, or if the binary file does not
            contain exactly ``vocab_size * 3`` bytes.
    """
    if not LUT_PATH.exists() or not META_PATH.exists():
        raise SystemExit("LUT not built — run `python build_lut.py` first.")
    # Decode explicitly as UTF-8 (as the rest of this script does for text I/O)
    # instead of relying on the platform's locale encoding.
    meta = json.loads(META_PATH.read_text(encoding="utf-8"))
    vocab = meta["vocab_size"]
    raw = np.fromfile(LUT_PATH, dtype=np.uint8)
    # Three bytes per token; anything else means lut.bin and the metadata disagree.
    if raw.size != vocab * 3:
        raise SystemExit(f"LUT size mismatch: {raw.size} vs expected {vocab * 3}")
    return raw.reshape(vocab, 3), meta


# Byte-level BPE markers swapped for visible glyphs: Ġ (space) → ·, Ċ (newline) → ⏎.
_VISIBLE_WHITESPACE = str.maketrans({"Ġ": "·", "Ċ": "⏎"})


def _token_cell(tok, tid, rgb):
    """Return the HTML ``<span>`` for one token, painted with its LUT color."""
    r, g, b = rgb
    # Weighted luminance of the background decides the label color:
    # black text on bright cells, white text on dark ones.
    lum = 0.2126 * r + 0.7152 * g + 0.0722 * b
    if lum > 140:
        fg = "#000"
    else:
        fg = "#fff"
    label = html.escape(tok.translate(_VISIBLE_WHITESPACE))
    tooltip = html.escape(repr(tok))
    return (
        f'<span class="t" style="background:rgb({r},{g},{b});color:{fg}" '
        f'title="id {tid}: {tooltip}">{label}</span>'
    )


def render(tokens, ids, lut, model_id):
    """Render tokens as a standalone HTML page of colored cells.

    Args:
        tokens: token strings, parallel to ``ids``.
        ids: token ids; each one indexes a row of ``lut``.
        lut: indexable table whose row ``lut[tid]`` unpacks to ``(r, g, b)``.
        model_id: model name shown (HTML-escaped) in the page heading.

    Returns:
        The complete HTML document as a string. Each cell shows the token text
        and carries a tooltip with the token id and the ``repr`` of the raw token.
    """
    strip = "".join(_token_cell(tok, tid, lut[tid]) for tok, tid in zip(tokens, ids))
    heading = html.escape(model_id)
    count = len(ids)
    return f"""<!doctype html>
<html><head><meta charset="utf-8"><title>ColorGPT</title>
<style>
  body {{ background:#0a0a0a; color:#ddd; font:14px/1.4 ui-sans-serif,system-ui,sans-serif;
         margin:0; padding:32px; }}
  h1 {{ font-weight:400; font-size:14px; color:#888; margin:0 0 16px; letter-spacing:0.05em; }}
  .strip {{ display:flex; flex-wrap:wrap; gap:0; border-radius:4px; overflow:hidden;
            box-shadow:0 0 0 1px #222; }}
  .t {{ padding:14px 10px; font:13px ui-monospace,Menlo,Consolas,monospace; white-space:pre;
        min-width:24px; text-align:center; cursor:default; }}
  .meta {{ margin-top:18px; font:11px ui-monospace,monospace; color:#555; }}
</style></head><body>
<h1>colorgpt — {heading} · {count} tokens</h1>
<div class="strip">{strip}</div>
<div class="meta">hover any cell for the underlying token. · = leading space, ⏎ = newline.</div>
</body></html>"""


def _resolve_text(opts):
    """Choose the input text: --file, then positional words, then piped stdin, else a demo sentence."""
    if opts.file:
        return opts.file.read_text(encoding="utf-8")
    if opts.text:
        return " ".join(opts.text)
    if sys.stdin.isatty():
        # Nothing was piped in and no arguments were given: fall back to a sample.
        return "the quick brown fox jumps over the lazy dog"
    return sys.stdin.read()


def main():
    """CLI entry point: tokenize the input text, render it, and write the HTML page.

    Parses the command line, loads the LUT and the tokenizer named by the LUT
    metadata's ``model_id``, writes the rendered page to ``OUT_HTML``, prints a
    one-line summary, and opens the page in a browser unless ``--no-open`` is set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("text", nargs="*", help="text to colorize (or use --file / stdin)")
    parser.add_argument("--file", type=Path, help="read text from a file")
    parser.add_argument("--no-open", action="store_true", help="don't auto-open the browser")
    opts = parser.parse_args()

    text = _resolve_text(opts)

    lut, meta = load_lut()
    model_id = meta["model_id"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Special tokens are left out so only the text's own tokens are colored.
    ids = tokenizer.encode(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(ids)

    page = render(tokens, ids, lut, model_id)
    OUT_HTML.write_text(page, encoding="utf-8")
    print(f"wrote {OUT_HTML}  ({len(ids)} tokens)")
    if opts.no_open:
        return
    webbrowser.open(OUT_HTML.resolve().as_uri())


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()