-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvisualize.py
More file actions
101 lines (85 loc) · 3.67 KB
/
Copy pathvisualize.py
File metadata and controls
101 lines (85 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
ColorGPT — visualize text as a strip of colors.
Usage:
python visualize.py "the quick brown fox"
python visualize.py --file some.txt
python visualize.py # reads stdin
Writes out.html and tries to open it. Each token shows up as a colored cell;
hover to see the token text + id. Useful for sanity-checking the LUT before
wiring up LEDs or a generator loop.
"""
import argparse
import html
import json
import sys
import webbrowser
from pathlib import Path
import numpy as np
from transformers import AutoTokenizer
# All artifacts are resolved relative to the directory holding this script,
# so the tool behaves the same regardless of the current working directory.
ROOT = Path(__file__).parent
LUT_PATH = ROOT.joinpath("lut.bin")          # raw bytes, three per vocabulary entry
META_PATH = ROOT.joinpath("lut_meta.json")   # JSON sidecar describing the LUT
OUT_HTML = ROOT.joinpath("out.html")         # page written on every run
def load_lut():
    """Load the color lookup table and its metadata from disk.

    Returns:
        A ``(lut, meta)`` tuple. ``lut`` is the content of ``LUT_PATH`` read
        as ``uint8`` and reshaped to ``(vocab_size, 3)``; ``meta`` is the
        dict parsed from ``META_PATH``.

    Raises:
        SystemExit: if either file is missing, or if the LUT file does not
            contain exactly ``vocab_size * 3`` bytes.
    """
    if not LUT_PATH.exists() or not META_PATH.exists():
        raise SystemExit("LUT not built — run `python build_lut.py` first.")

    # Read the JSON sidecar as UTF-8 explicitly: without an encoding argument
    # the decode depends on the platform locale, unlike every other text
    # read/write in this script.
    meta = json.loads(META_PATH.read_text(encoding="utf-8"))
    vocab = meta["vocab_size"]

    raw = np.fromfile(LUT_PATH, dtype=np.uint8)
    # Guard the reshape below: a truncated or stale LUT file would otherwise
    # fail with a less helpful NumPy error.
    if raw.size != vocab * 3:
        raise SystemExit(f"LUT size mismatch: {raw.size} vs expected {vocab * 3}")
    return raw.reshape(vocab, 3), meta
def render(tokens, ids, lut, model_id):
    """Build a standalone HTML page with one colored cell per token.

    ``tokens`` and ``ids`` are walked in lockstep. For each pair, the row
    ``lut[id]`` is unpacked into three channel values used as the cell's
    ``rgb(...)`` background. ``model_id`` is HTML-escaped into the heading
    together with ``len(ids)``. Returns the whole document as one string.
    """
    # Qwen-style markers made visible for display: Ġ (space) -> ·, Ċ (newline) -> ⏎.
    visible = str.maketrans({"Ġ": "·", "Ċ": "⏎"})

    def cell(token, token_id):
        red, green, blue = lut[token_id]
        # Luminance decides the label color: black on bright, white on dark.
        luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
        ink = "#000" if luminance > 140 else "#fff"
        tooltip = html.escape(repr(token))
        label = html.escape(token.translate(visible))
        return (
            f'<span class="t" style="background:rgb({red},{green},{blue});color:{ink}" '
            f'title="id {token_id}: {tooltip}">{label}</span>'
        )

    strip = "".join(cell(token, token_id) for token, token_id in zip(tokens, ids))
    heading = f"colorgpt — {html.escape(model_id)} · {len(ids)} tokens"

    return f"""<!doctype html>
<html><head><meta charset="utf-8"><title>ColorGPT</title>
<style>
body {{ background:#0a0a0a; color:#ddd; font:14px/1.4 ui-sans-serif,system-ui,sans-serif;
margin:0; padding:32px; }}
h1 {{ font-weight:400; font-size:14px; color:#888; margin:0 0 16px; letter-spacing:0.05em; }}
.strip {{ display:flex; flex-wrap:wrap; gap:0; border-radius:4px; overflow:hidden;
box-shadow:0 0 0 1px #222; }}
.t {{ padding:14px 10px; font:13px ui-monospace,Menlo,Consolas,monospace; white-space:pre;
min-width:24px; text-align:center; cursor:default; }}
.meta {{ margin-top:18px; font:11px ui-monospace,monospace; color:#555; }}
</style></head><body>
<h1>{heading}</h1>
<div class="strip">{strip}</div>
<div class="meta">hover any cell for the underlying token. · = leading space, ⏎ = newline.</div>
</body></html>"""
def main():
    """Command-line entry point: choose the input text and write the HTML page.

    Input precedence: ``--file`` (read as UTF-8), then the positional words
    joined with single spaces, then stdin when it is not a terminal, and
    finally a built-in sample sentence. The text is tokenized with the
    tokenizer named by the LUT metadata, rendered, and written to
    ``OUT_HTML``; the page is opened in a browser unless ``--no-open`` is set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("text", nargs="*", help="text to colorize (or use --file / stdin)")
    parser.add_argument("--file", type=Path, help="read text from a file")
    parser.add_argument("--no-open", action="store_true", help="don't auto-open the browser")
    opts = parser.parse_args()

    def pick_text():
        # First matching source wins; the order is the documented precedence.
        if opts.file:
            return opts.file.read_text(encoding="utf-8")
        if opts.text:
            return " ".join(opts.text)
        if not sys.stdin.isatty():
            return sys.stdin.read()
        return "the quick brown fox jumps over the lazy dog"

    text = pick_text()

    lut, meta = load_lut()
    model_id = meta["model_id"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Special tokens are left out so only tokens from the text itself are shown.
    ids = tokenizer.encode(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(ids)

    page = render(tokens, ids, lut, model_id)
    OUT_HTML.write_text(page, encoding="utf-8")
    print(f"wrote {OUT_HTML} ({len(ids)} tokens)")

    if opts.no_open:
        return
    webbrowser.open(OUT_HTML.resolve().as_uri())
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()