-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathengine.py
More file actions
104 lines (85 loc) · 3.58 KB
/
Copy pathengine.py
File metadata and controls
104 lines (85 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""Shared engine: lazy-loaded model, tokenizer, and LUT."""
import json
import threading
from pathlib import Path
from typing import Tuple
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Directory containing this module; all data files below are resolved against it.
ROOT = Path(__file__).parent
# Raw uint8 bytes; load_lut() reshapes them to (meta["vocab_size"], 3).
LUT_PATH = ROOT / "lut.bin"
# JSON metadata; this module reads its "vocab_size" and "model_id" keys.
META_PATH = ROOT / "lut_meta.json"
# NumPy array read with np.load() by load_audio_lut().
UMAP_CACHE = ROOT / "umap_3d.npy"
# Process-wide cache of lazily loaded objects; None means "not loaded yet".
# get_non_latin_token_ids() additionally stores its result under "bad_words".
_state = {"model": None, "tokenizer": None, "lut": None, "meta": None, "audio_lut": None}
# Serializes the model load in load_model(); the other loaders do not take it.
_lock = threading.Lock()
def load_lut() -> Tuple[np.ndarray, dict]:
    """Return the cached color LUT and its metadata, loading both on first use.

    Returns:
        ``(lut, meta)``: ``lut`` is the content of ``LUT_PATH`` read as uint8
        and reshaped to ``(meta["vocab_size"], 3)``; ``meta`` is the dict
        parsed from ``META_PATH``.

    Raises:
        RuntimeError: if ``LUT_PATH`` or ``META_PATH`` does not exist.
    """
    if _state["lut"] is None:
        if not LUT_PATH.exists() or not META_PATH.exists():
            raise RuntimeError("LUT not built — run `python build_lut.py` first.")
        meta = json.loads(META_PATH.read_text())
        raw = np.fromfile(LUT_PATH, dtype=np.uint8)
        lut = raw.reshape(meta["vocab_size"], 3)
        # "lut" is the sentinel checked above, so it must be published last:
        # a concurrent caller that sees a non-None lut is then guaranteed to
        # also see the matching meta instead of None.
        # _lock is deliberately not taken here: load_model() calls this
        # function while already holding it, and threading.Lock is not
        # reentrant.
        _state["meta"] = meta
        _state["lut"] = lut
    return _state["lut"], _state["meta"]
def load_tokenizer():
    """Return the shared tokenizer, creating it on the first call.

    The tokenizer is built with ``AutoTokenizer.from_pretrained`` from the
    ``"model_id"`` entry of the LUT metadata and cached in
    ``_state["tokenizer"]``.
    """
    cached = _state["tokenizer"]
    if cached is not None:
        return cached

    meta = load_lut()[1]
    _state["tokenizer"] = AutoTokenizer.from_pretrained(meta["model_id"])
    return _state["tokenizer"]
def load_model():
    """Return the shared ``(model, tokenizer)`` pair, loading the model lazily.

    The model named by the ``"model_id"`` entry of the LUT metadata is loaded
    once, in float32, and switched to eval mode. Later calls return the
    cached objects without taking the lock.

    Returns:
        Tuple ``(model, tokenizer)`` taken from the module cache.
    """
    if _state["model"] is None:
        with _lock:
            # Re-check under the lock: another thread may have completed the
            # load while this one was waiting.
            if _state["model"] is None:
                _, meta = load_lut()
                model_id = meta["model_id"]
                print(f"[engine] loading {model_id} for generation…")
                # Reuse the tokenizer if load_tokenizer() already cached one,
                # so it is not loaded a second time and every caller keeps
                # working with the same instance.
                tok = _state["tokenizer"]
                if tok is None:
                    tok = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float32)
                model.eval()
                # Store the tokenizer before the model: "model" is the
                # sentinel tested by the lock-free fast path above.
                _state["tokenizer"] = tok
                _state["model"] = model
                print("[engine] model ready.")
    return _state["model"], _state["tokenizer"]
def color_for(token_id: int) -> Tuple[int, int, int]:
    """Return the color LUT row for ``token_id`` as three plain Python ints."""
    table = load_lut()[0]
    red, green, blue = (int(channel) for channel in table[token_id])
    return red, green, blue
def load_audio_lut() -> np.ndarray:
    """Return the cached audio LUT: UMAP coordinates rescaled to uint8.

    The array stored at ``UMAP_CACHE`` is min-max scaled per column (along
    axis 0), mapped onto 0..255 with round-half-up, and cast to ``uint8``.
    Per the original author's note, this is meant for client-side audio
    synthesis and shares its axes with the color LUT.

    Raises:
        RuntimeError: if ``UMAP_CACHE`` does not exist.
    """
    cached = _state["audio_lut"]
    if cached is not None:
        return cached

    if not UMAP_CACHE.exists():
        raise RuntimeError("umap_3d.npy missing — rerun build_lut.py")

    coords = np.load(UMAP_CACHE)
    lowest = coords.min(axis=0)
    highest = coords.max(axis=0)
    # The small epsilon keeps the division defined when a column is constant.
    span = highest - lowest + 1e-9
    unit = (coords - lowest) / span
    _state["audio_lut"] = (unit * 255 + 0.5).astype(np.uint8)
    return _state["audio_lut"]
def audio_for(token_id: int) -> Tuple[int, int, int]:
    """Return the audio LUT row for ``token_id`` as three plain Python ints."""
    table = load_audio_lut()
    first, second, third = (int(component) for component in table[token_id])
    return first, second, third
def get_non_latin_token_ids() -> list:
    """Return the token ids whose decoded text contains a non-ASCII character.

    Each id is wrapped in its own single-element list (``[[id], [id], ...]``).
    The result is computed once and cached under ``_state["bad_words"]``.

    The cut-off is "any code point above 0x7F". According to the original
    author this is intentional so that it catches:
      - byte-fallback tokens (Qwen encodes CJK as 3-byte sequences; each byte
        decodes to U+0080-U+00FF, which would slip past a wider Latin filter)
      - all non-Latin scripts
    The cost is that accented Latin (é, ü, ñ) is blocked too, which is
    acceptable for English-only output.

    Ids for which ``tokenizer.decode`` raises are skipped, not reported.
    """
    cached = _state.get("bad_words")
    if cached is not None:
        return cached

    tok = load_tokenizer()
    limit = getattr(tok, "vocab_size", None) or len(tok)
    blocked = []
    for token_id in range(limit):
        try:
            decoded = tok.decode([token_id])
        except Exception:
            # Best-effort scan: an undecodable id is simply left out.
            continue
        if any(ord(char) > 0x7F for char in decoded):
            blocked.append([token_id])

    _state["bad_words"] = blocked
    return blocked