# colorgpt/engine.py (jschoemaker/colorgpt, main branch)
"""Shared engine: lazy-loaded model, tokenizer, and LUT."""

import json
import threading
from pathlib import Path
from typing import Tuple

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Every on-disk artifact is resolved relative to the directory of this module.
ROOT = Path(__file__).parent
# Raw uint8 bytes; load_lut() reshapes them to (vocab_size, 3).
LUT_PATH = ROOT / "lut.bin"
# JSON metadata; this module reads its "vocab_size" and "model_id" keys.
META_PATH = ROOT / "lut_meta.json"
# NumPy array that load_audio_lut() min-max scales per column.
UMAP_CACHE = ROOT / "umap_3d.npy"

# Process-wide cache of lazily loaded resources; None means "not loaded yet".
# Every slot used by the loaders in this module is declared here, including
# "bad_words", which get_non_latin_token_ids() fills on its first call.
_state = {
    "model": None,
    "tokenizer": None,
    "lut": None,
    "meta": None,
    "audio_lut": None,
    "bad_words": None,
}
# Held by load_model() while the model is being loaded for the first time.
_lock = threading.Lock()


def load_lut() -> Tuple[np.ndarray, dict]:
    """Return the cached ``(lut, meta)`` pair, reading both files on first use.

    ``meta`` is the parsed JSON from ``META_PATH``. ``lut`` is the content of
    ``LUT_PATH`` read as ``uint8`` and reshaped to ``(meta["vocab_size"], 3)``.
    Both are stored in ``_state`` so later calls do no file I/O.

    Raises:
        RuntimeError: if ``LUT_PATH`` or ``META_PATH`` does not exist.
    """
    # Fast path: the table was already loaded by an earlier call.
    if _state["lut"] is not None:
        return _state["lut"], _state["meta"]

    artifacts_present = LUT_PATH.exists() and META_PATH.exists()
    if not artifacts_present:
        raise RuntimeError("LUT not built — run `python build_lut.py` first.")

    info = json.loads(META_PATH.read_text())
    # reshape raises ValueError if the byte count is not vocab_size * 3;
    # nothing is cached in that case.
    table = np.fromfile(LUT_PATH, dtype=np.uint8).reshape(info["vocab_size"], 3)
    _state["lut"] = table
    _state["meta"] = info
    return table, info


def load_tokenizer():
    """Return the shared tokenizer, creating it on the first call.

    The checkpoint name comes from ``meta["model_id"]`` as returned by
    ``load_lut()``, so any error raised there propagates from here as well.
    The instance is cached in ``_state["tokenizer"]``.
    """
    cached = _state["tokenizer"]
    if cached is None:
        model_id = load_lut()[1]["model_id"]
        cached = AutoTokenizer.from_pretrained(model_id)
        _state["tokenizer"] = cached
    return cached


def load_model():
    """Return ``(model, tokenizer)`` for generation, loading them once.

    The checkpoint is the one named by ``meta["model_id"]`` (Qwen, according
    to the original author's note, which also estimates ~5 s for the first
    call). Later calls return the cached objects without taking the lock.

    The model is loaded in float32 and switched to eval mode. The tokenizer
    is always re-created here and overwrites whatever ``_state["tokenizer"]``
    held before.
    """
    # Fast path: already loaded, no locking needed.
    if _state["model"] is not None:
        return _state["model"], _state["tokenizer"]

    with _lock:
        # Re-check under the lock: another thread may have finished the load
        # while this one was waiting.
        if _state["model"] is None:
            model_id = load_lut()[1]["model_id"]
            print(f"[engine] loading {model_id} for generation…")
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            net = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float32)
            net.eval()
            # Publish the tokenizer first so that a non-None model slot always
            # comes with its tokenizer.
            _state["tokenizer"] = tokenizer
            _state["model"] = net
            print("[engine] model ready.")
    return _state["model"], _state["tokenizer"]


def color_for(token_id: int) -> Tuple[int, int, int]:
    """Return the color LUT row for ``token_id`` as three plain Python ints.

    The row is read from the table returned by ``load_lut()``; the values are
    converted from NumPy scalars to built-in ``int``.
    """
    table = load_lut()[0]
    red, green, blue = table[token_id]
    return int(red), int(green), int(blue)


def load_audio_lut() -> np.ndarray:
    """Return the cached audio table: ``UMAP_CACHE`` rescaled to ``uint8``.

    Each column of the stored array is min-max scaled on its own and then
    quantized to 0..255. Per the original author's note, these are the 3D
    UMAP coordinates that also drive the color LUT, so a token's color and
    sound are coupled; the values are meant for client-side audio synthesis.

    Raises:
        RuntimeError: if ``UMAP_CACHE`` does not exist.
    """
    if _state["audio_lut"] is not None:
        return _state["audio_lut"]

    if not UMAP_CACHE.exists():
        raise RuntimeError("umap_3d.npy missing — rerun build_lut.py")

    coords = np.load(UMAP_CACHE)
    lo = coords.min(axis=0)
    hi = coords.max(axis=0)
    # The small epsilon keeps a constant column from dividing by zero.
    span = hi - lo + 1e-9
    unit = (coords - lo) / span
    # Adding 0.5 before the truncating cast rounds to the nearest integer.
    quantized = (unit * 255 + 0.5).astype(np.uint8)
    _state["audio_lut"] = quantized
    return quantized


def audio_for(token_id: int) -> Tuple[int, int, int]:
    """Return the audio LUT row for ``token_id`` as three plain Python ints.

    The row is read from the table returned by ``load_audio_lut()``; the
    values are converted from NumPy scalars to built-in ``int``.
    """
    table = load_audio_lut()
    first, second, third = table[token_id]
    return int(first), int(second), int(third)


def get_non_latin_token_ids() -> list:
    """Return token ids whose decoded text contains any non-ASCII character.

    The cut-off is "anything above U+007F" so that the result covers:
      - byte-fallback tokens (per the original note, Qwen encodes CJK as
        3-byte sequences whose bytes decode to U+0080-U+00FF, which would
        slip past a wider Latin filter)
      - all non-Latin scripts
    Cost: accented Latin (é, ü, ñ) is blocked too, which is acceptable for
    English-only output.

    Returns:
        A list of one-element lists, ``[[tid], [tid], ...]`` (presumably the
        ``bad_words_ids`` shape used for generation; confirm against the
        caller). It is computed once and cached in ``_state["bad_words"]``.
    """
    cached = _state.get("bad_words")
    if cached is not None:
        return cached

    tokenizer = load_tokenizer()
    # Prefer the tokenizer's declared vocab_size; fall back to len() when the
    # attribute is missing or falsy.
    vocab = getattr(tokenizer, "vocab_size", None) or len(tokenizer)
    bad = []
    for tid in range(vocab):
        try:
            text = tokenizer.decode([tid])
        except Exception:
            # Deliberate best-effort: an id that cannot be decoded is skipped
            # rather than aborting the whole scan.
            continue
        # str.isascii() is True only when every char is <= U+007F (and for
        # the empty string), the same test as the manual ord() scan.
        if not text.isascii():
            bad.append([tid])
    _state["bad_words"] = bad
    return bad