-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_engine.py
More file actions
117 lines (99 loc) · 3.21 KB
/
Copy pathdebug_engine.py
File metadata and controls
117 lines (99 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import sys
import os
import time
import psutil
# NOTE: We are NOT importing ctypes or setting LLAMA_CPP_LIB.
# We are letting the library work exactly as designed.
from llama_cpp import Llama
# ==========================================
# 0. SYSTEM PREP
# ==========================================
try:
    # Best-effort priority bump for the benchmark process (presumably to
    # reduce interference from other programs while timing).
    # psutil.HIGH_PRIORITY_CLASS is a Windows-only constant, so on other
    # platforms this is expected to fail and we simply carry on.
    p = psutil.Process(os.getpid())
    p.nice(psutil.HIGH_PRIORITY_CLASS)
except Exception as exc:
    # Priority is an optimisation, never a requirement: report and continue.
    # Catching Exception (instead of a bare ``except:``) lets Ctrl+C and
    # SystemExit propagate normally.
    print(f"[WARN] Could not raise process priority: {exc}", file=sys.stderr)

# Directory containing this script; other paths are resolved relative to it.
root_path = os.path.dirname(os.path.abspath(__file__))
# ==========================================
# 1. MODEL DETECTION
# ==========================================
MODELS_DIR = os.path.join(root_path, "models")
# isdir (not exists): a regular file that happens to be called "models" must
# not pass this check and then crash inside os.listdir().
if not os.path.isdir(MODELS_DIR):
    sys.exit(f"\n[ERROR] Models folder not found at {MODELS_DIR}")

# Sorted so the menu numbering is stable between runs (os.listdir returns
# entries in arbitrary order). The extension check ignores case so that a
# file named e.g. "MODEL.GGUF" is listed as well.
model_files = sorted(
    f for f in os.listdir(MODELS_DIR) if f.lower().endswith(".gguf")
)
if not model_files:
    sys.exit("\n[ERROR] No .gguf files found in ./models")

print("\nAVAILABLE MODELS:")
for i, f in enumerate(model_files):
    print(f" [{i+1}] {f}")

# Keep asking until the user enters a valid 1-based menu number.
while True:
    try:
        idx = int(input("\nSelect model for benchmark (1-N): "))
    except ValueError:
        # Non-numeric input: explain instead of silently re-prompting.
        print(f" Please enter a number between 1 and {len(model_files)}.")
        continue
    except EOFError:
        # stdin was closed (e.g. piped/non-interactive run): nothing to pick.
        sys.exit("\n[ERROR] No model selected (input stream closed)")
    if 1 <= idx <= len(model_files):
        model_path = os.path.join(MODELS_DIR, model_files[idx - 1])
        break
    print(f" Please enter a number between 1 and {len(model_files)}.")
# The user message sent to the model for the timed generation.
PROMPT = "Explain how a computer processor works in 100 words."
# ==========================================
# 2. THE CLEAN SHOOTOUT
# ==========================================
# Load settings to compare. The author's stated goal is to check whether the
# library's "default" behavior (which Ollama likely mimics) performs better.
# Each entry carries a display name plus the GPU layer count, thread count
# and batch size used when loading the model.
configs = [
    # Baseline: everything on the CPU with a 512 batch.
    dict(name="Standard CPU (Default Load)", gpu_layers=0, threads=8, batch=512),
    # Same as the baseline but with a doubled batch (author's note: Ollama
    # uses larger batches).
    dict(name="Heavy CPU (High Batch)", gpu_layers=0, threads=8, batch=1024),
    # Offload a small number of layers to the GPU.
    dict(name="Hybrid (12 Layers)", gpu_layers=12, threads=8, batch=512),
]
print("\n" + "="*50)
print(" VOX-AI: SANITY CHECK (NO DLL OVERRIDES)")
print("="*50)

# Run every configuration in turn: load the model, do a one-token warmup,
# then time a streamed completion of PROMPT and report chunks per second.
for cfg in configs:
    print(f"\n--- TESTING: {cfg['name']} ---")
    llm = None
    stream = None
    try:
        # PURE STANDARD LOAD
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_gpu_layers=cfg['gpu_layers'],
            n_threads=cfg['threads'],
            n_batch=cfg['batch'],
            verbose=False
        )

        # Warmup: a throwaway one-token request before the timed run.
        llm.create_chat_completion(messages=[{"role":"user","content":"."}], max_tokens=1)

        # Sprint. perf_counter is monotonic, so the measurement cannot be
        # distorted by wall-clock adjustments. The timer starts before the
        # request is issued, so the figure covers everything up to the first
        # chunk as well as the generation itself.
        start = time.perf_counter()
        token_count = 0
        stream = llm.create_chat_completion(
            messages=[{"role": "user", "content": PROMPT}],
            max_tokens=200,
            stream=True
        )
        print(" >> Generating...", end="", flush=True)
        for chunk in stream:
            # Every streamed chunk that carries "content" is counted as one
            # token; chunks without it (e.g. role-only deltas) are skipped.
            if "content" in chunk["choices"][0]["delta"]:
                token_count += 1
                # One progress dot per 20 counted tokens.
                if token_count % 20 == 0:
                    print(".", end="", flush=True)
        duration = time.perf_counter() - start

        # Guard against a zero-length interval instead of reporting a
        # ZeroDivisionError as a failed run.
        tps = token_count / duration if duration > 0 else 0.0
        print(f"\n >> SPEED: {tps:.2f} t/s")
    except Exception as e:
        # One broken configuration must not abort the remaining ones.
        print(f"\n >> FAILED: {e}")
    finally:
        # Drop our references on success AND on failure. Previously a failed
        # run kept the old model bound to ``llm`` while the next
        # configuration loaded its own copy.
        stream = None
        llm = None

print("\n" + "="*50)
print(" TEST COMPLETE")
print("="*50)
try:
    # Keep the console window open until the user acknowledges the results.
    input("Press Enter to exit...")
except EOFError:
    # No interactive stdin (e.g. piped run): there is nobody to wait for.
    pass