-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvox_api.py
More file actions
187 lines (152 loc) · 6.37 KB
/
Copy pathvox_api.py
File metadata and controls
187 lines (152 loc) · 6.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import time
from typing import Generator, Dict, List, Optional, Union
from llama_cpp import Llama
import machine_engine_handshake
class VoxAPI:
    """
    A clean API wrapper for the VOX-AI Engine.
    Designed for easy integration into chat software.

    The wrapper owns a single ``Llama`` instance (``self.llm``) and a running
    conversation (``self.history``), stored as a list of
    ``{"role": ..., "content": ...}`` dicts that is sent in full on every
    request.
    """

    # System message used when the caller does not supply one.
    DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."

    # Sampling settings shared by the streaming and non-streaming paths so the
    # two cannot drift apart.
    _GENERATION_KWARGS = {
        "max_tokens": 2048,
        "temperature": 0.7,
        "top_k": 40,
        "repeat_penalty": 1.1,
    }

    def __init__(self, model_path: Optional[str] = None, verbose: bool = False, n_ctx: int = 2048):
        """
        Initialize the VOX Engine with automatic hardware optimization.

        Args:
            model_path: Path to the .gguf model file. If None, auto-detects from ./models
            verbose: Enable detailed logging
            n_ctx: Context window size handed to ``Llama`` (defaults to 2048)

        Raises:
            FileNotFoundError: If no model can be located, or ``model_path``
                does not point at a file.
        """
        self.verbose = verbose
        self.history: List[Dict[str, str]] = []

        # 1. Hardware Handshake
        # get_hardware_config() is unpacked as (mode, physical core count, config dict).
        self.mode, self.phys_cores, self.config = machine_engine_handshake.get_hardware_config()
        if self.verbose:
            print(f"[VOX API] Mode: {self.mode}")
            print(f"[VOX API] Config: {self.config}")

        # 2. Apply Environment Optimizations
        self._apply_env_optimizations()

        # 3. Model Loading
        if model_path is None:
            model_path = self._auto_find_model()
        # isfile (not exists) so that a directory is rejected here with a
        # clear message instead of failing later inside the loader.
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"Model not found at: {model_path}")
        self.model_name = os.path.basename(model_path)

        # 4. Initialize Llama
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            # Hardware Config
            n_gpu_layers=self.config['n_gpu_layers'],
            n_threads=self.config['n_threads'],
            n_threads_batch=self.config['n_threads_batch'],
            n_batch=self.config['n_batch'],
            flash_attn=self.config['flash_attn'],
            use_mlock=self.config['use_mlock'],
            cache_type_k=self.config['cache_type_k'],
            cache_type_v=self.config['cache_type_v'],
            use_mmap=True,
            verbose=self.verbose,
        )

        # 5. Warmup
        self.warmup()

    def _apply_env_optimizations(self):
        """
        Apply environment variables for APU performance.

        Registers the current working directory as a DLL search directory
        (where the platform offers ``os.add_dll_directory``) and exports the
        GGML / llama.cpp environment variables derived from ``self.config``.
        """
        # NOTE(review): llama_cpp is imported at module import time, before
        # this method runs - confirm the variables below are still read late
        # enough to take effect.
        root_path = os.path.abspath(".")

        # API Specific
        if hasattr(os, 'add_dll_directory'):
            try:
                os.add_dll_directory(root_path)
            except OSError as exc:
                # Best effort: a failed registration must not stop start-up,
                # but it should not be invisible either.
                if self.verbose:
                    print(f"[VOX API] Could not add DLL directory {root_path}: {exc}")

        # Performance Variables
        if "busy_wait" in self.config:
            # os.environ only accepts strings; coerce so a numeric/bool config
            # value does not raise TypeError.
            os.environ["GGML_VK_FORCE_BUSY_WAIT"] = str(self.config["busy_wait"])
        os.environ["GGML_NUMA"] = "0"
        os.environ["GGML_BACKEND_SEARCH_PATH"] = root_path
        os.environ["LLAMA_CPP_LIB"] = os.path.join(root_path, "llama.dll")

    def _auto_find_model(self) -> str:
        """
        Find a .gguf file in ./models.

        Returns:
            Absolute path of the alphabetically first ``.gguf`` entry, so the
            choice is the same on every run and platform.

        Raises:
            FileNotFoundError: If ./models is missing (or not a directory) or
                contains no ``.gguf`` entries.
        """
        models_dir = os.path.abspath("./models")
        if not os.path.isdir(models_dir):
            raise FileNotFoundError("Models directory './models' not found")

        # os.listdir() order is arbitrary, hence the explicit sort.
        candidates = sorted(f for f in os.listdir(models_dir) if f.endswith(".gguf"))
        if not candidates:
            raise FileNotFoundError("No .gguf models found in ./models")
        return os.path.join(models_dir, candidates[0])

    def warmup(self):
        """Run a silent one-token inference to load weights into RAM/VRAM."""
        if self.verbose:
            print("[VOX API] Warming up...")
        # The result is discarded and self.history is not touched.
        self.llm.create_chat_completion(
            messages=[{"role": "user", "content": "."}],
            max_tokens=1,
        )

    def chat(self, user_message: str, stream: bool = True, system_prompt: Optional[str] = None) -> Union[str, Generator[str, None, None]]:
        """
        Send a message to the AI and get a response.

        Args:
            user_message: The text input from the user
            stream: If True, returns a generator yielding tokens. If False, returns full string.
            system_prompt: Optional override for system prompt (default is "You are a helpful assistant.").
                On the first turn it seeds the conversation; on later turns a
                non-empty value replaces the existing leading system message.

        Returns:
            The full reply as a string, or a generator of reply fragments when
            ``stream`` is True. With streaming, the assistant reply is added to
            the history only once the generator is consumed or closed.
        """
        if not self.history:
            # Initialize history if empty
            self.history.append({
                "role": "system",
                "content": system_prompt or self.DEFAULT_SYSTEM_PROMPT,
            })
        elif system_prompt and self.history[0].get("role") == "system":
            # Honour the documented override on later turns as well; a fresh
            # dict is used so earlier references to the old message are not
            # mutated behind the caller's back.
            self.history[0] = {"role": "system", "content": system_prompt}

        # Add user message
        self.history.append({"role": "user", "content": user_message})

        if stream:
            return self._stream_response()
        return self._full_response()

    def _drop_unanswered_user_turn(self):
        """Remove a trailing user message that never received a reply."""
        if self.history and self.history[-1].get("role") == "user":
            self.history.pop()

    def _stream_response(self) -> Generator[str, None, None]:
        """
        Internal generator for streaming responses.

        Yields each content fragment as it arrives. When the generator ends -
        normally, by being closed early, or by an error - the text produced so
        far is stored as the assistant turn. If nothing was produced because
        of an error, the pending user turn is removed instead, so a retry does
        not leave two consecutive user messages in the history.
        """
        pieces: List[str] = []
        finished = False
        try:
            stream = self.llm.create_chat_completion(
                messages=self.history,
                stream=True,
                **self._GENERATION_KWARGS,
            )
            for chunk in stream:
                # Some chunks carry no text (e.g. a role-only delta, or a
                # delta whose content is None); skip those.
                token = chunk["choices"][0]["delta"].get("content")
                if token is None:
                    continue
                pieces.append(token)
                yield token
            finished = True
        finally:
            if finished or pieces:
                # Update history with the (possibly partial) response
                self.history.append({"role": "assistant", "content": "".join(pieces)})
            else:
                self._drop_unanswered_user_turn()

    def _full_response(self) -> str:
        """
        Internal method for non-streaming response.

        Returns:
            The assistant's reply text, which is also appended to the history.
            If the completion call raises, the pending user turn is removed
            and the exception is re-raised unchanged.
        """
        try:
            response = self.llm.create_chat_completion(
                messages=self.history,
                stream=False,
                **self._GENERATION_KWARGS,
            )
        except Exception:
            self._drop_unanswered_user_turn()
            raise

        # Guard against a missing (None) content so the declared str return
        # type holds.
        text = response["choices"][0]["message"]["content"] or ""
        self.history.append({"role": "assistant", "content": text})
        return text

    def clear_history(self):
        """Reset conversation context; the next chat() call starts a new conversation."""
        self.history = []

    def get_stats(self) -> Dict[str, object]:
        """
        Get info about the loaded model and hardware.

        Returns:
            Dict with the keys ``model``, ``mode``, ``cores`` and ``gpu_layers``.
        """
        return {
            "model": self.model_name,
            "mode": self.mode,
            "cores": self.phys_cores,
            "gpu_layers": self.config['n_gpu_layers'],
        }
# Usage Example
# Smoke test: build the engine with auto-detected model, stream one reply to
# stdout, and report any failure as a single "Error: ..." line.
if __name__ == "__main__":
    print("Testing VOX API...")
    try:
        api = VoxAPI(verbose=True)
        print(f"Loaded: {api.model_name}")

        print("\nUser: Hello!")
        print("Bot: ", end="")
        reply_stream = api.chat("Hello!", stream=True)
        for piece in reply_stream:
            # flush so each fragment appears as soon as it is generated
            print(piece, end="", flush=True)

        print("\n\nTest Complete.")
    except Exception as exc:
        print(f"Error: {exc}")