VoxAI_Chat_API/vox_api.py at master · AlexC1991/VoxAI_Chat_API · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import time
from typing import Generator, Dict, List, Optional, Union
from llama_cpp import Llama
import machine_engine_handshake

class VoxAPI:
    """
    A clean API wrapper for the VOX-AI Engine.
    Designed for easy integration into chat software.
    """

    def __init__(self, model_path: str = None, verbose: bool = False):
        """
        Initialize the VOX Engine with automatic hardware optimization.

        Args:
            model_path: Path to the .gguf model file. If None, auto-detects from ./models
            verbose: Enable detailed logging
        """
        self.verbose = verbose
        self.history: List[Dict[str, str]] = []

        # 1. Hardware Handshake
        self.mode, self.phys_cores, self.config = machine_engine_handshake.get_hardware_config()
        if self.verbose:
            print(f"[VOX API] Mode: {self.mode}")
            print(f"[VOX API] Config: {self.config}")

        # 2. Apply Environment Optimizations
        self._apply_env_optimizations()

        # 3. Model Loading
        if model_path is None:
            model_path = self._auto_find_model()

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model not found at: {model_path}")

        self.model_name = os.path.basename(model_path)

        # 4. Initialize Llama
        self.llm = Llama(
            model_path=model_path,
            n_ctx=2048, # Standard context window

            # Hardware Config
            n_gpu_layers=self.config['n_gpu_layers'],
            n_threads=self.config['n_threads'],
            n_threads_batch=self.config['n_threads_batch'],
            n_batch=self.config['n_batch'],
            flash_attn=self.config['flash_attn'],
            use_mlock=self.config['use_mlock'],
            cache_type_k=self.config['cache_type_k'],
            cache_type_v=self.config['cache_type_v'],

            use_mmap=True,
            verbose=self.verbose
        )

        # 5. Warmup
        self.warmup()

    def _apply_env_optimizations(self):
        """Apply environment variables for APU performance"""
        root_path = os.path.abspath(".")

        # API Specific
        if hasattr(os, 'add_dll_directory'):
            try: os.add_dll_directory(root_path)
            except: pass

        # Performance Variables
        if "busy_wait" in self.config:
            os.environ["GGML_VK_FORCE_BUSY_WAIT"] = self.config["busy_wait"]

        os.environ["GGML_NUMA"] = "0"
        os.environ["GGML_BACKEND_SEARCH_PATH"] = root_path
        os.environ["LLAMA_CPP_LIB"] = os.path.join(root_path, "llama.dll")

    def _auto_find_model(self) -> str:
        """Find the first .gguf file in ./models"""
        models_dir = os.path.abspath("./models")
        if not os.path.exists(models_dir):
            raise FileNotFoundError("Models directory './models' not found")

        files = [f for f in os.listdir(models_dir) if f.endswith(".gguf")]
        if not files:
            raise FileNotFoundError("No .gguf models found in ./models")

        return os.path.join(models_dir, files[0])

    def warmup(self):
        """Run a silent inference to load weights into RAM/VRAM"""
        if self.verbose: print("[VOX API] Warming up...")
        self.llm.create_chat_completion(
            messages=[{"role": "user", "content": "."}],
            max_tokens=1
        )

    def chat(self, user_message: str, stream: bool = True, system_prompt: str = None) -> Union[str, Generator[str, None, None]]:
        """
        Send a message to the AI and get a response.

        Args:
            user_message: The text input from the user
            stream: If True, returns a generator yielding tokens. If False, returns full string.
            system_prompt: Optional override for system prompt (default is "You are a helpful assistant.")
        """

        # Initialize history if empty
        if not self.history:
            sys_msg = system_prompt or "You are a helpful assistant."
            self.history.append({"role": "system", "content": sys_msg})

        # Add user message
        self.history.append({"role": "user", "content": user_message})

        if stream:
            return self._stream_response()
        else:
            return self._full_response()

    def _stream_response(self) -> Generator[str, None, None]:
        """Internal generator for streaming responses"""
        full_response = ""

        stream = self.llm.create_chat_completion(
            messages=self.history,
            max_tokens=2048,
            temperature=0.7,
            top_k=40,
            repeat_penalty=1.1,
            stream=True
        )

        for chunk in stream:
            if "content" in chunk["choices"][0]["delta"]:
                token = chunk["choices"][0]["delta"]["content"]
                full_response += token
                yield token

        # Update history with the complete response
        self.history.append({"role": "assistant", "content": full_response})

    def _full_response(self) -> str:
        """Internal method for non-streaming response"""
        response = self.llm.create_chat_completion(
            messages=self.history,
            max_tokens=2048,
            temperature=0.7,
            top_k=40,
            repeat_penalty=1.1,
            stream=False
        )

        text = response["choices"][0]["message"]["content"]
        self.history.append({"role": "assistant", "content": text})
        return text

    def clear_history(self):
        """Reset conversation context"""
        self.history = []

    def get_stats(self):
        """Get info about the loaded model and hardware"""
        return {
            "model": self.model_name,
            "mode": self.mode,
            "cores": self.phys_cores,
            "gpu_layers": self.config['n_gpu_layers']
        }

# Usage Example
if __name__ == "__main__":
    print("Testing VOX API...")
    try:
        engine = VoxAPI(verbose=True)
        print(f"Loaded: {engine.model_name}")

        print("\nUser: Hello!")
        print("Bot: ", end="")
        for token in engine.chat("Hello!", stream=True):
            print(token, end="", flush=True)
        print("\n\nTest Complete.")
    except Exception as e:
        print(f"Error: {e}")