From aaa97612d31fe0bb329a7f5c34041fa01f76f16f Mon Sep 17 00:00:00 2001
From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com>
Date: Fri, 15 Aug 2025 18:39:55 +0400
Subject: [PATCH 1/4] Raw /v1/audio/transcription

---
 llama_runner/config_loader.py            |  88 ++++++++++++-
 llama_runner/headless_service_manager.py |   7 +-
 llama_runner/llama_cpp_runner.py         |   1 -
 llama_runner/llama_runner_manager.py     | 106 +++++++++++++++-
 llama_runner/lmstudio_proxy_thread.py    |  36 +++++-
 llama_runner/main_window.py              |  23 +++-
 llama_runner/ollama_proxy_thread.py      |  44 ++++++-
 llama_runner/whisper_cpp_runner.py       | 155 +++++++++++++++++++++++
 8 files changed, 447 insertions(+), 13 deletions(-)
 create mode 100644 llama_runner/whisper_cpp_runner.py

diff --git a/llama_runner/config_loader.py b/llama_runner/config_loader.py
index c3a5921..98ad3e8 100644
--- a/llama_runner/config_loader.py
+++ b/llama_runner/config_loader.py
@@ -2,18 +2,22 @@
 import json
 import logging
 
+
 CONFIG_DIR = os.path.expanduser("~/.llama-runner")
 CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json")
 LOG_FILE = os.path.join(CONFIG_DIR, "error.log")
 
+
 # Ensure the log directory exists
 if not os.path.exists(CONFIG_DIR):
     os.makedirs(CONFIG_DIR, exist_ok=True)
 
+
 # Set up logging
 logging.basicConfig(filename=LOG_FILE, level=logging.ERROR,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 
+
 def ensure_config_exists():
     """
     Ensures that the configuration directory and file exist.
@@ -27,6 +31,7 @@ def ensure_config_exists():
             logging.error(f"Error creating config directory: {e}")
             return False
 
+
     if not os.path.exists(CONFIG_FILE):
         try:
             default_config = {
@@ -48,6 +53,7 @@ def ensure_config_exists():
             return False
     return True
 
+
 def load_config():
     """
     Loads the configuration from the JSON file.
@@ -56,21 +62,23 @@ def load_config():
     if not ensure_config_exists():
         return {}
 
+
     try:
         with open(CONFIG_FILE, "r") as f:
             config = json.load(f)
-
             # Ensure default_runtime and concurrentRunners exist
             if "default_runtime" not in config:
                 config["default_runtime"] = "llama-server"
             if "concurrentRunners" not in config:
                 config["concurrentRunners"] = 1
 
+
             # Ensure proxies section and its sub-keys exist with defaults
             proxies_config = config.get("proxies", {})
             if not isinstance(proxies_config, dict): # Handle case where 'proxies' might exist but not as a dict
                 proxies_config = {}
 
+
             ollama_proxy_config = proxies_config.get("ollama", {})
             if not isinstance(ollama_proxy_config, dict):
                 ollama_proxy_config = {}
@@ -78,6 +86,7 @@ def load_config():
                 ollama_proxy_config["enabled"] = True
             proxies_config["ollama"] = ollama_proxy_config
 
+
             lmstudio_proxy_config = proxies_config.get("lmstudio", {})
             if not isinstance(lmstudio_proxy_config, dict):
                 lmstudio_proxy_config = {}
@@ -89,6 +98,7 @@ def load_config():
             
             config["proxies"] = proxies_config
 
+
             # Ensure logging section and its sub-keys exist with defaults
             logging_config = config.get("logging", {})
             if not isinstance(logging_config, dict): # Handle case where 'logging' might exist but not as a dict
@@ -134,6 +144,80 @@ def load_config():
                 print("Warning: Config: 'llama-runtimes' key exists but is not a dictionary. Ignoring.")
             # If 'llama-runtimes' is not in config or is None, it's handled gracefully (no changes made to it)
 
+
+
+            raw_audio = config.get("audio")
+            if isinstance(raw_audio, dict):
+                # Process runtimes
+                raw_runtimes = raw_audio.get("runtimes")
+                processed_runtimes = {}
+                if isinstance(raw_runtimes, dict):
+                    for name, details in raw_runtimes.items():
+                        if isinstance(details, dict):
+                            runtime_path = details.get("runtime")
+                            if isinstance(runtime_path, str) and runtime_path.strip():
+                                processed_runtimes[name] = {
+                                    "runtime": runtime_path.strip()
+                                }
+                            else:
+                                logging.warning(f"Config: Audio runtime '{name}' has invalid or empty 'runtime' path. Skipping.")
+                                print(f"Warning: Config: Audio runtime '{name}' has invalid or empty 'runtime' path. Skipping.")
+                        else:
+                            logging.warning(f"Config: Audio runtime '{name}' details should be a dictionary. Skipping.")
+                            print(f"Warning: Config: Audio runtime '{name}' details should be a dictionary. Skipping.")
+                elif raw_runtimes is not None:
+                    logging.warning("Config: 'audio.runtimes' exists but is not a dictionary. Ignoring.")
+                    print("Warning: Config: 'audio.runtimes' exists but is not a dictionary. Ignoring.")
+
+
+                # Process models
+                raw_models = raw_audio.get("models")
+                processed_models = {}
+                if isinstance(raw_models, dict):
+                    for model_name, model_info in raw_models.items():
+                        if isinstance(model_info, dict):
+                            model_path = model_info.get("model_path")
+                            runtime = model_info.get("runtime")
+                            parameters = model_info.get("parameters", {})
+                            if isinstance(model_path, str) and model_path.strip():
+                                if isinstance(parameters, dict):
+                                    processed_models[model_name] = {
+                                        "model_path": model_path.strip(),
+                                        "runtime": runtime,
+                                        "parameters": parameters
+                                    }
+                                else:
+                                    logging.warning(f"Config: Parameters for model '{model_name}' should be a dictionary. Using empty dict instead.")
+                                    print(f"Warning: Config: Parameters for model '{model_name}' should be a dictionary. Using empty dict instead.")
+                                    processed_models[model_name] = {
+                                        "model_path": model_path.strip(),
+                                        "runtime": runtime,
+                                        "parameters": {}
+                                    }
+                            else:
+                                logging.warning(f"Config: Model '{model_name}' has invalid or empty 'model_path'. Skipping.")
+                                print(f"Warning: Config: Model '{model_name}' has invalid or empty 'model_path'. Skipping.")
+                        else:
+                            logging.warning(f"Config: Model entry '{model_name}' is not a dictionary. Skipping.")
+                            print(f"Warning: Config: Model entry '{model_name}' is not a dictionary. Skipping.")
+                elif raw_models is not None:
+                    logging.warning("Config: 'audio.models' exists but is not a dictionary. Ignoring.")
+                    print("Warning: Config: 'audio.models' exists but is not a dictionary. Ignoring.")
+
+
+                # Update the audio section in config
+                config["audio"] = {
+                    "runtimes": processed_runtimes,
+                    "models": processed_models
+                }
+
+
+            elif raw_audio is not None:
+                logging.warning("Config: 'audio' key exists but is not a dictionary. Ignoring.")
+                print("Warning: Config: 'audio' key exists but is not a dictionary. Ignoring.")
+
+
+             
             print(f"Loaded config (processed): {config}")  # Print loaded config
             return config
     except (OSError, json.JSONDecodeError) as e:
@@ -141,6 +225,7 @@ def load_config():
         logging.error(f"Error loading config file: {e}")
         return {}
 
+
 def calculate_system_fingerprint(config: dict) -> str:
     """Calculates a 16-character hash of the configuration parameters."""
     import hashlib
@@ -149,6 +234,7 @@ def calculate_system_fingerprint(config: dict) -> str:
     hash_object = hashlib.md5(config_str.encode())
     return hash_object.hexdigest()[:16]
 
+
 if __name__ == '__main__':
     # Example usage:
     config = load_config()
diff --git a/llama_runner/headless_service_manager.py b/llama_runner/headless_service_manager.py
index 54bfe75..25e7d1a 100644
--- a/llama_runner/headless_service_manager.py
+++ b/llama_runner/headless_service_manager.py
@@ -35,7 +35,8 @@ def _initialize_services(self):
         # Initialize LlamaRunnerManager
         self.llama_runner_manager = LlamaRunnerManager(
             models=self.models_specific_config, # This is app_config['models']
-            llama_runtimes=self.app_config.get('llama-runtimes', {}), # Ensure correct key
+            llama_runtimes=self.app_config.get('llama-runtimes', {}), # Ensure correct key,
+            audio_config=self.app_config.get('audio', {}),
             default_runtime=self.app_config.get('default_runtime', 'llama-server'), # Ensure correct key and default
             model_status_widgets={} # No UI widgets in headless mode
         )
@@ -48,6 +49,7 @@ def _initialize_services(self):
 
         # Get proxy and logging settings from the unified config
         proxies_config = self.app_config.get('proxies', {})
+        audio_config = self.app_config.get('audio', {})
         ollama_proxy_settings = proxies_config.get('ollama', {})
         lmstudio_proxy_settings = proxies_config.get('lmstudio', {})
         logging_settings = self.app_config.get('logging', {})
@@ -63,7 +65,9 @@ def _initialize_services(self):
             self.ollama_proxy = OllamaProxyThread(
                 all_models_config=self.models_specific_config,
                 runtimes_config=self.app_config.get('llama-runtimes', {}),
+                audio_config=audio_config,
                 is_model_running_callback=self.llama_runner_manager.is_llama_runner_running,
+                is_model_whisper_running=self.llama_runner_manager.is_whisper_runner_running,
                 get_runner_port_callback=self.llama_runner_manager.get_runner_port,
                 request_runner_start_callback=self.llama_runner_manager.request_runner_start,
                 prompt_logging_enabled=prompt_logging_enabled,
@@ -80,6 +84,7 @@ def _initialize_services(self):
             self.lmstudio_proxy = LMStudioProxyThread( # Using the aliased FastAPIProxyThread
                 all_models_config=self.models_specific_config,
                 runtimes_config=self.app_config.get('llama-runtimes', {}),
+                audio_config=audio_config,
                 is_model_running_callback=self.llama_runner_manager.is_llama_runner_running,
                 get_runner_port_callback=self.llama_runner_manager.get_runner_port,
                 request_runner_start_callback=self.llama_runner_manager.request_runner_start,
diff --git a/llama_runner/llama_cpp_runner.py b/llama_runner/llama_cpp_runner.py
index 0ba32eb..15c13e2 100644
--- a/llama_runner/llama_cpp_runner.py
+++ b/llama_runner/llama_cpp_runner.py
@@ -84,7 +84,6 @@ async def start(self):
                 command.append(f"--{arg_name}")
                 command.append(str(value))
 
-        print(f"Starting llama.cpp server with command: {' '.join(command)}")
         logging.info(f"Starting llama.cpp server with command: {' '.join(command)}")
 
         # Clear the output buffer before starting a new process
diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py
index 5b578b6..5a36ed2 100644
--- a/llama_runner/llama_runner_manager.py
+++ b/llama_runner/llama_runner_manager.py
@@ -9,6 +9,8 @@
 from llama_runner.llama_runner_thread import LlamaRunnerThread, RunnerStoppedEvent, RunnerErrorEvent
 from llama_runner.error_output_dialog import ErrorOutputDialog
 
+from llama_runner.whisper_cpp_runner import WhisperServer
+
 class LlamaRunnerManager(QObject):
     # Define a custom event type for events posted to the parent (e.g., MainWindow)
     # This replaces the QEvent.User + 4 magic number.
@@ -23,6 +25,7 @@ def __init__(
         models: dict,
         llama_runtimes: dict,
         default_runtime: str,
+        audio_config,
         model_status_widgets: dict,
         # runner_port_ready_for_proxy and runner_stopped_for_proxy are now class attributes
         parent=None,
@@ -41,6 +44,91 @@ def __init__(
         self._runner_startup_futures: Dict[str, asyncio.Future] = {}
         self._current_running_model: Optional[str] = None
         self.concurrent_runners_limit = 1  # Will be set by MainWindow after instantiation
+        self.audio_config = audio_config
+        self.whisper_servers: Dict[str, WhisperServer] = {}
+    
+    
+    
+    def start_whisper_server(self, model_name: str) -> None:
+        """
+        Создать и запустить whisper сервер для модели.
+        """
+        if model_name in self.whisper_servers:
+            logging.info(f"WhisperServer already started for model {model_name}")
+            return
+        try:
+            print(f"Starting Runner for {model_name}...")
+            status_widget = self.model_status_widgets.get(model_name)
+            if status_widget:
+                status_widget.update_status("Starting...")
+                status_widget.update_port("N/A")
+                status_widget.set_buttons_enabled(False, False)
+
+            whisper_server = WhisperServer(self.audio_config, model_name)
+            whisper_server.start_server()
+            self.whisper_servers[model_name] = whisper_server
+            port = whisper_server.get_port()
+
+            logging.info(f"Whisper Runner started for model {model_name}")
+            
+            print(f"Whisper Runner for {model_name} ready on port {port}.")
+            status_widget = self.model_status_widgets.get(model_name)
+            if status_widget:
+                status_widget.update_port(port)
+                status_widget.update_status("Running")
+                status_widget.set_buttons_enabled(False, True)
+            
+        except Exception as e:
+            logging.error(f"Failed to start WhisperServer for {model_name}: {e}")
+
+
+    def stop_whisper_server(self, model_name: str) -> None:
+        """
+        Остановить whisper сервер для модели.
+        """
+        whisper_server = self.whisper_servers.get(model_name)
+        if whisper_server:
+            try:
+                status_widget = self.model_status_widgets.get(model_name)
+                if status_widget:
+                    status_widget.update_status("Stopping...")
+                    status_widget.set_buttons_enabled(False, False)
+                print(f"Stopping Runner for {model_name}...")
+                whisper_server.stop_server()
+                del self.whisper_servers[model_name]
+                if status_widget:
+                    status_widget.update_status("Not Running")
+                    status_widget.update_port("N/A")
+                    status_widget.set_buttons_enabled(True, False)
+                logging.info(f"WhisperServer stopped for model {model_name}")
+            except Exception as e:
+                logging.error(f"Error stopping WhisperServer for {model_name}: {e}")
+        else:
+            logging.warning(f"WhisperServer for {model_name} not running")
+
+
+    def stop_all_whisper_servers(self) -> None:
+        """
+        Остановить все запущенные whisper сервера.
+        """
+        for model_name in list(self.whisper_servers.keys()):
+            self.stop_whisper_server(model_name)
+    
+    
+    def is_whisper_runner_running(self, model_name: str) -> bool:
+        whisper_class = self.whisper_servers.get(model_name)
+        if whisper_class:
+            return True
+        return False
+    
+    
+    def get_whisper_port(self, model_name: str) -> Optional[int]:
+        whisper_class = self.whisper_servers.get(model_name)
+        if whisper_class:
+            return whisper_class.get_port()
+        return None
+
+
 
     def set_concurrent_runners_limit(self, limit: int):
         self.concurrent_runners_limit = limit
@@ -57,8 +145,21 @@ def get_runner_port(self, model_name: str) -> Optional[int]:
             return thread.runner.get_port()
         return None
 
-    def request_runner_start(self, model_name: str) -> asyncio.Future:
+    def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future:
         logging.info(f"Received request to start runner for model: {model_name}")
+        if iswhisper:
+            if self.is_whisper_runner_running(model_name):
+                logging.info(f"Runner for {model_name} is already starting. Returning existing Future.")
+                return self._runner_startup_futures[model_name]
+            
+            self.stop_all_whisper_servers()
+            self.start_whisper_server(model_name)
+            future = asyncio.Future()
+            future.set_result(self.get_whisper_port(model_name))
+            self._runner_startup_futures[model_name] = future
+            QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name))
+            return future
+
 
         if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done():
             logging.info(f"Runner for {model_name} is already starting. Returning existing Future.")
@@ -311,4 +412,5 @@ def check_runner_statuses(self):
                     if app_instance and parent_object:
                         app_instance.postEvent(parent_object, parent_event)
                     else:
-                        logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.")
\ No newline at end of file
+                        logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.")
+        
\ No newline at end of file
diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py
index 5ffd7d4..44878b8 100644
--- a/llama_runner/lmstudio_proxy_thread.py
+++ b/llama_runner/lmstudio_proxy_thread.py
@@ -20,6 +20,10 @@
 from llama_runner import gguf_metadata # Import the new metadata module
 from llama_runner.config_loader import calculate_system_fingerprint
 
+from llama_runner.whisper_cpp_runner import WhisperServer
+from io import BytesIO
+from fastapi import UploadFile as FastAPIUploadFile
+
 # Configure logging (already done in main.py for configurable levels)
 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
@@ -974,6 +978,29 @@ async def _v1_embeddings_handler(request: Request):
         logging.error(f"Error in /v1/embeddings handler: {e}\n{traceback.format_exc()}")
         return JSONResponse(content={"error": {"message": f"Internal server error: {e}", "type": "internal_error"}}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
 
+@app.post("/v1/audio/transcriptions")
+async def openai_speech_to_text(request: Request):
+    """Function to convert speech to text using whisper.cpp"""
+    try:
+        request_runner_start_callback = request.app.state.request_runner_start_callback
+        form = await request.form() 
+        file = form.get("file") 
+        contents = await file.read()
+        fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents))
+        model = str(form.get("model"))
+        audio_config = request.app.state.audio_config
+        request_runner_start_callback(model, True)
+        whisper_server = WhisperServer(audio_config, model)
+        audio_file_path = whisper_server.convert_to_wav(fastapi_file)
+        result = whisper_server.transcribe_audio(audio_file_path)
+        return JSONResponse(content=result)
+    except json.JSONDecodeError:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body")
+    except Exception as e:
+        logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}")
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request")
+
+
 logging.info("Updated dynamic routing handlers for /v1/chat/completions, /v1/completions, /v1/embeddings to support conditional streaming.")
 
 
@@ -999,7 +1026,8 @@ class FastAPIProxyThread(QThread): # Renamed class
 
     def __init__(self,
                  all_models_config: Dict[str, Dict[str, Any]], # Renamed models_config
-                 runtimes_config: Dict[str, Dict[str, Any]], # Added runtimes_config
+                 runtimes_config: Dict[str, Dict[str, Any]], # Added runtimes_config,
+                 audio_config,
                  is_model_running_callback: Callable[[str], bool],
                  get_runner_port_callback: Callable[[str], Optional[int]],
                  request_runner_start_callback: Callable[[str], asyncio.Future], # Callback now returns Future
@@ -1009,6 +1037,7 @@ def __init__(self,
         super().__init__()
         self.all_models_config = all_models_config # Store all_models_config
         self.runtimes_config = runtimes_config # Store runtimes_config
+        self.audio_config = audio_config
         self.is_model_running_callback = is_model_running_callback
         self.get_runner_port_callback = get_runner_port_callback
         self.request_runner_start_callback = request_runner_start_callback # Store the callback
@@ -1128,6 +1157,11 @@ async def run_async(self):
             app.state.request_runner_start_callback = self.request_runner_start_callback # Pass the new callback
             app.state.prompt_logging_enabled = self.prompt_logging_enabled # Set prompt logging flag on state
             app.state.prompts_logger = self.prompts_logger # Set prompts logger on state
+            
+            # Audio global variables
+            app.state.audio_config = self.audio_config
+            app.state.running_whisper = None
+            
             # Extract metadata for all models and store it in app.state.models_metadata
             # Note: get_all_models_lmstudio_format expects the main models config (all_models_config)
             app.state.models_metadata = gguf_metadata.get_all_models_lmstudio_format(
diff --git a/llama_runner/main_window.py b/llama_runner/main_window.py
index afb18e2..93f7296 100644
--- a/llama_runner/main_window.py
+++ b/llama_runner/main_window.py
@@ -37,7 +37,9 @@ def __init__(self):
         # Load settings from config with defaults
         self.prompt_logging_enabled = self.config.get('logging', {}).get('prompt_logging_enabled', False)
         self.llama_runtimes = self.config.get("llama-runtimes", {})
+        self.audio_config = self.config.get("audio")
         self.default_runtime = self.config.get("default_runtime", "llama-server")
+        self.audio_models = self.audio_config.get("models", {})
         self.models = self.config.get("models", {})
         self.concurrent_runners_limit = self.config.get("concurrentRunners", 1)
         if not isinstance(self.concurrent_runners_limit, int) or self.concurrent_runners_limit < 1:
@@ -63,7 +65,11 @@ def __init__(self):
                     self.model_metadata_cache[model_name] = metadata
             else:
                 logging.warning(f"Model '{model_name}' has no 'model_path' in config. Skipping metadata caching.")
-
+        for model_name, model_config in self.audio_models.items():
+            model_path = model_config.get("model_path")
+            if model_path:
+                logging.warning(f"Model '{model_name}' has no 'model_path' in config. Skipping metadata caching.")
+        
         self.fastapi_proxy_thread: Optional[FastAPIProxyThread] = None
         self.ollama_proxy_thread: Optional[OllamaProxyThread] = None
 
@@ -114,6 +120,14 @@ def __init__(self):
             self.model_status_widgets[model_name] = status_widget
             status_widget.start_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.request_runner_start(name))
             status_widget.stop_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.stop_llama_runner(name))
+        for model_name in self.audio_models.keys():
+            self.model_list_widget.addItem(model_name)
+            #model_metadata = self.model_metadata_cache.get(model_name)
+            status_widget = ModelStatusWidget(model_name)
+            self.model_status_stack.addWidget(status_widget)
+            self.model_status_widgets[model_name] = status_widget
+            status_widget.start_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.request_runner_start(name, True))
+            status_widget.stop_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.stop_whisper_server(name))
 
         self.model_list_widget.currentItemChanged.connect(self.on_model_selection_changed)
 
@@ -136,13 +150,14 @@ def __init__(self):
         self.llama_runner_manager = LlamaRunnerManager(
             models=self.models,
             llama_runtimes=self.llama_runtimes,
+            audio_config=self.audio_config,
             default_runtime=self.default_runtime,
             model_status_widgets=self.model_status_widgets,
             # runner_port_ready_for_proxy and runner_stopped_for_proxy are now owned by LlamaRunnerManager
             parent=self,
         )
         self.llama_runner_manager.set_concurrent_runners_limit(self.concurrent_runners_limit)
-
+        
         # --- Start the FastAPI Proxy (for LM Studio) automatically if enabled ---
         if self.lmstudio_proxy_enabled:
             self.start_fastapi_proxy()
@@ -248,6 +263,7 @@ def on_model_selection_changed(self, current_item, previous_item):
                 self.model_status_stack.setCurrentWidget(self.no_model_selected_widget)
         else:
             self.model_status_stack.setCurrentWidget(self.no_model_selected_widget)
+    
 
     # Runner management methods moved to LlamaRunnerManager
 
@@ -264,6 +280,7 @@ def start_fastapi_proxy(self):
         self.fastapi_proxy_thread = FastAPIProxyThread(
             all_models_config=self.models,
             runtimes_config=self.llama_runtimes,
+            audio_config=self.audio_config,
             is_model_running_callback=self.llama_runner_manager.is_llama_runner_running,
             get_runner_port_callback=self.llama_runner_manager.get_runner_port,
             request_runner_start_callback=self.llama_runner_manager.request_runner_start,
@@ -293,7 +310,9 @@ def start_ollama_proxy(self):
         self.ollama_proxy_thread = OllamaProxyThread(
             all_models_config=self.models,
             runtimes_config=self.llama_runtimes,
+            audio_config=self.audio_config,
             is_model_running_callback=self.llama_runner_manager.is_llama_runner_running,
+            is_model_whisper_running=self.llama_runner_manager.is_whisper_runner_running,
             get_runner_port_callback=self.llama_runner_manager.get_runner_port,
             request_runner_start_callback=self.llama_runner_manager.request_runner_start,
             prompt_logging_enabled=self.prompt_logging_enabled,
diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py
index 0a75e55..54778f5 100644
--- a/llama_runner/ollama_proxy_thread.py
+++ b/llama_runner/ollama_proxy_thread.py
@@ -20,6 +20,10 @@
     chatRequestFromOllama, chatResponseToOllama
 )
 
+from llama_runner.whisper_cpp_runner import WhisperServer
+from io import BytesIO
+from fastapi import UploadFile as FastAPIUploadFile
+
 # --- Create our own FastAPI app instance ---
 app = FastAPI()
 # --- End create app instance ---
@@ -604,9 +608,6 @@ async def list_models(request: Request):
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing list models request")
 
 
-
-# --- End handlers for Ollama API endpoints ---
-
 @app.post("/api/show")
 async def show_model_info(request: Request):
     """Handles Ollama /api/show requests."""
@@ -713,7 +714,6 @@ async def list_openai_models(request: Request):
         logging.error(f"Error handling /v1/models: {e}\n{traceback.format_exc()}")
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing list models request")
 
-# --- End handlers for OpenAI compatible API endpoints (v1) ---
 
 @app.post("/v1/completions")
 async def openai_completions(request: Request):
@@ -790,6 +790,32 @@ async def openai_embeddings(request: Request):
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing embeddings request")
 
 
+@app.post("/v1/audio/transcriptions")
+async def openai_speech_to_text(request: Request):
+    """Function to convert speech to text using whisper.cpp"""
+    try:
+        request_runner_start_callback = request.app.state.request_runner_start_callback
+        form = await request.form() 
+        file = form.get("file") 
+        contents = await file.read()
+        fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents))
+        model = str(form.get("model"))
+        audio_config = request.app.state.audio_config
+        request_runner_start_callback(model, True)
+        whisper_server = WhisperServer(audio_config, model)
+        audio_file_path = whisper_server.convert_to_wav(fastapi_file)
+        result = whisper_server.transcribe_audio(audio_file_path)
+        return JSONResponse(content=result)
+    except json.JSONDecodeError:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body")
+    except Exception as e:
+        logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}")
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request")
+
+
+# --- End handlers for OpenAI compatible API endpoints (v1) ---
+
+
 class OllamaProxyThread(QThread):
     """
     QThread to run the FastAPI proxy emulating the Ollama API in a separate thread.
@@ -802,7 +828,9 @@ class OllamaProxyThread(QThread):
     def __init__(self,
                  all_models_config: Dict[str, Dict[str, Any]], # Renamed models_config to all_models_config
                  runtimes_config: Dict[str, Dict[str, Any]], # Renamed models_config to runtimes_config
+                 audio_config,
                  is_model_running_callback: Callable[[str], bool],
+                 is_model_whisper_running,
                  get_runner_port_callback: Callable[[str], Optional[int]],
                  request_runner_start_callback: Callable[[str], asyncio.Future],
                  prompt_logging_enabled: bool, # Add prompt logging flag
@@ -810,8 +838,10 @@ def __init__(self,
         super().__init__()
         self.all_models_config = all_models_config # Store all_models_config
         self.runtimes_config = runtimes_config # Store runtimes_config
+        self.audio_config = audio_config
         self.is_model_running_callback = is_model_running_callback
         self.get_runner_port_callback = get_runner_port_callback
+        self.is_model_whisper_running = is_model_whisper_running
         self.request_runner_start_callback = request_runner_start_callback
         self.prompt_logging_enabled = prompt_logging_enabled # Store the flag
         self.prompts_logger = prompts_logger # Store the logger instance
@@ -903,11 +933,15 @@ async def run_async(self):
             app.state.all_models_config = self.all_models_config # Pass all_models_config
             app.state.runtimes_config = self.runtimes_config # Pass runtimes_config
             app.state.is_model_running_callback = self.is_model_running_callback
+            app.state.is_model_whisper_running = self.is_model_whisper_running
             app.state.get_runner_port_callback = self.get_runner_port_callback
             app.state.request_runner_start_callback = self.request_runner_start_callback
             app.state.prompt_logging_enabled = self.prompt_logging_enabled # Set prompt logging flag on state
             app.state.prompts_logger = self.prompts_logger # Set prompts logger on state
-
+            
+            # Audio global variables
+            app.state.audio_config = self.audio_config
+            
             # Use port 11434 as required for Ollama emulation
             uvicorn_config = uvicorn.Config(app, host="127.0.0.1", port=11434, reload=False)
             self._uvicorn_server = uvicorn.Server(uvicorn_config)
diff --git a/llama_runner/whisper_cpp_runner.py b/llama_runner/whisper_cpp_runner.py
new file mode 100644
index 0000000..19b9ca7
--- /dev/null
+++ b/llama_runner/whisper_cpp_runner.py
@@ -0,0 +1,155 @@
+import subprocess
+import requests
+import time
+import logging
+import os
+from fastapi import UploadFile
+from typing import Optional, Dict, Any, Union
+
+
+class WhisperServer:
+    """
+    Manage the startup of the whisper server and interaction with it.
+    """
+
+    def __init__(self, audio_config: Dict[str, Any], model_name: str):
+        """
+        Initialize the server with audio configuration and model name.
+
+        :param audio_config: audio config (normalized)
+        :param model_name: model name from audio_config['models']
+        """
+        self.audio_config = audio_config
+        self.model_name = model_name
+
+        # Get model and runtime config
+        models = audio_config.get('models', {})
+        model_conf = models.get(model_name, {})
+
+        self.runtime_name = model_conf.get('runtime', 'default')
+        runtimes = audio_config.get('runtimes', {})
+        runtime_conf = runtimes.get(self.runtime_name, {})
+        self.runtime_path = runtime_conf.get('runtime')
+
+        if not self.runtime_path:
+            raise ValueError(f"Runtime path for '{self.runtime_name}' not defined in audio config.")
+
+        self.model_path = model_conf.get("model_path")
+        if not self.model_path:
+            raise ValueError(f"Model path for '{self.model_name}' not defined in audio config.")
+
+        # Compose launch command
+        self.cmd = [
+            self.runtime_path,
+            '--model', self.model_path,
+        ]
+
+        parameters = model_conf.get("parameters", {})
+        if isinstance(parameters, dict):
+            for option, value in parameters.items():
+                self.cmd.extend([f'--{option}', str(value)])
+
+        # Check if host and port exist, if not add with default values
+        default_host = 'localhost'
+        default_port = '9000'  # string, since command list elements are strings
+
+        if '--host' not in self.cmd:
+            self.cmd.extend(['--host', default_host])
+        if '--port' not in self.cmd:
+            self.cmd.extend(['--port', default_port])
+
+        # Extract host and port from the command list
+        def get_cmd_param(cmd_list, param_name, default):
+            try:
+                idx = cmd_list.index(param_name)
+                return cmd_list[idx + 1]
+            except (ValueError, IndexError):
+                return default
+
+        self.host = get_cmd_param(self.cmd, '--host', default_host)
+        self.port = get_cmd_param(self.cmd, '--port', default_port)
+        self.base_url = f'http://{self.host}:{self.port}'
+
+        self.process: Optional[subprocess.Popen] = None
+
+    def start_server(self, wait_seconds: float = 5.0) -> None:
+        """Start the whisper server with current parameters."""
+        logging.info(f"Starting whisper-server with command: {' '.join(self.cmd)}")
+        self.process = subprocess.Popen(self.cmd)
+        print(f"Whisper-server started on {self.host}:{self.port} with model {self.model_name}")
+        time.sleep(wait_seconds)
+
+    def stop_server(self) -> None:
+        """Stop the server if it is running."""
+        if self.process and self.process.poll() is None:
+            self.process.terminate()
+            try:
+                self.process.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                self.process.kill()
+                self.process.wait()
+            print("Whisper-server stopped")
+        else:
+            print("Whisper-server is not running or already stopped.")
+
+    def transcribe_audio(self, audio_path: str) -> Union[Dict[str, Any], None]:
+        """Send an audio file to the server for transcription and return the result."""
+        url = f"{self.base_url}/inference"
+        data = {"response_format": "json"}
+
+        try:
+            with open(audio_path, 'rb') as audio_file:
+                files = {'file': audio_file}
+                response = requests.post(url, files=files, data=data)
+                response.raise_for_status()
+                return response.json()
+        except requests.RequestException as e:
+            logging.error(f"Error transcribing audio: {e}")
+            return None
+
+    def convert_to_wav(self, input_file: UploadFile, output_path: Optional[str] = None) -> str:
+        """
+        Convert incoming audio file to WAV (16kHz, mono, PCM s16le).
+
+        :param input_file: Uploaded audio file
+        :param output_path: Path to save WAV file. Defaults to ~/.llama-runner/temp.wav
+        :return: Path to saved WAV file
+        """
+        if output_path is None:
+            output_path = os.path.expanduser("~/.llama-runner/temp.wav")
+
+        input_tmp_dir = os.path.dirname(output_path)
+        input_tmp_path = os.path.join(input_tmp_dir, "temp_input")
+
+        os.makedirs(input_tmp_dir, exist_ok=True)
+
+        try:
+            with open(input_tmp_path, "wb") as f:
+                f.write(input_file.file.read())
+        finally:
+            input_file.file.close()
+
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-i", input_tmp_path,
+            "-ar", "16000",
+            "-ac", "1",
+            "-c:a", "pcm_s16le",
+            output_path
+        ]
+
+        try:
+            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.decode(errors='ignore')
+            raise RuntimeError(f"Error during audio conversion: {error_msg}")
+        finally:
+            if os.path.exists(input_tmp_path):
+                os.remove(input_tmp_path)
+
+        return output_path
+    
+    
+    def get_port(self):
+        return int(self.port)

From 7f0396ab319bb1c9ea5c1a43db9780bfc1c9dbcf Mon Sep 17 00:00:00 2001
From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com>
Date: Fri, 15 Aug 2025 19:37:00 +0400
Subject: [PATCH 2/4] Fixed upload models counter

---
 llama_runner/llama_runner_manager.py | 42 +++++++++++++---------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py
index 5a36ed2..5453e61 100644
--- a/llama_runner/llama_runner_manager.py
+++ b/llama_runner/llama_runner_manager.py
@@ -147,12 +147,30 @@ def get_runner_port(self, model_name: str) -> Optional[int]:
 
     def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future:
         logging.info(f"Received request to start runner for model: {model_name}")
+            
+        # Подсчёт запущенных раннеров (llama + whisper)
+        running_llama = sum(1 for thread in self.llama_runner_threads.values() if thread.isRunning())
+        running_whisper = len(self.whisper_servers)
+        total_running = running_llama + running_whisper
+
+        if total_running >= self.concurrent_runners_limit:
+            if self.concurrent_runners_limit == 1:
+                # При лимите 1 останавливаем все, чтобы запустить новый
+                self.stop_all_llama_runners()
+                self.stop_all_whisper_servers()
+            else:
+                future = asyncio.Future()
+                future.set_exception(RuntimeError(
+                    f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start '{model_name}'."))
+                self._runner_startup_futures[model_name] = future
+                QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name))
+                return future
+
         if iswhisper:
             if self.is_whisper_runner_running(model_name):
                 logging.info(f"Runner for {model_name} is already starting. Returning existing Future.")
                 return self._runner_startup_futures[model_name]
-            
-            self.stop_all_whisper_servers()
+
             self.start_whisper_server(model_name)
             future = asyncio.Future()
             future.set_result(self.get_whisper_port(model_name))
@@ -182,26 +200,6 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
                 QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name))
                 return future
 
-        running_runners = {name: thread for name, thread in self.llama_runner_threads.items() if thread.isRunning()}
-        num_running = len(running_runners)
-
-        if num_running >= self.concurrent_runners_limit:
-            if self.concurrent_runners_limit == 1:
-                models_to_stop = list(running_runners.keys())
-                if models_to_stop:
-                    logging.info(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Stopping existing runner(s): {models_to_stop} before starting {model_name}.")
-                    for name_to_stop in models_to_stop:
-                        self.stop_llama_runner(name_to_stop)
-                else:
-                    logging.warning("Concurrent runner limit reached but no running runners found?")
-            else:
-                logging.warning(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start {model_name}.")
-                future = asyncio.Future()
-                future.set_exception(RuntimeError(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start '{model_name}'."))
-                self._runner_startup_futures[model_name] = future
-                QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name))
-                return future
-
         future = asyncio.Future()
         self._runner_startup_futures[model_name] = future
 

From 66eb9d829231a51eb016a68c2f96ff4a72034990 Mon Sep 17 00:00:00 2001
From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com>
Date: Sat, 16 Aug 2025 00:20:44 +0400
Subject: [PATCH 3/4] Fix little bugs + add /v1/audio/translations

---
 llama_runner/llama_runner_manager.py  | 51 ++++++++++++++++---
 llama_runner/lmstudio_proxy_thread.py | 72 ++++++++++++++++++++++++++-
 llama_runner/ollama_proxy_thread.py   | 72 ++++++++++++++++++++++++++-
 llama_runner/whisper_cpp_runner.py    |  8 ++-
 4 files changed, 194 insertions(+), 9 deletions(-)

diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py
index 5453e61..86d0825 100644
--- a/llama_runner/llama_runner_manager.py
+++ b/llama_runner/llama_runner_manager.py
@@ -3,23 +3,29 @@
 import logging
 from typing import Optional, Dict, Any
 
+
 from PySide6.QtCore import QObject, QTimer, QCoreApplication, QEvent, Signal, Slot
 
+
 # Import the custom event classes
 from llama_runner.llama_runner_thread import LlamaRunnerThread, RunnerStoppedEvent, RunnerErrorEvent
 from llama_runner.error_output_dialog import ErrorOutputDialog
 
+
 from llama_runner.whisper_cpp_runner import WhisperServer
 
+
 class LlamaRunnerManager(QObject):
     # Define a custom event type for events posted to the parent (e.g., MainWindow)
     # This replaces the QEvent.User + 4 magic number.
     MANAGER_PARENT_NOTIFICATION_EVENT_TYPE = QEvent.Type(QEvent.registerEventType())
 
+
     # Define signals directly as class attributes
     runner_port_ready_for_proxy = Signal(str, int)
     runner_stopped_for_proxy = Signal(str)
 
+
     def __init__(
         self,
         models: dict,
@@ -40,6 +46,7 @@ def __init__(
         # self.runner_port_ready_for_proxy = runner_port_ready_for_proxy # Removed, now class attribute
         # self.runner_stopped_for_proxy = runner_stopped_for_proxy # Removed, now class attribute
 
+
         self.llama_runner_threads: Dict[str, LlamaRunnerThread] = {}
         self._runner_startup_futures: Dict[str, asyncio.Future] = {}
         self._current_running_model: Optional[str] = None
@@ -51,7 +58,7 @@ def __init__(
     
     def start_whisper_server(self, model_name: str) -> None:
         """
-        Создать и запустить whisper сервер для модели.
+        Create and start a whisper server for the model.
         """
         if model_name in self.whisper_servers:
             logging.info(f"WhisperServer already started for model {model_name}")
@@ -64,11 +71,13 @@ def start_whisper_server(self, model_name: str) -> None:
                 status_widget.update_port("N/A")
                 status_widget.set_buttons_enabled(False, False)
 
+
             whisper_server = WhisperServer(self.audio_config, model_name)
             whisper_server.start_server()
             self.whisper_servers[model_name] = whisper_server
             port = whisper_server.get_port()
 
+
             logging.info(f"Whisper Runner started for model {model_name}")
             
             print(f"Whisper Runner for {model_name} ready on port {port}.")
@@ -82,9 +91,10 @@ def start_whisper_server(self, model_name: str) -> None:
             logging.error(f"Failed to start WhisperServer for {model_name}: {e}")
 
 
+
     def stop_whisper_server(self, model_name: str) -> None:
         """
-        Остановить whisper сервер для модели.
+        Stop the whisper server for the model.
         """
         whisper_server = self.whisper_servers.get(model_name)
         if whisper_server:
@@ -107,9 +117,10 @@ def stop_whisper_server(self, model_name: str) -> None:
             logging.warning(f"WhisperServer for {model_name} not running")
 
 
+
     def stop_all_whisper_servers(self) -> None:
         """
-        Остановить все запущенные whisper сервера.
+        Stop all running whisper servers.
         """
         for model_name in list(self.whisper_servers.keys()):
             self.stop_whisper_server(model_name)
@@ -130,32 +141,37 @@ def get_whisper_port(self, model_name: str) -> Optional[int]:
 
 
 
+
     def set_concurrent_runners_limit(self, limit: int):
         self.concurrent_runners_limit = limit
 
+
     def is_llama_runner_running(self, model_name: str) -> bool:
         thread = self.llama_runner_threads.get(model_name)
         if thread and thread.isRunning() and thread.runner and thread.runner.is_running():
             return True
         return False
 
+
     def get_runner_port(self, model_name: str) -> Optional[int]:
         thread = self.llama_runner_threads.get(model_name)
         if thread and thread.isRunning() and thread.runner and thread.runner.is_running():
             return thread.runner.get_port()
         return None
 
+
     def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future:
         logging.info(f"Received request to start runner for model: {model_name}")
             
-        # Подсчёт запущенных раннеров (llama + whisper)
+        # Count running runners (llama + whisper)
         running_llama = sum(1 for thread in self.llama_runner_threads.values() if thread.isRunning())
         running_whisper = len(self.whisper_servers)
         total_running = running_llama + running_whisper
 
+
         if total_running >= self.concurrent_runners_limit:
             if self.concurrent_runners_limit == 1:
-                # При лимите 1 останавливаем все, чтобы запустить новый
+                # With limit 1, stop all to start new one
                 self.stop_all_llama_runners()
                 self.stop_all_whisper_servers()
             else:
@@ -166,11 +182,13 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
                 QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name))
                 return future
 
+
         if iswhisper:
             if self.is_whisper_runner_running(model_name):
                 logging.info(f"Runner for {model_name} is already starting. Returning existing Future.")
                 return self._runner_startup_futures[model_name]
 
+
             self.start_whisper_server(model_name)
             future = asyncio.Future()
             future.set_result(self.get_whisper_port(model_name))
@@ -179,10 +197,12 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
             return future
 
 
+
         if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done():
             logging.info(f"Runner for {model_name} is already starting. Returning existing Future.")
             return self._runner_startup_futures[model_name]
 
+
         if self.is_llama_runner_running(model_name):
             port = self.get_runner_port(model_name)
             if port is not None:
@@ -200,14 +220,17 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
                 QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name))
                 return future
 
+
         future = asyncio.Future()
         self._runner_startup_futures[model_name] = future
 
+
         model_config = self.models[model_name]
         model_path = model_config.get("model_path")
         llama_cpp_runtime_key = model_config.get("llama_cpp_runtime", "default")
         _raw_llama_cpp_runtime_config = self.llama_runtimes.get(llama_cpp_runtime_key, self.default_runtime)
 
+
         if isinstance(_raw_llama_cpp_runtime_config, dict):
             llama_cpp_runtime_command = _raw_llama_cpp_runtime_config.get("runtime")
             if not llama_cpp_runtime_command:
@@ -221,21 +244,25 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
             future.set_exception(RuntimeError(f"Invalid runtime configuration type for '{llama_cpp_runtime_key}'."))
             return future
 
+
         if not model_path:
             logging.error(f"Configuration Error: Model '{model_name}' has no 'model_path' specified in config.json.")
             future.set_exception(RuntimeError(f"Configuration Error: Model '{model_name}' has no 'model_path'."))
             return future
 
+
         if not os.path.exists(model_path):
             logging.error(f"File Not Found: Model file not found: {model_path}")
             future.set_exception(FileNotFoundError(f"Model file not found: {model_path}"))
             return future
 
+
         if llama_cpp_runtime_key != "default" and not os.path.exists(llama_cpp_runtime_command):
             logging.error(f"Runtime Not Found: Llama.cpp runtime not found: {llama_cpp_runtime_command}")
             future.set_exception(FileNotFoundError(f"Llama.cpp runtime not found: {llama_cpp_runtime_command}"))
             return future
 
+
         print(f"Starting Llama Runner for {model_name}...")
         status_widget = self.model_status_widgets.get(model_name)
         if status_widget:
@@ -243,6 +270,7 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
             status_widget.update_port("N/A")
             status_widget.set_buttons_enabled(False, False)
 
+
         thread = LlamaRunnerThread(
             model_name=model_name,
             model_path=model_path,
@@ -255,16 +283,20 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn
         thread.error.connect(lambda message, output_buffer, name=model_name: self.on_llama_runner_error(name, message, output_buffer))
         # thread.stopped.connect(lambda name=model_name: self.on_llama_runner_stopped(name))  # Removed, now handled by customEvent
 
+
         self.llama_runner_threads[model_name] = thread
         thread.start()
 
+
         return future
 
+
     def _cleanup_completed_future(self, model_name: str):
         if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done():
             logging.debug(f"Cleaning up completed future for {model_name}")
             del self._runner_startup_futures[model_name]
 
+
     def stop_llama_runner(self, model_name: str):
         if model_name in self.llama_runner_threads and self.llama_runner_threads[model_name].isRunning():
             print(f"Stopping Llama Runner for {model_name}...")
@@ -287,6 +319,7 @@ def stop_llama_runner(self, model_name: str):
                 # Or, if an event is preferred for the manager itself:
                 # QCoreApplication.instance().postEvent(self, RunnerStoppedEvent(model_name))
 
+
                 # Original logic: post a generic event to parent.
                 # This might be for a different purpose than the thread's stopped event.
                 parent_event = QEvent(LlamaRunnerManager.MANAGER_PARENT_NOTIFICATION_EVENT_TYPE)
@@ -301,6 +334,7 @@ def stop_llama_runner(self, model_name: str):
                 self.on_llama_runner_stopped(model_name)
 
 
+
     def stop_all_llama_runners(self):
         print("Stopping all Llama Runners...")
         # Collect running threads first to avoid modifying the dict during iteration
@@ -315,6 +349,7 @@ def stop_all_llama_runners(self):
         for model_name, thread in running_threads:
             thread.wait()
 
+
     @Slot(str)
     def on_llama_runner_started(self, model_name: str):
         status_widget = self.model_status_widgets.get(model_name)
@@ -322,6 +357,7 @@ def on_llama_runner_started(self, model_name: str):
             status_widget.update_status("Starting...")
             status_widget.set_buttons_enabled(False, False)
 
+
     @Slot(str)
     def on_llama_runner_stopped(self, model_name: str):
         print(f"Llama Runner for {model_name} stopped.")
@@ -343,6 +379,7 @@ def on_llama_runner_stopped(self, model_name: str):
         else:
             logging.warning(f"Stopped signal received for unknown or already cleaned up model: {model_name}")
 
+
     @Slot(str, str, list)
     def on_llama_runner_error(self, model_name: str, message: str, output_buffer: list):
         print(f"Llama Runner for {model_name} error: {message}")
@@ -361,6 +398,7 @@ def on_llama_runner_error(self, model_name: str, message: str, output_buffer: li
             logging.debug(f"Runner {model_name} errored while startup Future was pending.")
             self._runner_startup_futures[model_name].set_exception(RuntimeError(f"Runner for {model_name} errored during startup: {message}"))
 
+
     def customEvent(self, event: QEvent):
         # Handle custom stopped event from LlamaRunnerThread
         if event.type() == RunnerStoppedEvent.EVENT_TYPE:
@@ -376,6 +414,7 @@ def customEvent(self, event: QEvent):
         else:
             super().customEvent(event)
 
+
     @Slot(str, int)
     def on_llama_runner_port_ready_and_emit(self, model_name: str, port: int):
         print(f"Llama Runner for {model_name} ready on port {port}.")
@@ -395,6 +434,7 @@ def on_llama_runner_port_ready_and_emit(self, model_name: str, port: int):
         logging.info(f"Set current running model: {model_name}")
         self.runner_port_ready_for_proxy.emit(model_name, port)
 
+
     @Slot()
     def check_runner_statuses(self):
         for model_name, thread in list(self.llama_runner_threads.items()):
@@ -411,4 +451,3 @@ def check_runner_statuses(self):
                         app_instance.postEvent(parent_object, parent_event)
                     else:
                         logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.")
-        
\ No newline at end of file
diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py
index 44878b8..c550a1c 100644
--- a/llama_runner/lmstudio_proxy_thread.py
+++ b/llama_runner/lmstudio_proxy_thread.py
@@ -982,22 +982,92 @@ async def _v1_embeddings_handler(request: Request):
 async def openai_speech_to_text(request: Request):
     """Function to convert speech to text using whisper.cpp"""
     try:
+        # Retrieve callback function to notify request start
         request_runner_start_callback = request.app.state.request_runner_start_callback
+        # Parse form data from the incoming request
         form = await request.form() 
+        # Extract uploaded audio file from form
         file = form.get("file") 
+        # Read the content of the uploaded file into bytes
         contents = await file.read()
+        # Create a FastAPI UploadFile object from bytes content, preserving filename
         fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents))
+        # Extract model name from the form data
         model = str(form.get("model"))
+        # Retrieve audio configuration from app state
         audio_config = request.app.state.audio_config
+
+        # Notify that processing of this model's request has started
         request_runner_start_callback(model, True)
+        
+        # Initialize WhisperServer with audio configuration and model
         whisper_server = WhisperServer(audio_config, model)
+        # Convert the uploaded audio file to WAV format
         audio_file_path = whisper_server.convert_to_wav(fastapi_file)
+        # Perform transcription on the WAV audio file
         result = whisper_server.transcribe_audio(audio_file_path)
+        # Return transcription result as JSON response
         return JSONResponse(content=result)
+    
     except json.JSONDecodeError:
+        # Handle cases where request body is not valid JSON
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body")
+    
+    except Exception as e:
+        # Log unexpected errors and return HTTP 500 error
+        logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}")
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request")
+
+
+@app.post("/v1/audio/translations")
+async def openai_speech_to_text_translate(request: Request):
+    """Function to convert speech to text using whisper.cpp with translation"""
+    try:
+        # Retrieve callback function to notify request start
+        request_runner_start_callback = request.app.state.request_runner_start_callback
+        # Parse form data from the incoming request
+        form = await request.form() 
+        # Extract uploaded audio file from form
+        file = form.get("file") 
+        # Read the content of the uploaded file into bytes
+        contents = await file.read()
+        # Create a FastAPI UploadFile object from bytes content, preserving filename
+        fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents))
+        # Extract model name from the form data
+        model = str(form.get("model"))
+        
+        # Save the current audio config to restore later
+        prev_audio_config = request.app.state.audio_config
+        # Remove "language" parameter if it exists for this model's config
+        if request.app.state.audio_config["models"][model]["parameters"].get("language", False):
+            del request.app.state.audio_config["models"][model]["parameters"]["language"]
+        # Enable translation mode by setting "translate" parameter
+        request.app.state.audio_config["models"][model]["parameters"]["translate"] = ""
+        # Retrieve updated audio configuration
+        audio_config = request.app.state.audio_config
+        
+        # Notify that processing of this model's request has started
+        request_runner_start_callback(model, True)
+        # Initialize WhisperServer with updated audio configuration and model
+        whisper_server = WhisperServer(audio_config, model)
+        # Convert the uploaded audio file to WAV format
+        audio_file_path = whisper_server.convert_to_wav(fastapi_file)
+        # Perform transcription with translation on the WAV audio file
+        result = whisper_server.transcribe_audio(audio_file_path)
+        
+        # Restore previous audio config after request processed
+        request.app.state.audio_config = prev_audio_config
+        
+        # Return transcription and translation result as JSON response
+        return JSONResponse(content=result)
+    
+    except json.JSONDecodeError:
+        # Handle cases where request body is not valid JSON
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body")
+    
     except Exception as e:
-        logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}")
+        # Log unexpected errors and return HTTP 500 error
+        logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}")
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request")
 
 
diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py
index 54778f5..17bb1f5 100644
--- a/llama_runner/ollama_proxy_thread.py
+++ b/llama_runner/ollama_proxy_thread.py
@@ -794,25 +794,95 @@ async def openai_embeddings(request: Request):
 async def openai_speech_to_text(request: Request):
     """Function to convert speech to text using whisper.cpp"""
     try:
+        # Retrieve callback function to notify request start
         request_runner_start_callback = request.app.state.request_runner_start_callback
+        # Parse form data from the incoming request
         form = await request.form() 
+        # Extract uploaded audio file from form
         file = form.get("file") 
+        # Read the content of the uploaded file into bytes
         contents = await file.read()
+        # Create a FastAPI UploadFile object from bytes content, preserving filename
         fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents))
+        # Extract model name from the form data
         model = str(form.get("model"))
+        # Retrieve audio configuration from app state
         audio_config = request.app.state.audio_config
+
+        # Notify that processing of this model's request has started
         request_runner_start_callback(model, True)
+        
+        # Initialize WhisperServer with audio configuration and model
         whisper_server = WhisperServer(audio_config, model)
+        # Convert the uploaded audio file to WAV format
         audio_file_path = whisper_server.convert_to_wav(fastapi_file)
+        # Perform transcription on the WAV audio file
         result = whisper_server.transcribe_audio(audio_file_path)
+        # Return transcription result as JSON response
         return JSONResponse(content=result)
+    
     except json.JSONDecodeError:
+        # Handle cases where request body is not valid JSON
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body")
+    
     except Exception as e:
-        logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}")
+        # Log unexpected errors and return HTTP 500 error
+        logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}")
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request")
 
 
+
+@app.post("/v1/audio/translations")
+async def openai_speech_to_text_translate(request: Request):
+    """Function to convert speech to text using whisper.cpp with translation"""
+    try:
+        # Retrieve callback function to notify request start
+        request_runner_start_callback = request.app.state.request_runner_start_callback
+        # Parse form data from the incoming request
+        form = await request.form() 
+        # Extract uploaded audio file from form
+        file = form.get("file") 
+        # Read the content of the uploaded file into bytes
+        contents = await file.read()
+        # Create a FastAPI UploadFile object from bytes content, preserving filename
+        fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents))
+        # Extract model name from the form data
+        model = str(form.get("model"))
+        
+        # Save the current audio config to restore later
+        prev_audio_config = request.app.state.audio_config
+        # Remove "language" parameter if it exists for this model's config
+        if request.app.state.audio_config["models"][model]["parameters"].get("language", False):
+            del request.app.state.audio_config["models"][model]["parameters"]["language"]
+        # Enable translation mode by setting "translate" parameter
+        request.app.state.audio_config["models"][model]["parameters"]["translate"] = ""
+        # Retrieve updated audio configuration
+        audio_config = request.app.state.audio_config
+        
+        # Notify that processing of this model's request has started
+        request_runner_start_callback(model, True)
+        # Initialize WhisperServer with updated audio configuration and model
+        whisper_server = WhisperServer(audio_config, model)
+        # Convert the uploaded audio file to WAV format
+        audio_file_path = whisper_server.convert_to_wav(fastapi_file)
+        # Perform transcription with translation on the WAV audio file
+        result = whisper_server.transcribe_audio(audio_file_path)
+        
+        # Restore previous audio config after request processed
+        request.app.state.audio_config = prev_audio_config
+        
+        # Return transcription and translation result as JSON response
+        return JSONResponse(content=result)
+    
+    except json.JSONDecodeError:
+        # Handle cases where request body is not valid JSON
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body")
+    
+    except Exception as e:
+        # Log unexpected errors and return HTTP 500 error
+        logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}")
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request")
+
 # --- End handlers for OpenAI compatible API endpoints (v1) ---
 
 
diff --git a/llama_runner/whisper_cpp_runner.py b/llama_runner/whisper_cpp_runner.py
index 19b9ca7..a77cb97 100644
--- a/llama_runner/whisper_cpp_runner.py
+++ b/llama_runner/whisper_cpp_runner.py
@@ -57,7 +57,13 @@ def __init__(self, audio_config: Dict[str, Any], model_name: str):
             self.cmd.extend(['--host', default_host])
         if '--port' not in self.cmd:
             self.cmd.extend(['--port', default_port])
-
+            
+        # Remove '' in self.cmd if exists
+        for i in range(len(self.cmd)):
+            if self.cmd[i] == '':
+                self.cmd.pop(i)
+        
+        
         # Extract host and port from the command list
         def get_cmd_param(cmd_list, param_name, default):
             try:

From 979a0cacb7685c2a59f9b5fc27b923cfdf77433b Mon Sep 17 00:00:00 2001
From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com>
Date: Sun, 7 Sep 2025 14:13:10 +0400
Subject: [PATCH 4/4] Method not allowed fix

Fixed error: OPTIONS /v1/chat/completions 405 Method not allowed
---
 llama_runner/lmstudio_proxy_thread.py |  8 ++++++++
 llama_runner/ollama_proxy_thread.py   | 10 +++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py
index c550a1c..3ae0c7a 100644
--- a/llama_runner/lmstudio_proxy_thread.py
+++ b/llama_runner/lmstudio_proxy_thread.py
@@ -23,12 +23,20 @@
 from llama_runner.whisper_cpp_runner import WhisperServer
 from io import BytesIO
 from fastapi import UploadFile as FastAPIUploadFile
+from fastapi.middleware.cors import CORSMiddleware
 
 # Configure logging (already done in main.py for configurable levels)
 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # --- Create our own FastAPI app instance ---
 app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
 # --- End create app instance ---
 
 
diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py
index 17bb1f5..e78a936 100644
--- a/llama_runner/ollama_proxy_thread.py
+++ b/llama_runner/ollama_proxy_thread.py
@@ -23,9 +23,17 @@
 from llama_runner.whisper_cpp_runner import WhisperServer
 from io import BytesIO
 from fastapi import UploadFile as FastAPIUploadFile
+from fastapi.middleware.cors import CORSMiddleware
 
 # --- Create our own FastAPI app instance ---
 app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
 # --- End create app instance ---
 
 # Define standalone handlers that access state via app.state
@@ -743,8 +751,8 @@ async def openai_chat_completions(request: Request):
     """Handles OpenAI /v1/chat/completions requests."""
     try:
         request_body = await request.json()
+        print(request_body)
         # No conversion needed here, assuming the incoming request is already OpenAI format
-
         async def chat_completion_response_stream():
             # The dynamic router handles the forwarding and streaming
             async for chunk in _dynamic_route_runner_request_generator(request, target_path="/v1/chat/completions", request_body=request_body):