From aaa97612d31fe0bb329a7f5c34041fa01f76f16f Mon Sep 17 00:00:00 2001 From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com> Date: Fri, 15 Aug 2025 18:39:55 +0400 Subject: [PATCH 1/4] Raw /v1/audio/transcription --- llama_runner/config_loader.py | 88 ++++++++++++- llama_runner/headless_service_manager.py | 7 +- llama_runner/llama_cpp_runner.py | 1 - llama_runner/llama_runner_manager.py | 106 +++++++++++++++- llama_runner/lmstudio_proxy_thread.py | 36 +++++- llama_runner/main_window.py | 23 +++- llama_runner/ollama_proxy_thread.py | 44 ++++++- llama_runner/whisper_cpp_runner.py | 155 +++++++++++++++++++++++ 8 files changed, 447 insertions(+), 13 deletions(-) create mode 100644 llama_runner/whisper_cpp_runner.py diff --git a/llama_runner/config_loader.py b/llama_runner/config_loader.py index c3a5921..98ad3e8 100644 --- a/llama_runner/config_loader.py +++ b/llama_runner/config_loader.py @@ -2,18 +2,22 @@ import json import logging + CONFIG_DIR = os.path.expanduser("~/.llama-runner") CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json") LOG_FILE = os.path.join(CONFIG_DIR, "error.log") + # Ensure the log directory exists if not os.path.exists(CONFIG_DIR): os.makedirs(CONFIG_DIR, exist_ok=True) + # Set up logging logging.basicConfig(filename=LOG_FILE, level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s') + def ensure_config_exists(): """ Ensures that the configuration directory and file exist. @@ -27,6 +31,7 @@ def ensure_config_exists(): logging.error(f"Error creating config directory: {e}") return False + if not os.path.exists(CONFIG_FILE): try: default_config = { @@ -48,6 +53,7 @@ def ensure_config_exists(): return False return True + def load_config(): """ Loads the configuration from the JSON file. @@ -56,21 +62,23 @@ def load_config(): if not ensure_config_exists(): return {} + try: with open(CONFIG_FILE, "r") as f: config = json.load(f) - # Ensure default_runtime and concurrentRunners exist if "default_runtime" not in config: config["default_runtime"] = "llama-server" if "concurrentRunners" not in config: config["concurrentRunners"] = 1 + # Ensure proxies section and its sub-keys exist with defaults proxies_config = config.get("proxies", {}) if not isinstance(proxies_config, dict): # Handle case where 'proxies' might exist but not as a dict proxies_config = {} + ollama_proxy_config = proxies_config.get("ollama", {}) if not isinstance(ollama_proxy_config, dict): ollama_proxy_config = {} @@ -78,6 +86,7 @@ def load_config(): ollama_proxy_config["enabled"] = True proxies_config["ollama"] = ollama_proxy_config + lmstudio_proxy_config = proxies_config.get("lmstudio", {}) if not isinstance(lmstudio_proxy_config, dict): lmstudio_proxy_config = {} @@ -89,6 +98,7 @@ def load_config(): config["proxies"] = proxies_config + # Ensure logging section and its sub-keys exist with defaults logging_config = config.get("logging", {}) if not isinstance(logging_config, dict): # Handle case where 'logging' might exist but not as a dict @@ -134,6 +144,80 @@ def load_config(): print("Warning: Config: 'llama-runtimes' key exists but is not a dictionary. Ignoring.") # If 'llama-runtimes' is not in config or is None, it's handled gracefully (no changes made to it) + + + raw_audio = config.get("audio") + if isinstance(raw_audio, dict): + # Process runtimes + raw_runtimes = raw_audio.get("runtimes") + processed_runtimes = {} + if isinstance(raw_runtimes, dict): + for name, details in raw_runtimes.items(): + if isinstance(details, dict): + runtime_path = details.get("runtime") + if isinstance(runtime_path, str) and runtime_path.strip(): + processed_runtimes[name] = { + "runtime": runtime_path.strip() + } + else: + logging.warning(f"Config: Audio runtime '{name}' has invalid or empty 'runtime' path. Skipping.") + print(f"Warning: Config: Audio runtime '{name}' has invalid or empty 'runtime' path. Skipping.") + else: + logging.warning(f"Config: Audio runtime '{name}' details should be a dictionary. Skipping.") + print(f"Warning: Config: Audio runtime '{name}' details should be a dictionary. Skipping.") + elif raw_runtimes is not None: + logging.warning("Config: 'audio.runtimes' exists but is not a dictionary. Ignoring.") + print("Warning: Config: 'audio.runtimes' exists but is not a dictionary. Ignoring.") + + + # Process models + raw_models = raw_audio.get("models") + processed_models = {} + if isinstance(raw_models, dict): + for model_name, model_info in raw_models.items(): + if isinstance(model_info, dict): + model_path = model_info.get("model_path") + runtime = model_info.get("runtime") + parameters = model_info.get("parameters", {}) + if isinstance(model_path, str) and model_path.strip(): + if isinstance(parameters, dict): + processed_models[model_name] = { + "model_path": model_path.strip(), + "runtime": runtime, + "parameters": parameters + } + else: + logging.warning(f"Config: Parameters for model '{model_name}' should be a dictionary. Using empty dict instead.") + print(f"Warning: Config: Parameters for model '{model_name}' should be a dictionary. Using empty dict instead.") + processed_models[model_name] = { + "model_path": model_path.strip(), + "runtime": runtime, + "parameters": {} + } + else: + logging.warning(f"Config: Model '{model_name}' has invalid or empty 'model_path'. Skipping.") + print(f"Warning: Config: Model '{model_name}' has invalid or empty 'model_path'. Skipping.") + else: + logging.warning(f"Config: Model entry '{model_name}' is not a dictionary. Skipping.") + print(f"Warning: Config: Model entry '{model_name}' is not a dictionary. Skipping.") + elif raw_models is not None: + logging.warning("Config: 'audio.models' exists but is not a dictionary. Ignoring.") + print("Warning: Config: 'audio.models' exists but is not a dictionary. Ignoring.") + + + # Update the audio section in config + config["audio"] = { + "runtimes": processed_runtimes, + "models": processed_models + } + + + elif raw_audio is not None: + logging.warning("Config: 'audio' key exists but is not a dictionary. Ignoring.") + print("Warning: Config: 'audio' key exists but is not a dictionary. Ignoring.") + + + print(f"Loaded config (processed): {config}") # Print loaded config return config except (OSError, json.JSONDecodeError) as e: @@ -141,6 +225,7 @@ def load_config(): logging.error(f"Error loading config file: {e}") return {} + def calculate_system_fingerprint(config: dict) -> str: """Calculates a 16-character hash of the configuration parameters.""" import hashlib @@ -149,6 +234,7 @@ def calculate_system_fingerprint(config: dict) -> str: hash_object = hashlib.md5(config_str.encode()) return hash_object.hexdigest()[:16] + if __name__ == '__main__': # Example usage: config = load_config() diff --git a/llama_runner/headless_service_manager.py b/llama_runner/headless_service_manager.py index 54bfe75..25e7d1a 100644 --- a/llama_runner/headless_service_manager.py +++ b/llama_runner/headless_service_manager.py @@ -35,7 +35,8 @@ def _initialize_services(self): # Initialize LlamaRunnerManager self.llama_runner_manager = LlamaRunnerManager( models=self.models_specific_config, # This is app_config['models'] - llama_runtimes=self.app_config.get('llama-runtimes', {}), # Ensure correct key + llama_runtimes=self.app_config.get('llama-runtimes', {}), # Ensure correct key, + audio_config=self.app_config.get('audio', {}), default_runtime=self.app_config.get('default_runtime', 'llama-server'), # Ensure correct key and default model_status_widgets={} # No UI widgets in headless mode ) @@ -48,6 +49,7 @@ def _initialize_services(self): # Get proxy and logging settings from the unified config proxies_config = self.app_config.get('proxies', {}) + audio_config = self.app_config.get('audio', {}) ollama_proxy_settings = proxies_config.get('ollama', {}) lmstudio_proxy_settings = proxies_config.get('lmstudio', {}) logging_settings = self.app_config.get('logging', {}) @@ -63,7 +65,9 @@ def _initialize_services(self): self.ollama_proxy = OllamaProxyThread( all_models_config=self.models_specific_config, runtimes_config=self.app_config.get('llama-runtimes', {}), + audio_config=audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, + is_model_whisper_running=self.llama_runner_manager.is_whisper_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, prompt_logging_enabled=prompt_logging_enabled, @@ -80,6 +84,7 @@ def _initialize_services(self): self.lmstudio_proxy = LMStudioProxyThread( # Using the aliased FastAPIProxyThread all_models_config=self.models_specific_config, runtimes_config=self.app_config.get('llama-runtimes', {}), + audio_config=audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, diff --git a/llama_runner/llama_cpp_runner.py b/llama_runner/llama_cpp_runner.py index 0ba32eb..15c13e2 100644 --- a/llama_runner/llama_cpp_runner.py +++ b/llama_runner/llama_cpp_runner.py @@ -84,7 +84,6 @@ async def start(self): command.append(f"--{arg_name}") command.append(str(value)) - print(f"Starting llama.cpp server with command: {' '.join(command)}") logging.info(f"Starting llama.cpp server with command: {' '.join(command)}") # Clear the output buffer before starting a new process diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py index 5b578b6..5a36ed2 100644 --- a/llama_runner/llama_runner_manager.py +++ b/llama_runner/llama_runner_manager.py @@ -9,6 +9,8 @@ from llama_runner.llama_runner_thread import LlamaRunnerThread, RunnerStoppedEvent, RunnerErrorEvent from llama_runner.error_output_dialog import ErrorOutputDialog +from llama_runner.whisper_cpp_runner import WhisperServer + class LlamaRunnerManager(QObject): # Define a custom event type for events posted to the parent (e.g., MainWindow) # This replaces the QEvent.User + 4 magic number. @@ -23,6 +25,7 @@ def __init__( models: dict, llama_runtimes: dict, default_runtime: str, + audio_config, model_status_widgets: dict, # runner_port_ready_for_proxy and runner_stopped_for_proxy are now class attributes parent=None, @@ -41,6 +44,91 @@ def __init__( self._runner_startup_futures: Dict[str, asyncio.Future] = {} self._current_running_model: Optional[str] = None self.concurrent_runners_limit = 1 # Will be set by MainWindow after instantiation + self.audio_config = audio_config + self.whisper_servers: Dict[str, WhisperServer] = {} + + + + def start_whisper_server(self, model_name: str) -> None: + """ + Создать и запустить whisper сервер для модели. + """ + if model_name in self.whisper_servers: + logging.info(f"WhisperServer already started for model {model_name}") + return + try: + print(f"Starting Runner for {model_name}...") + status_widget = self.model_status_widgets.get(model_name) + if status_widget: + status_widget.update_status("Starting...") + status_widget.update_port("N/A") + status_widget.set_buttons_enabled(False, False) + + whisper_server = WhisperServer(self.audio_config, model_name) + whisper_server.start_server() + self.whisper_servers[model_name] = whisper_server + port = whisper_server.get_port() + + logging.info(f"Whisper Runner started for model {model_name}") + + print(f"Whisper Runner for {model_name} ready on port {port}.") + status_widget = self.model_status_widgets.get(model_name) + if status_widget: + status_widget.update_port(port) + status_widget.update_status("Running") + status_widget.set_buttons_enabled(False, True) + + except Exception as e: + logging.error(f"Failed to start WhisperServer for {model_name}: {e}") + + + def stop_whisper_server(self, model_name: str) -> None: + """ + Остановить whisper сервер для модели. + """ + whisper_server = self.whisper_servers.get(model_name) + if whisper_server: + try: + status_widget = self.model_status_widgets.get(model_name) + if status_widget: + status_widget.update_status("Stopping...") + status_widget.set_buttons_enabled(False, False) + print(f"Stopping Runner for {model_name}...") + whisper_server.stop_server() + del self.whisper_servers[model_name] + if status_widget: + status_widget.update_status("Not Running") + status_widget.update_port("N/A") + status_widget.set_buttons_enabled(True, False) + logging.info(f"WhisperServer stopped for model {model_name}") + except Exception as e: + logging.error(f"Error stopping WhisperServer for {model_name}: {e}") + else: + logging.warning(f"WhisperServer for {model_name} not running") + + + def stop_all_whisper_servers(self) -> None: + """ + Остановить все запущенные whisper сервера. + """ + for model_name in list(self.whisper_servers.keys()): + self.stop_whisper_server(model_name) + + + def is_whisper_runner_running(self, model_name: str) -> bool: + whisper_class = self.whisper_servers.get(model_name) + if whisper_class: + return True + return False + + + def get_whisper_port(self, model_name: str) -> Optional[int]: + whisper_class = self.whisper_servers.get(model_name) + if whisper_class: + return whisper_class.get_port() + return None + + def set_concurrent_runners_limit(self, limit: int): self.concurrent_runners_limit = limit @@ -57,8 +145,21 @@ def get_runner_port(self, model_name: str) -> Optional[int]: return thread.runner.get_port() return None - def request_runner_start(self, model_name: str) -> asyncio.Future: + def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future: logging.info(f"Received request to start runner for model: {model_name}") + if iswhisper: + if self.is_whisper_runner_running(model_name): + logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") + return self._runner_startup_futures[model_name] + + self.stop_all_whisper_servers() + self.start_whisper_server(model_name) + future = asyncio.Future() + future.set_result(self.get_whisper_port(model_name)) + self._runner_startup_futures[model_name] = future + QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) + return future + if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done(): logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") @@ -311,4 +412,5 @@ def check_runner_statuses(self): if app_instance and parent_object: app_instance.postEvent(parent_object, parent_event) else: - logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.") \ No newline at end of file + logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.") + \ No newline at end of file diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py index 5ffd7d4..44878b8 100644 --- a/llama_runner/lmstudio_proxy_thread.py +++ b/llama_runner/lmstudio_proxy_thread.py @@ -20,6 +20,10 @@ from llama_runner import gguf_metadata # Import the new metadata module from llama_runner.config_loader import calculate_system_fingerprint +from llama_runner.whisper_cpp_runner import WhisperServer +from io import BytesIO +from fastapi import UploadFile as FastAPIUploadFile + # Configure logging (already done in main.py for configurable levels) # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -974,6 +978,29 @@ async def _v1_embeddings_handler(request: Request): logging.error(f"Error in /v1/embeddings handler: {e}\n{traceback.format_exc()}") return JSONResponse(content={"error": {"message": f"Internal server error: {e}", "type": "internal_error"}}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) +@app.post("/v1/audio/transcriptions") +async def openai_speech_to_text(request: Request): + """Function to convert speech to text using whisper.cpp""" + try: + request_runner_start_callback = request.app.state.request_runner_start_callback + form = await request.form() + file = form.get("file") + contents = await file.read() + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + model = str(form.get("model")) + audio_config = request.app.state.audio_config + request_runner_start_callback(model, True) + whisper_server = WhisperServer(audio_config, model) + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + result = whisper_server.transcribe_audio(audio_file_path) + return JSONResponse(content=result) + except json.JSONDecodeError: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + except Exception as e: + logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + + logging.info("Updated dynamic routing handlers for /v1/chat/completions, /v1/completions, /v1/embeddings to support conditional streaming.") @@ -999,7 +1026,8 @@ class FastAPIProxyThread(QThread): # Renamed class def __init__(self, all_models_config: Dict[str, Dict[str, Any]], # Renamed models_config - runtimes_config: Dict[str, Dict[str, Any]], # Added runtimes_config + runtimes_config: Dict[str, Dict[str, Any]], # Added runtimes_config, + audio_config, is_model_running_callback: Callable[[str], bool], get_runner_port_callback: Callable[[str], Optional[int]], request_runner_start_callback: Callable[[str], asyncio.Future], # Callback now returns Future @@ -1009,6 +1037,7 @@ def __init__(self, super().__init__() self.all_models_config = all_models_config # Store all_models_config self.runtimes_config = runtimes_config # Store runtimes_config + self.audio_config = audio_config self.is_model_running_callback = is_model_running_callback self.get_runner_port_callback = get_runner_port_callback self.request_runner_start_callback = request_runner_start_callback # Store the callback @@ -1128,6 +1157,11 @@ async def run_async(self): app.state.request_runner_start_callback = self.request_runner_start_callback # Pass the new callback app.state.prompt_logging_enabled = self.prompt_logging_enabled # Set prompt logging flag on state app.state.prompts_logger = self.prompts_logger # Set prompts logger on state + + # Audio global variables + app.state.audio_config = self.audio_config + app.state.running_whisper = None + # Extract metadata for all models and store it in app.state.models_metadata # Note: get_all_models_lmstudio_format expects the main models config (all_models_config) app.state.models_metadata = gguf_metadata.get_all_models_lmstudio_format( diff --git a/llama_runner/main_window.py b/llama_runner/main_window.py index afb18e2..93f7296 100644 --- a/llama_runner/main_window.py +++ b/llama_runner/main_window.py @@ -37,7 +37,9 @@ def __init__(self): # Load settings from config with defaults self.prompt_logging_enabled = self.config.get('logging', {}).get('prompt_logging_enabled', False) self.llama_runtimes = self.config.get("llama-runtimes", {}) + self.audio_config = self.config.get("audio") self.default_runtime = self.config.get("default_runtime", "llama-server") + self.audio_models = self.audio_config.get("models", {}) self.models = self.config.get("models", {}) self.concurrent_runners_limit = self.config.get("concurrentRunners", 1) if not isinstance(self.concurrent_runners_limit, int) or self.concurrent_runners_limit < 1: @@ -63,7 +65,11 @@ def __init__(self): self.model_metadata_cache[model_name] = metadata else: logging.warning(f"Model '{model_name}' has no 'model_path' in config. Skipping metadata caching.") - + for model_name, model_config in self.audio_models.items(): + model_path = model_config.get("model_path") + if model_path: + logging.warning(f"Model '{model_name}' has no 'model_path' in config. Skipping metadata caching.") + self.fastapi_proxy_thread: Optional[FastAPIProxyThread] = None self.ollama_proxy_thread: Optional[OllamaProxyThread] = None @@ -114,6 +120,14 @@ def __init__(self): self.model_status_widgets[model_name] = status_widget status_widget.start_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.request_runner_start(name)) status_widget.stop_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.stop_llama_runner(name)) + for model_name in self.audio_models.keys(): + self.model_list_widget.addItem(model_name) + #model_metadata = self.model_metadata_cache.get(model_name) + status_widget = ModelStatusWidget(model_name) + self.model_status_stack.addWidget(status_widget) + self.model_status_widgets[model_name] = status_widget + status_widget.start_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.request_runner_start(name, True)) + status_widget.stop_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.stop_whisper_server(name)) self.model_list_widget.currentItemChanged.connect(self.on_model_selection_changed) @@ -136,13 +150,14 @@ def __init__(self): self.llama_runner_manager = LlamaRunnerManager( models=self.models, llama_runtimes=self.llama_runtimes, + audio_config=self.audio_config, default_runtime=self.default_runtime, model_status_widgets=self.model_status_widgets, # runner_port_ready_for_proxy and runner_stopped_for_proxy are now owned by LlamaRunnerManager parent=self, ) self.llama_runner_manager.set_concurrent_runners_limit(self.concurrent_runners_limit) - + # --- Start the FastAPI Proxy (for LM Studio) automatically if enabled --- if self.lmstudio_proxy_enabled: self.start_fastapi_proxy() @@ -248,6 +263,7 @@ def on_model_selection_changed(self, current_item, previous_item): self.model_status_stack.setCurrentWidget(self.no_model_selected_widget) else: self.model_status_stack.setCurrentWidget(self.no_model_selected_widget) + # Runner management methods moved to LlamaRunnerManager @@ -264,6 +280,7 @@ def start_fastapi_proxy(self): self.fastapi_proxy_thread = FastAPIProxyThread( all_models_config=self.models, runtimes_config=self.llama_runtimes, + audio_config=self.audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, @@ -293,7 +310,9 @@ def start_ollama_proxy(self): self.ollama_proxy_thread = OllamaProxyThread( all_models_config=self.models, runtimes_config=self.llama_runtimes, + audio_config=self.audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, + is_model_whisper_running=self.llama_runner_manager.is_whisper_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, prompt_logging_enabled=self.prompt_logging_enabled, diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py index 0a75e55..54778f5 100644 --- a/llama_runner/ollama_proxy_thread.py +++ b/llama_runner/ollama_proxy_thread.py @@ -20,6 +20,10 @@ chatRequestFromOllama, chatResponseToOllama ) +from llama_runner.whisper_cpp_runner import WhisperServer +from io import BytesIO +from fastapi import UploadFile as FastAPIUploadFile + # --- Create our own FastAPI app instance --- app = FastAPI() # --- End create app instance --- @@ -604,9 +608,6 @@ async def list_models(request: Request): raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing list models request") - -# --- End handlers for Ollama API endpoints --- - @app.post("/api/show") async def show_model_info(request: Request): """Handles Ollama /api/show requests.""" @@ -713,7 +714,6 @@ async def list_openai_models(request: Request): logging.error(f"Error handling /v1/models: {e}\n{traceback.format_exc()}") raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing list models request") -# --- End handlers for OpenAI compatible API endpoints (v1) --- @app.post("/v1/completions") async def openai_completions(request: Request): @@ -790,6 +790,32 @@ async def openai_embeddings(request: Request): raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing embeddings request") +@app.post("/v1/audio/transcriptions") +async def openai_speech_to_text(request: Request): + """Function to convert speech to text using whisper.cpp""" + try: + request_runner_start_callback = request.app.state.request_runner_start_callback + form = await request.form() + file = form.get("file") + contents = await file.read() + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + model = str(form.get("model")) + audio_config = request.app.state.audio_config + request_runner_start_callback(model, True) + whisper_server = WhisperServer(audio_config, model) + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + result = whisper_server.transcribe_audio(audio_file_path) + return JSONResponse(content=result) + except json.JSONDecodeError: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + except Exception as e: + logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + + +# --- End handlers for OpenAI compatible API endpoints (v1) --- + + class OllamaProxyThread(QThread): """ QThread to run the FastAPI proxy emulating the Ollama API in a separate thread. @@ -802,7 +828,9 @@ class OllamaProxyThread(QThread): def __init__(self, all_models_config: Dict[str, Dict[str, Any]], # Renamed models_config to all_models_config runtimes_config: Dict[str, Dict[str, Any]], # Renamed models_config to runtimes_config + audio_config, is_model_running_callback: Callable[[str], bool], + is_model_whisper_running, get_runner_port_callback: Callable[[str], Optional[int]], request_runner_start_callback: Callable[[str], asyncio.Future], prompt_logging_enabled: bool, # Add prompt logging flag @@ -810,8 +838,10 @@ def __init__(self, super().__init__() self.all_models_config = all_models_config # Store all_models_config self.runtimes_config = runtimes_config # Store runtimes_config + self.audio_config = audio_config self.is_model_running_callback = is_model_running_callback self.get_runner_port_callback = get_runner_port_callback + self.is_model_whisper_running = is_model_whisper_running self.request_runner_start_callback = request_runner_start_callback self.prompt_logging_enabled = prompt_logging_enabled # Store the flag self.prompts_logger = prompts_logger # Store the logger instance @@ -903,11 +933,15 @@ async def run_async(self): app.state.all_models_config = self.all_models_config # Pass all_models_config app.state.runtimes_config = self.runtimes_config # Pass runtimes_config app.state.is_model_running_callback = self.is_model_running_callback + app.state.is_model_whisper_running = self.is_model_whisper_running app.state.get_runner_port_callback = self.get_runner_port_callback app.state.request_runner_start_callback = self.request_runner_start_callback app.state.prompt_logging_enabled = self.prompt_logging_enabled # Set prompt logging flag on state app.state.prompts_logger = self.prompts_logger # Set prompts logger on state - + + # Audio global variables + app.state.audio_config = self.audio_config + # Use port 11434 as required for Ollama emulation uvicorn_config = uvicorn.Config(app, host="127.0.0.1", port=11434, reload=False) self._uvicorn_server = uvicorn.Server(uvicorn_config) diff --git a/llama_runner/whisper_cpp_runner.py b/llama_runner/whisper_cpp_runner.py new file mode 100644 index 0000000..19b9ca7 --- /dev/null +++ b/llama_runner/whisper_cpp_runner.py @@ -0,0 +1,155 @@ +import subprocess +import requests +import time +import logging +import os +from fastapi import UploadFile +from typing import Optional, Dict, Any, Union + + +class WhisperServer: + """ + Manage the startup of the whisper server and interaction with it. + """ + + def __init__(self, audio_config: Dict[str, Any], model_name: str): + """ + Initialize the server with audio configuration and model name. + + :param audio_config: audio config (normalized) + :param model_name: model name from audio_config['models'] + """ + self.audio_config = audio_config + self.model_name = model_name + + # Get model and runtime config + models = audio_config.get('models', {}) + model_conf = models.get(model_name, {}) + + self.runtime_name = model_conf.get('runtime', 'default') + runtimes = audio_config.get('runtimes', {}) + runtime_conf = runtimes.get(self.runtime_name, {}) + self.runtime_path = runtime_conf.get('runtime') + + if not self.runtime_path: + raise ValueError(f"Runtime path for '{self.runtime_name}' not defined in audio config.") + + self.model_path = model_conf.get("model_path") + if not self.model_path: + raise ValueError(f"Model path for '{self.model_name}' not defined in audio config.") + + # Compose launch command + self.cmd = [ + self.runtime_path, + '--model', self.model_path, + ] + + parameters = model_conf.get("parameters", {}) + if isinstance(parameters, dict): + for option, value in parameters.items(): + self.cmd.extend([f'--{option}', str(value)]) + + # Check if host and port exist, if not add with default values + default_host = 'localhost' + default_port = '9000' # string, since command list elements are strings + + if '--host' not in self.cmd: + self.cmd.extend(['--host', default_host]) + if '--port' not in self.cmd: + self.cmd.extend(['--port', default_port]) + + # Extract host and port from the command list + def get_cmd_param(cmd_list, param_name, default): + try: + idx = cmd_list.index(param_name) + return cmd_list[idx + 1] + except (ValueError, IndexError): + return default + + self.host = get_cmd_param(self.cmd, '--host', default_host) + self.port = get_cmd_param(self.cmd, '--port', default_port) + self.base_url = f'http://{self.host}:{self.port}' + + self.process: Optional[subprocess.Popen] = None + + def start_server(self, wait_seconds: float = 5.0) -> None: + """Start the whisper server with current parameters.""" + logging.info(f"Starting whisper-server with command: {' '.join(self.cmd)}") + self.process = subprocess.Popen(self.cmd) + print(f"Whisper-server started on {self.host}:{self.port} with model {self.model_name}") + time.sleep(wait_seconds) + + def stop_server(self) -> None: + """Stop the server if it is running.""" + if self.process and self.process.poll() is None: + self.process.terminate() + try: + self.process.wait(timeout=10) + except subprocess.TimeoutExpired: + self.process.kill() + self.process.wait() + print("Whisper-server stopped") + else: + print("Whisper-server is not running or already stopped.") + + def transcribe_audio(self, audio_path: str) -> Union[Dict[str, Any], None]: + """Send an audio file to the server for transcription and return the result.""" + url = f"{self.base_url}/inference" + data = {"response_format": "json"} + + try: + with open(audio_path, 'rb') as audio_file: + files = {'file': audio_file} + response = requests.post(url, files=files, data=data) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + logging.error(f"Error transcribing audio: {e}") + return None + + def convert_to_wav(self, input_file: UploadFile, output_path: Optional[str] = None) -> str: + """ + Convert incoming audio file to WAV (16kHz, mono, PCM s16le). + + :param input_file: Uploaded audio file + :param output_path: Path to save WAV file. Defaults to ~/.llama-runner/temp.wav + :return: Path to saved WAV file + """ + if output_path is None: + output_path = os.path.expanduser("~/.llama-runner/temp.wav") + + input_tmp_dir = os.path.dirname(output_path) + input_tmp_path = os.path.join(input_tmp_dir, "temp_input") + + os.makedirs(input_tmp_dir, exist_ok=True) + + try: + with open(input_tmp_path, "wb") as f: + f.write(input_file.file.read()) + finally: + input_file.file.close() + + cmd = [ + "ffmpeg", + "-y", + "-i", input_tmp_path, + "-ar", "16000", + "-ac", "1", + "-c:a", "pcm_s16le", + output_path + ] + + try: + subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as e: + error_msg = e.stderr.decode(errors='ignore') + raise RuntimeError(f"Error during audio conversion: {error_msg}") + finally: + if os.path.exists(input_tmp_path): + os.remove(input_tmp_path) + + return output_path + + + def get_port(self): + return int(self.port) From 7f0396ab319bb1c9ea5c1a43db9780bfc1c9dbcf Mon Sep 17 00:00:00 2001 From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com> Date: Fri, 15 Aug 2025 19:37:00 +0400 Subject: [PATCH 2/4] Fixed upload models counter --- llama_runner/llama_runner_manager.py | 42 +++++++++++++--------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py index 5a36ed2..5453e61 100644 --- a/llama_runner/llama_runner_manager.py +++ b/llama_runner/llama_runner_manager.py @@ -147,12 +147,30 @@ def get_runner_port(self, model_name: str) -> Optional[int]: def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future: logging.info(f"Received request to start runner for model: {model_name}") + + # Подсчёт запущенных раннеров (llama + whisper) + running_llama = sum(1 for thread in self.llama_runner_threads.values() if thread.isRunning()) + running_whisper = len(self.whisper_servers) + total_running = running_llama + running_whisper + + if total_running >= self.concurrent_runners_limit: + if self.concurrent_runners_limit == 1: + # При лимите 1 останавливаем все, чтобы запустить новый + self.stop_all_llama_runners() + self.stop_all_whisper_servers() + else: + future = asyncio.Future() + future.set_exception(RuntimeError( + f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start '{model_name}'.")) + self._runner_startup_futures[model_name] = future + QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) + return future + if iswhisper: if self.is_whisper_runner_running(model_name): logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") return self._runner_startup_futures[model_name] - - self.stop_all_whisper_servers() + self.start_whisper_server(model_name) future = asyncio.Future() future.set_result(self.get_whisper_port(model_name)) @@ -182,26 +200,6 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) return future - running_runners = {name: thread for name, thread in self.llama_runner_threads.items() if thread.isRunning()} - num_running = len(running_runners) - - if num_running >= self.concurrent_runners_limit: - if self.concurrent_runners_limit == 1: - models_to_stop = list(running_runners.keys()) - if models_to_stop: - logging.info(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Stopping existing runner(s): {models_to_stop} before starting {model_name}.") - for name_to_stop in models_to_stop: - self.stop_llama_runner(name_to_stop) - else: - logging.warning("Concurrent runner limit reached but no running runners found?") - else: - logging.warning(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start {model_name}.") - future = asyncio.Future() - future.set_exception(RuntimeError(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start '{model_name}'.")) - self._runner_startup_futures[model_name] = future - QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) - return future - future = asyncio.Future() self._runner_startup_futures[model_name] = future From 66eb9d829231a51eb016a68c2f96ff4a72034990 Mon Sep 17 00:00:00 2001 From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com> Date: Sat, 16 Aug 2025 00:20:44 +0400 Subject: [PATCH 3/4] Fix little bugs + add /v1/audio/translations --- llama_runner/llama_runner_manager.py | 51 ++++++++++++++++--- llama_runner/lmstudio_proxy_thread.py | 72 ++++++++++++++++++++++++++- llama_runner/ollama_proxy_thread.py | 72 ++++++++++++++++++++++++++- llama_runner/whisper_cpp_runner.py | 8 ++- 4 files changed, 194 insertions(+), 9 deletions(-) diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py index 5453e61..86d0825 100644 --- a/llama_runner/llama_runner_manager.py +++ b/llama_runner/llama_runner_manager.py @@ -3,23 +3,29 @@ import logging from typing import Optional, Dict, Any + from PySide6.QtCore import QObject, QTimer, QCoreApplication, QEvent, Signal, Slot + # Import the custom event classes from llama_runner.llama_runner_thread import LlamaRunnerThread, RunnerStoppedEvent, RunnerErrorEvent from llama_runner.error_output_dialog import ErrorOutputDialog + from llama_runner.whisper_cpp_runner import WhisperServer + class LlamaRunnerManager(QObject): # Define a custom event type for events posted to the parent (e.g., MainWindow) # This replaces the QEvent.User + 4 magic number. MANAGER_PARENT_NOTIFICATION_EVENT_TYPE = QEvent.Type(QEvent.registerEventType()) + # Define signals directly as class attributes runner_port_ready_for_proxy = Signal(str, int) runner_stopped_for_proxy = Signal(str) + def __init__( self, models: dict, @@ -40,6 +46,7 @@ def __init__( # self.runner_port_ready_for_proxy = runner_port_ready_for_proxy # Removed, now class attribute # self.runner_stopped_for_proxy = runner_stopped_for_proxy # Removed, now class attribute + self.llama_runner_threads: Dict[str, LlamaRunnerThread] = {} self._runner_startup_futures: Dict[str, asyncio.Future] = {} self._current_running_model: Optional[str] = None @@ -51,7 +58,7 @@ def __init__( def start_whisper_server(self, model_name: str) -> None: """ - Создать и запустить whisper сервер для модели. + Create and start a whisper server for the model. """ if model_name in self.whisper_servers: logging.info(f"WhisperServer already started for model {model_name}") @@ -64,11 +71,13 @@ def start_whisper_server(self, model_name: str) -> None: status_widget.update_port("N/A") status_widget.set_buttons_enabled(False, False) + whisper_server = WhisperServer(self.audio_config, model_name) whisper_server.start_server() self.whisper_servers[model_name] = whisper_server port = whisper_server.get_port() + logging.info(f"Whisper Runner started for model {model_name}") print(f"Whisper Runner for {model_name} ready on port {port}.") @@ -82,9 +91,10 @@ def start_whisper_server(self, model_name: str) -> None: logging.error(f"Failed to start WhisperServer for {model_name}: {e}") + def stop_whisper_server(self, model_name: str) -> None: """ - Остановить whisper сервер для модели. + Stop the whisper server for the model. """ whisper_server = self.whisper_servers.get(model_name) if whisper_server: @@ -107,9 +117,10 @@ def stop_whisper_server(self, model_name: str) -> None: logging.warning(f"WhisperServer for {model_name} not running") + def stop_all_whisper_servers(self) -> None: """ - Остановить все запущенные whisper сервера. + Stop all running whisper servers. """ for model_name in list(self.whisper_servers.keys()): self.stop_whisper_server(model_name) @@ -130,32 +141,37 @@ def get_whisper_port(self, model_name: str) -> Optional[int]: + def set_concurrent_runners_limit(self, limit: int): self.concurrent_runners_limit = limit + def is_llama_runner_running(self, model_name: str) -> bool: thread = self.llama_runner_threads.get(model_name) if thread and thread.isRunning() and thread.runner and thread.runner.is_running(): return True return False + def get_runner_port(self, model_name: str) -> Optional[int]: thread = self.llama_runner_threads.get(model_name) if thread and thread.isRunning() and thread.runner and thread.runner.is_running(): return thread.runner.get_port() return None + def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future: logging.info(f"Received request to start runner for model: {model_name}") - # Подсчёт запущенных раннеров (llama + whisper) + # Count running runners (llama + whisper) running_llama = sum(1 for thread in self.llama_runner_threads.values() if thread.isRunning()) running_whisper = len(self.whisper_servers) total_running = running_llama + running_whisper + if total_running >= self.concurrent_runners_limit: if self.concurrent_runners_limit == 1: - # При лимите 1 останавливаем все, чтобы запустить новый + # With limit 1, stop all to start new one self.stop_all_llama_runners() self.stop_all_whisper_servers() else: @@ -166,11 +182,13 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) return future + if iswhisper: if self.is_whisper_runner_running(model_name): logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") return self._runner_startup_futures[model_name] + self.start_whisper_server(model_name) future = asyncio.Future() future.set_result(self.get_whisper_port(model_name)) @@ -179,10 +197,12 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn return future + if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done(): logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") return self._runner_startup_futures[model_name] + if self.is_llama_runner_running(model_name): port = self.get_runner_port(model_name) if port is not None: @@ -200,14 +220,17 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) return future + future = asyncio.Future() self._runner_startup_futures[model_name] = future + model_config = self.models[model_name] model_path = model_config.get("model_path") llama_cpp_runtime_key = model_config.get("llama_cpp_runtime", "default") _raw_llama_cpp_runtime_config = self.llama_runtimes.get(llama_cpp_runtime_key, self.default_runtime) + if isinstance(_raw_llama_cpp_runtime_config, dict): llama_cpp_runtime_command = _raw_llama_cpp_runtime_config.get("runtime") if not llama_cpp_runtime_command: @@ -221,21 +244,25 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn future.set_exception(RuntimeError(f"Invalid runtime configuration type for '{llama_cpp_runtime_key}'.")) return future + if not model_path: logging.error(f"Configuration Error: Model '{model_name}' has no 'model_path' specified in config.json.") future.set_exception(RuntimeError(f"Configuration Error: Model '{model_name}' has no 'model_path'.")) return future + if not os.path.exists(model_path): logging.error(f"File Not Found: Model file not found: {model_path}") future.set_exception(FileNotFoundError(f"Model file not found: {model_path}")) return future + if llama_cpp_runtime_key != "default" and not os.path.exists(llama_cpp_runtime_command): logging.error(f"Runtime Not Found: Llama.cpp runtime not found: {llama_cpp_runtime_command}") future.set_exception(FileNotFoundError(f"Llama.cpp runtime not found: {llama_cpp_runtime_command}")) return future + print(f"Starting Llama Runner for {model_name}...") status_widget = self.model_status_widgets.get(model_name) if status_widget: @@ -243,6 +270,7 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn status_widget.update_port("N/A") status_widget.set_buttons_enabled(False, False) + thread = LlamaRunnerThread( model_name=model_name, model_path=model_path, @@ -255,16 +283,20 @@ def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyn thread.error.connect(lambda message, output_buffer, name=model_name: self.on_llama_runner_error(name, message, output_buffer)) # thread.stopped.connect(lambda name=model_name: self.on_llama_runner_stopped(name)) # Removed, now handled by customEvent + self.llama_runner_threads[model_name] = thread thread.start() + return future + def _cleanup_completed_future(self, model_name: str): if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done(): logging.debug(f"Cleaning up completed future for {model_name}") del self._runner_startup_futures[model_name] + def stop_llama_runner(self, model_name: str): if model_name in self.llama_runner_threads and self.llama_runner_threads[model_name].isRunning(): print(f"Stopping Llama Runner for {model_name}...") @@ -287,6 +319,7 @@ def stop_llama_runner(self, model_name: str): # Or, if an event is preferred for the manager itself: # QCoreApplication.instance().postEvent(self, RunnerStoppedEvent(model_name)) + # Original logic: post a generic event to parent. # This might be for a different purpose than the thread's stopped event. parent_event = QEvent(LlamaRunnerManager.MANAGER_PARENT_NOTIFICATION_EVENT_TYPE) @@ -301,6 +334,7 @@ def stop_llama_runner(self, model_name: str): self.on_llama_runner_stopped(model_name) + def stop_all_llama_runners(self): print("Stopping all Llama Runners...") # Collect running threads first to avoid modifying the dict during iteration @@ -315,6 +349,7 @@ def stop_all_llama_runners(self): for model_name, thread in running_threads: thread.wait() + @Slot(str) def on_llama_runner_started(self, model_name: str): status_widget = self.model_status_widgets.get(model_name) @@ -322,6 +357,7 @@ def on_llama_runner_started(self, model_name: str): status_widget.update_status("Starting...") status_widget.set_buttons_enabled(False, False) + @Slot(str) def on_llama_runner_stopped(self, model_name: str): print(f"Llama Runner for {model_name} stopped.") @@ -343,6 +379,7 @@ def on_llama_runner_stopped(self, model_name: str): else: logging.warning(f"Stopped signal received for unknown or already cleaned up model: {model_name}") + @Slot(str, str, list) def on_llama_runner_error(self, model_name: str, message: str, output_buffer: list): print(f"Llama Runner for {model_name} error: {message}") @@ -361,6 +398,7 @@ def on_llama_runner_error(self, model_name: str, message: str, output_buffer: li logging.debug(f"Runner {model_name} errored while startup Future was pending.") self._runner_startup_futures[model_name].set_exception(RuntimeError(f"Runner for {model_name} errored during startup: {message}")) + def customEvent(self, event: QEvent): # Handle custom stopped event from LlamaRunnerThread if event.type() == RunnerStoppedEvent.EVENT_TYPE: @@ -376,6 +414,7 @@ def customEvent(self, event: QEvent): else: super().customEvent(event) + @Slot(str, int) def on_llama_runner_port_ready_and_emit(self, model_name: str, port: int): print(f"Llama Runner for {model_name} ready on port {port}.") @@ -395,6 +434,7 @@ def on_llama_runner_port_ready_and_emit(self, model_name: str, port: int): logging.info(f"Set current running model: {model_name}") self.runner_port_ready_for_proxy.emit(model_name, port) + @Slot() def check_runner_statuses(self): for model_name, thread in list(self.llama_runner_threads.items()): @@ -411,4 +451,3 @@ def check_runner_statuses(self): app_instance.postEvent(parent_object, parent_event) else: logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.") - \ No newline at end of file diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py index 44878b8..c550a1c 100644 --- a/llama_runner/lmstudio_proxy_thread.py +++ b/llama_runner/lmstudio_proxy_thread.py @@ -982,22 +982,92 @@ async def _v1_embeddings_handler(request: Request): async def openai_speech_to_text(request: Request): """Function to convert speech to text using whisper.cpp""" try: + # Retrieve callback function to notify request start request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request form = await request.form() + # Extract uploaded audio file from form file = form.get("file") + # Read the content of the uploaded file into bytes contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data model = str(form.get("model")) + # Retrieve audio configuration from app state audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started request_runner_start_callback(model, True) + + # Initialize WhisperServer with audio configuration and model whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription on the WAV audio file result = whisper_server.transcribe_audio(audio_file_path) + # Return transcription result as JSON response return JSONResponse(content=result) + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + + except Exception as e: + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + + +@app.post("/v1/audio/translations") +async def openai_speech_to_text_translate(request: Request): + """Function to convert speech to text using whisper.cpp with translation""" + try: + # Retrieve callback function to notify request start + request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request + form = await request.form() + # Extract uploaded audio file from form + file = form.get("file") + # Read the content of the uploaded file into bytes + contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data + model = str(form.get("model")) + + # Save the current audio config to restore later + prev_audio_config = request.app.state.audio_config + # Remove "language" parameter if it exists for this model's config + if request.app.state.audio_config["models"][model]["parameters"].get("language", False): + del request.app.state.audio_config["models"][model]["parameters"]["language"] + # Enable translation mode by setting "translate" parameter + request.app.state.audio_config["models"][model]["parameters"]["translate"] = "" + # Retrieve updated audio configuration + audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started + request_runner_start_callback(model, True) + # Initialize WhisperServer with updated audio configuration and model + whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription with translation on the WAV audio file + result = whisper_server.transcribe_audio(audio_file_path) + + # Restore previous audio config after request processed + request.app.state.audio_config = prev_audio_config + + # Return transcription and translation result as JSON response + return JSONResponse(content=result) + + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + except Exception as e: - logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}") + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py index 54778f5..17bb1f5 100644 --- a/llama_runner/ollama_proxy_thread.py +++ b/llama_runner/ollama_proxy_thread.py @@ -794,25 +794,95 @@ async def openai_embeddings(request: Request): async def openai_speech_to_text(request: Request): """Function to convert speech to text using whisper.cpp""" try: + # Retrieve callback function to notify request start request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request form = await request.form() + # Extract uploaded audio file from form file = form.get("file") + # Read the content of the uploaded file into bytes contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data model = str(form.get("model")) + # Retrieve audio configuration from app state audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started request_runner_start_callback(model, True) + + # Initialize WhisperServer with audio configuration and model whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription on the WAV audio file result = whisper_server.transcribe_audio(audio_file_path) + # Return transcription result as JSON response return JSONResponse(content=result) + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + except Exception as e: - logging.error(f"Error handling /audio/transcriptions: {e}\n{traceback.format_exc()}") + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + +@app.post("/v1/audio/translations") +async def openai_speech_to_text_translate(request: Request): + """Function to convert speech to text using whisper.cpp with translation""" + try: + # Retrieve callback function to notify request start + request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request + form = await request.form() + # Extract uploaded audio file from form + file = form.get("file") + # Read the content of the uploaded file into bytes + contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data + model = str(form.get("model")) + + # Save the current audio config to restore later + prev_audio_config = request.app.state.audio_config + # Remove "language" parameter if it exists for this model's config + if request.app.state.audio_config["models"][model]["parameters"].get("language", False): + del request.app.state.audio_config["models"][model]["parameters"]["language"] + # Enable translation mode by setting "translate" parameter + request.app.state.audio_config["models"][model]["parameters"]["translate"] = "" + # Retrieve updated audio configuration + audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started + request_runner_start_callback(model, True) + # Initialize WhisperServer with updated audio configuration and model + whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription with translation on the WAV audio file + result = whisper_server.transcribe_audio(audio_file_path) + + # Restore previous audio config after request processed + request.app.state.audio_config = prev_audio_config + + # Return transcription and translation result as JSON response + return JSONResponse(content=result) + + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + + except Exception as e: + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + # --- End handlers for OpenAI compatible API endpoints (v1) --- diff --git a/llama_runner/whisper_cpp_runner.py b/llama_runner/whisper_cpp_runner.py index 19b9ca7..a77cb97 100644 --- a/llama_runner/whisper_cpp_runner.py +++ b/llama_runner/whisper_cpp_runner.py @@ -57,7 +57,13 @@ def __init__(self, audio_config: Dict[str, Any], model_name: str): self.cmd.extend(['--host', default_host]) if '--port' not in self.cmd: self.cmd.extend(['--port', default_port]) - + + # Remove '' in self.cmd if exists + for i in range(len(self.cmd)): + if self.cmd[i] == '': + self.cmd.pop(i) + + # Extract host and port from the command list def get_cmd_param(cmd_list, param_name, default): try: From 979a0cacb7685c2a59f9b5fc27b923cfdf77433b Mon Sep 17 00:00:00 2001 From: Mikhail Komarov <144356904+hardWorker254@users.noreply.github.com> Date: Sun, 7 Sep 2025 14:13:10 +0400 Subject: [PATCH 4/4] Method not allowed fix Fixed error: OPTIONS /v1/chat/completions 405 Method not allowed --- llama_runner/lmstudio_proxy_thread.py | 8 ++++++++ llama_runner/ollama_proxy_thread.py | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py index c550a1c..3ae0c7a 100644 --- a/llama_runner/lmstudio_proxy_thread.py +++ b/llama_runner/lmstudio_proxy_thread.py @@ -23,12 +23,20 @@ from llama_runner.whisper_cpp_runner import WhisperServer from io import BytesIO from fastapi import UploadFile as FastAPIUploadFile +from fastapi.middleware.cors import CORSMiddleware # Configure logging (already done in main.py for configurable levels) # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # --- Create our own FastAPI app instance --- app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) # --- End create app instance --- diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py index 17bb1f5..e78a936 100644 --- a/llama_runner/ollama_proxy_thread.py +++ b/llama_runner/ollama_proxy_thread.py @@ -23,9 +23,17 @@ from llama_runner.whisper_cpp_runner import WhisperServer from io import BytesIO from fastapi import UploadFile as FastAPIUploadFile +from fastapi.middleware.cors import CORSMiddleware # --- Create our own FastAPI app instance --- app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) # --- End create app instance --- # Define standalone handlers that access state via app.state @@ -743,8 +751,8 @@ async def openai_chat_completions(request: Request): """Handles OpenAI /v1/chat/completions requests.""" try: request_body = await request.json() + print(request_body) # No conversion needed here, assuming the incoming request is already OpenAI format - async def chat_completion_response_stream(): # The dynamic router handles the forwarding and streaming async for chunk in _dynamic_route_runner_request_generator(request, target_path="/v1/chat/completions", request_body=request_body):