diff --git a/llama_runner/config_loader.py b/llama_runner/config_loader.py index c3a5921..98ad3e8 100644 --- a/llama_runner/config_loader.py +++ b/llama_runner/config_loader.py @@ -2,18 +2,22 @@ import json import logging + CONFIG_DIR = os.path.expanduser("~/.llama-runner") CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json") LOG_FILE = os.path.join(CONFIG_DIR, "error.log") + # Ensure the log directory exists if not os.path.exists(CONFIG_DIR): os.makedirs(CONFIG_DIR, exist_ok=True) + # Set up logging logging.basicConfig(filename=LOG_FILE, level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s') + def ensure_config_exists(): """ Ensures that the configuration directory and file exist. @@ -27,6 +31,7 @@ def ensure_config_exists(): logging.error(f"Error creating config directory: {e}") return False + if not os.path.exists(CONFIG_FILE): try: default_config = { @@ -48,6 +53,7 @@ def ensure_config_exists(): return False return True + def load_config(): """ Loads the configuration from the JSON file. @@ -56,21 +62,23 @@ def load_config(): if not ensure_config_exists(): return {} + try: with open(CONFIG_FILE, "r") as f: config = json.load(f) - # Ensure default_runtime and concurrentRunners exist if "default_runtime" not in config: config["default_runtime"] = "llama-server" if "concurrentRunners" not in config: config["concurrentRunners"] = 1 + # Ensure proxies section and its sub-keys exist with defaults proxies_config = config.get("proxies", {}) if not isinstance(proxies_config, dict): # Handle case where 'proxies' might exist but not as a dict proxies_config = {} + ollama_proxy_config = proxies_config.get("ollama", {}) if not isinstance(ollama_proxy_config, dict): ollama_proxy_config = {} @@ -78,6 +86,7 @@ def load_config(): ollama_proxy_config["enabled"] = True proxies_config["ollama"] = ollama_proxy_config + lmstudio_proxy_config = proxies_config.get("lmstudio", {}) if not isinstance(lmstudio_proxy_config, dict): lmstudio_proxy_config = {} @@ -89,6 +98,7 @@ def load_config(): config["proxies"] = proxies_config + # Ensure logging section and its sub-keys exist with defaults logging_config = config.get("logging", {}) if not isinstance(logging_config, dict): # Handle case where 'logging' might exist but not as a dict @@ -134,6 +144,80 @@ def load_config(): print("Warning: Config: 'llama-runtimes' key exists but is not a dictionary. Ignoring.") # If 'llama-runtimes' is not in config or is None, it's handled gracefully (no changes made to it) + + + raw_audio = config.get("audio") + if isinstance(raw_audio, dict): + # Process runtimes + raw_runtimes = raw_audio.get("runtimes") + processed_runtimes = {} + if isinstance(raw_runtimes, dict): + for name, details in raw_runtimes.items(): + if isinstance(details, dict): + runtime_path = details.get("runtime") + if isinstance(runtime_path, str) and runtime_path.strip(): + processed_runtimes[name] = { + "runtime": runtime_path.strip() + } + else: + logging.warning(f"Config: Audio runtime '{name}' has invalid or empty 'runtime' path. Skipping.") + print(f"Warning: Config: Audio runtime '{name}' has invalid or empty 'runtime' path. Skipping.") + else: + logging.warning(f"Config: Audio runtime '{name}' details should be a dictionary. Skipping.") + print(f"Warning: Config: Audio runtime '{name}' details should be a dictionary. Skipping.") + elif raw_runtimes is not None: + logging.warning("Config: 'audio.runtimes' exists but is not a dictionary. Ignoring.") + print("Warning: Config: 'audio.runtimes' exists but is not a dictionary. Ignoring.") + + + # Process models + raw_models = raw_audio.get("models") + processed_models = {} + if isinstance(raw_models, dict): + for model_name, model_info in raw_models.items(): + if isinstance(model_info, dict): + model_path = model_info.get("model_path") + runtime = model_info.get("runtime") + parameters = model_info.get("parameters", {}) + if isinstance(model_path, str) and model_path.strip(): + if isinstance(parameters, dict): + processed_models[model_name] = { + "model_path": model_path.strip(), + "runtime": runtime, + "parameters": parameters + } + else: + logging.warning(f"Config: Parameters for model '{model_name}' should be a dictionary. Using empty dict instead.") + print(f"Warning: Config: Parameters for model '{model_name}' should be a dictionary. Using empty dict instead.") + processed_models[model_name] = { + "model_path": model_path.strip(), + "runtime": runtime, + "parameters": {} + } + else: + logging.warning(f"Config: Model '{model_name}' has invalid or empty 'model_path'. Skipping.") + print(f"Warning: Config: Model '{model_name}' has invalid or empty 'model_path'. Skipping.") + else: + logging.warning(f"Config: Model entry '{model_name}' is not a dictionary. Skipping.") + print(f"Warning: Config: Model entry '{model_name}' is not a dictionary. Skipping.") + elif raw_models is not None: + logging.warning("Config: 'audio.models' exists but is not a dictionary. Ignoring.") + print("Warning: Config: 'audio.models' exists but is not a dictionary. Ignoring.") + + + # Update the audio section in config + config["audio"] = { + "runtimes": processed_runtimes, + "models": processed_models + } + + + elif raw_audio is not None: + logging.warning("Config: 'audio' key exists but is not a dictionary. Ignoring.") + print("Warning: Config: 'audio' key exists but is not a dictionary. Ignoring.") + + + print(f"Loaded config (processed): {config}") # Print loaded config return config except (OSError, json.JSONDecodeError) as e: @@ -141,6 +225,7 @@ def load_config(): logging.error(f"Error loading config file: {e}") return {} + def calculate_system_fingerprint(config: dict) -> str: """Calculates a 16-character hash of the configuration parameters.""" import hashlib @@ -149,6 +234,7 @@ def calculate_system_fingerprint(config: dict) -> str: hash_object = hashlib.md5(config_str.encode()) return hash_object.hexdigest()[:16] + if __name__ == '__main__': # Example usage: config = load_config() diff --git a/llama_runner/headless_service_manager.py b/llama_runner/headless_service_manager.py index 54bfe75..25e7d1a 100644 --- a/llama_runner/headless_service_manager.py +++ b/llama_runner/headless_service_manager.py @@ -35,7 +35,8 @@ def _initialize_services(self): # Initialize LlamaRunnerManager self.llama_runner_manager = LlamaRunnerManager( models=self.models_specific_config, # This is app_config['models'] - llama_runtimes=self.app_config.get('llama-runtimes', {}), # Ensure correct key + llama_runtimes=self.app_config.get('llama-runtimes', {}), # Ensure correct key, + audio_config=self.app_config.get('audio', {}), default_runtime=self.app_config.get('default_runtime', 'llama-server'), # Ensure correct key and default model_status_widgets={} # No UI widgets in headless mode ) @@ -48,6 +49,7 @@ def _initialize_services(self): # Get proxy and logging settings from the unified config proxies_config = self.app_config.get('proxies', {}) + audio_config = self.app_config.get('audio', {}) ollama_proxy_settings = proxies_config.get('ollama', {}) lmstudio_proxy_settings = proxies_config.get('lmstudio', {}) logging_settings = self.app_config.get('logging', {}) @@ -63,7 +65,9 @@ def _initialize_services(self): self.ollama_proxy = OllamaProxyThread( all_models_config=self.models_specific_config, runtimes_config=self.app_config.get('llama-runtimes', {}), + audio_config=audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, + is_model_whisper_running=self.llama_runner_manager.is_whisper_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, prompt_logging_enabled=prompt_logging_enabled, @@ -80,6 +84,7 @@ def _initialize_services(self): self.lmstudio_proxy = LMStudioProxyThread( # Using the aliased FastAPIProxyThread all_models_config=self.models_specific_config, runtimes_config=self.app_config.get('llama-runtimes', {}), + audio_config=audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, diff --git a/llama_runner/llama_cpp_runner.py b/llama_runner/llama_cpp_runner.py index 0ba32eb..15c13e2 100644 --- a/llama_runner/llama_cpp_runner.py +++ b/llama_runner/llama_cpp_runner.py @@ -84,7 +84,6 @@ async def start(self): command.append(f"--{arg_name}") command.append(str(value)) - print(f"Starting llama.cpp server with command: {' '.join(command)}") logging.info(f"Starting llama.cpp server with command: {' '.join(command)}") # Clear the output buffer before starting a new process diff --git a/llama_runner/llama_runner_manager.py b/llama_runner/llama_runner_manager.py index 5b578b6..86d0825 100644 --- a/llama_runner/llama_runner_manager.py +++ b/llama_runner/llama_runner_manager.py @@ -3,26 +3,35 @@ import logging from typing import Optional, Dict, Any + from PySide6.QtCore import QObject, QTimer, QCoreApplication, QEvent, Signal, Slot + # Import the custom event classes from llama_runner.llama_runner_thread import LlamaRunnerThread, RunnerStoppedEvent, RunnerErrorEvent from llama_runner.error_output_dialog import ErrorOutputDialog + +from llama_runner.whisper_cpp_runner import WhisperServer + + class LlamaRunnerManager(QObject): # Define a custom event type for events posted to the parent (e.g., MainWindow) # This replaces the QEvent.User + 4 magic number. MANAGER_PARENT_NOTIFICATION_EVENT_TYPE = QEvent.Type(QEvent.registerEventType()) + # Define signals directly as class attributes runner_port_ready_for_proxy = Signal(str, int) runner_stopped_for_proxy = Signal(str) + def __init__( self, models: dict, llama_runtimes: dict, default_runtime: str, + audio_config, model_status_widgets: dict, # runner_port_ready_for_proxy and runner_stopped_for_proxy are now class attributes parent=None, @@ -37,33 +46,163 @@ def __init__( # self.runner_port_ready_for_proxy = runner_port_ready_for_proxy # Removed, now class attribute # self.runner_stopped_for_proxy = runner_stopped_for_proxy # Removed, now class attribute + self.llama_runner_threads: Dict[str, LlamaRunnerThread] = {} self._runner_startup_futures: Dict[str, asyncio.Future] = {} self._current_running_model: Optional[str] = None self.concurrent_runners_limit = 1 # Will be set by MainWindow after instantiation + self.audio_config = audio_config + self.whisper_servers: Dict[str, WhisperServer] = {} + + + + def start_whisper_server(self, model_name: str) -> None: + """ + Create and start a whisper server for the model. + """ + if model_name in self.whisper_servers: + logging.info(f"WhisperServer already started for model {model_name}") + return + try: + print(f"Starting Runner for {model_name}...") + status_widget = self.model_status_widgets.get(model_name) + if status_widget: + status_widget.update_status("Starting...") + status_widget.update_port("N/A") + status_widget.set_buttons_enabled(False, False) + + + whisper_server = WhisperServer(self.audio_config, model_name) + whisper_server.start_server() + self.whisper_servers[model_name] = whisper_server + port = whisper_server.get_port() + + + logging.info(f"Whisper Runner started for model {model_name}") + + print(f"Whisper Runner for {model_name} ready on port {port}.") + status_widget = self.model_status_widgets.get(model_name) + if status_widget: + status_widget.update_port(port) + status_widget.update_status("Running") + status_widget.set_buttons_enabled(False, True) + + except Exception as e: + logging.error(f"Failed to start WhisperServer for {model_name}: {e}") + + + + def stop_whisper_server(self, model_name: str) -> None: + """ + Stop the whisper server for the model. + """ + whisper_server = self.whisper_servers.get(model_name) + if whisper_server: + try: + status_widget = self.model_status_widgets.get(model_name) + if status_widget: + status_widget.update_status("Stopping...") + status_widget.set_buttons_enabled(False, False) + print(f"Stopping Runner for {model_name}...") + whisper_server.stop_server() + del self.whisper_servers[model_name] + if status_widget: + status_widget.update_status("Not Running") + status_widget.update_port("N/A") + status_widget.set_buttons_enabled(True, False) + logging.info(f"WhisperServer stopped for model {model_name}") + except Exception as e: + logging.error(f"Error stopping WhisperServer for {model_name}: {e}") + else: + logging.warning(f"WhisperServer for {model_name} not running") + + + + def stop_all_whisper_servers(self) -> None: + """ + Stop all running whisper servers. + """ + for model_name in list(self.whisper_servers.keys()): + self.stop_whisper_server(model_name) + + + def is_whisper_runner_running(self, model_name: str) -> bool: + whisper_class = self.whisper_servers.get(model_name) + if whisper_class: + return True + return False + + + def get_whisper_port(self, model_name: str) -> Optional[int]: + whisper_class = self.whisper_servers.get(model_name) + if whisper_class: + return whisper_class.get_port() + return None + + + def set_concurrent_runners_limit(self, limit: int): self.concurrent_runners_limit = limit + def is_llama_runner_running(self, model_name: str) -> bool: thread = self.llama_runner_threads.get(model_name) if thread and thread.isRunning() and thread.runner and thread.runner.is_running(): return True return False + def get_runner_port(self, model_name: str) -> Optional[int]: thread = self.llama_runner_threads.get(model_name) if thread and thread.isRunning() and thread.runner and thread.runner.is_running(): return thread.runner.get_port() return None - def request_runner_start(self, model_name: str) -> asyncio.Future: + + def request_runner_start(self, model_name: str, iswhisper: bool = False) -> asyncio.Future: logging.info(f"Received request to start runner for model: {model_name}") + + # Count running runners (llama + whisper) + running_llama = sum(1 for thread in self.llama_runner_threads.values() if thread.isRunning()) + running_whisper = len(self.whisper_servers) + total_running = running_llama + running_whisper + + + if total_running >= self.concurrent_runners_limit: + if self.concurrent_runners_limit == 1: + # With limit 1, stop all to start new one + self.stop_all_llama_runners() + self.stop_all_whisper_servers() + else: + future = asyncio.Future() + future.set_exception(RuntimeError( + f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start '{model_name}'.")) + self._runner_startup_futures[model_name] = future + QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) + return future + + + if iswhisper: + if self.is_whisper_runner_running(model_name): + logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") + return self._runner_startup_futures[model_name] + + + self.start_whisper_server(model_name) + future = asyncio.Future() + future.set_result(self.get_whisper_port(model_name)) + self._runner_startup_futures[model_name] = future + QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) + return future + + if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done(): logging.info(f"Runner for {model_name} is already starting. Returning existing Future.") return self._runner_startup_futures[model_name] + if self.is_llama_runner_running(model_name): port = self.get_runner_port(model_name) if port is not None: @@ -81,34 +220,17 @@ def request_runner_start(self, model_name: str) -> asyncio.Future: QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) return future - running_runners = {name: thread for name, thread in self.llama_runner_threads.items() if thread.isRunning()} - num_running = len(running_runners) - - if num_running >= self.concurrent_runners_limit: - if self.concurrent_runners_limit == 1: - models_to_stop = list(running_runners.keys()) - if models_to_stop: - logging.info(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Stopping existing runner(s): {models_to_stop} before starting {model_name}.") - for name_to_stop in models_to_stop: - self.stop_llama_runner(name_to_stop) - else: - logging.warning("Concurrent runner limit reached but no running runners found?") - else: - logging.warning(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start {model_name}.") - future = asyncio.Future() - future.set_exception(RuntimeError(f"Concurrent runner limit ({self.concurrent_runners_limit}) reached. Cannot start '{model_name}'.")) - self._runner_startup_futures[model_name] = future - QTimer.singleShot(1000, lambda: self._cleanup_completed_future(model_name)) - return future future = asyncio.Future() self._runner_startup_futures[model_name] = future + model_config = self.models[model_name] model_path = model_config.get("model_path") llama_cpp_runtime_key = model_config.get("llama_cpp_runtime", "default") _raw_llama_cpp_runtime_config = self.llama_runtimes.get(llama_cpp_runtime_key, self.default_runtime) + if isinstance(_raw_llama_cpp_runtime_config, dict): llama_cpp_runtime_command = _raw_llama_cpp_runtime_config.get("runtime") if not llama_cpp_runtime_command: @@ -122,21 +244,25 @@ def request_runner_start(self, model_name: str) -> asyncio.Future: future.set_exception(RuntimeError(f"Invalid runtime configuration type for '{llama_cpp_runtime_key}'.")) return future + if not model_path: logging.error(f"Configuration Error: Model '{model_name}' has no 'model_path' specified in config.json.") future.set_exception(RuntimeError(f"Configuration Error: Model '{model_name}' has no 'model_path'.")) return future + if not os.path.exists(model_path): logging.error(f"File Not Found: Model file not found: {model_path}") future.set_exception(FileNotFoundError(f"Model file not found: {model_path}")) return future + if llama_cpp_runtime_key != "default" and not os.path.exists(llama_cpp_runtime_command): logging.error(f"Runtime Not Found: Llama.cpp runtime not found: {llama_cpp_runtime_command}") future.set_exception(FileNotFoundError(f"Llama.cpp runtime not found: {llama_cpp_runtime_command}")) return future + print(f"Starting Llama Runner for {model_name}...") status_widget = self.model_status_widgets.get(model_name) if status_widget: @@ -144,6 +270,7 @@ def request_runner_start(self, model_name: str) -> asyncio.Future: status_widget.update_port("N/A") status_widget.set_buttons_enabled(False, False) + thread = LlamaRunnerThread( model_name=model_name, model_path=model_path, @@ -156,16 +283,20 @@ def request_runner_start(self, model_name: str) -> asyncio.Future: thread.error.connect(lambda message, output_buffer, name=model_name: self.on_llama_runner_error(name, message, output_buffer)) # thread.stopped.connect(lambda name=model_name: self.on_llama_runner_stopped(name)) # Removed, now handled by customEvent + self.llama_runner_threads[model_name] = thread thread.start() + return future + def _cleanup_completed_future(self, model_name: str): if model_name in self._runner_startup_futures and not self._runner_startup_futures[model_name].done(): logging.debug(f"Cleaning up completed future for {model_name}") del self._runner_startup_futures[model_name] + def stop_llama_runner(self, model_name: str): if model_name in self.llama_runner_threads and self.llama_runner_threads[model_name].isRunning(): print(f"Stopping Llama Runner for {model_name}...") @@ -188,6 +319,7 @@ def stop_llama_runner(self, model_name: str): # Or, if an event is preferred for the manager itself: # QCoreApplication.instance().postEvent(self, RunnerStoppedEvent(model_name)) + # Original logic: post a generic event to parent. # This might be for a different purpose than the thread's stopped event. parent_event = QEvent(LlamaRunnerManager.MANAGER_PARENT_NOTIFICATION_EVENT_TYPE) @@ -202,6 +334,7 @@ def stop_llama_runner(self, model_name: str): self.on_llama_runner_stopped(model_name) + def stop_all_llama_runners(self): print("Stopping all Llama Runners...") # Collect running threads first to avoid modifying the dict during iteration @@ -216,6 +349,7 @@ def stop_all_llama_runners(self): for model_name, thread in running_threads: thread.wait() + @Slot(str) def on_llama_runner_started(self, model_name: str): status_widget = self.model_status_widgets.get(model_name) @@ -223,6 +357,7 @@ def on_llama_runner_started(self, model_name: str): status_widget.update_status("Starting...") status_widget.set_buttons_enabled(False, False) + @Slot(str) def on_llama_runner_stopped(self, model_name: str): print(f"Llama Runner for {model_name} stopped.") @@ -244,6 +379,7 @@ def on_llama_runner_stopped(self, model_name: str): else: logging.warning(f"Stopped signal received for unknown or already cleaned up model: {model_name}") + @Slot(str, str, list) def on_llama_runner_error(self, model_name: str, message: str, output_buffer: list): print(f"Llama Runner for {model_name} error: {message}") @@ -262,6 +398,7 @@ def on_llama_runner_error(self, model_name: str, message: str, output_buffer: li logging.debug(f"Runner {model_name} errored while startup Future was pending.") self._runner_startup_futures[model_name].set_exception(RuntimeError(f"Runner for {model_name} errored during startup: {message}")) + def customEvent(self, event: QEvent): # Handle custom stopped event from LlamaRunnerThread if event.type() == RunnerStoppedEvent.EVENT_TYPE: @@ -277,6 +414,7 @@ def customEvent(self, event: QEvent): else: super().customEvent(event) + @Slot(str, int) def on_llama_runner_port_ready_and_emit(self, model_name: str, port: int): print(f"Llama Runner for {model_name} ready on port {port}.") @@ -296,6 +434,7 @@ def on_llama_runner_port_ready_and_emit(self, model_name: str, port: int): logging.info(f"Set current running model: {model_name}") self.runner_port_ready_for_proxy.emit(model_name, port) + @Slot() def check_runner_statuses(self): for model_name, thread in list(self.llama_runner_threads.items()): @@ -311,4 +450,4 @@ def check_runner_statuses(self): if app_instance and parent_object: app_instance.postEvent(parent_object, parent_event) else: - logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.") \ No newline at end of file + logging.warning(f"Could not post parent event for {model_name} (check_runner_statuses): App/Parent None.") diff --git a/llama_runner/lmstudio_proxy_thread.py b/llama_runner/lmstudio_proxy_thread.py index 5ffd7d4..3ae0c7a 100644 --- a/llama_runner/lmstudio_proxy_thread.py +++ b/llama_runner/lmstudio_proxy_thread.py @@ -20,11 +20,23 @@ from llama_runner import gguf_metadata # Import the new metadata module from llama_runner.config_loader import calculate_system_fingerprint +from llama_runner.whisper_cpp_runner import WhisperServer +from io import BytesIO +from fastapi import UploadFile as FastAPIUploadFile +from fastapi.middleware.cors import CORSMiddleware + # Configure logging (already done in main.py for configurable levels) # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # --- Create our own FastAPI app instance --- app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) # --- End create app instance --- @@ -974,6 +986,99 @@ async def _v1_embeddings_handler(request: Request): logging.error(f"Error in /v1/embeddings handler: {e}\n{traceback.format_exc()}") return JSONResponse(content={"error": {"message": f"Internal server error: {e}", "type": "internal_error"}}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) +@app.post("/v1/audio/transcriptions") +async def openai_speech_to_text(request: Request): + """Function to convert speech to text using whisper.cpp""" + try: + # Retrieve callback function to notify request start + request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request + form = await request.form() + # Extract uploaded audio file from form + file = form.get("file") + # Read the content of the uploaded file into bytes + contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data + model = str(form.get("model")) + # Retrieve audio configuration from app state + audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started + request_runner_start_callback(model, True) + + # Initialize WhisperServer with audio configuration and model + whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription on the WAV audio file + result = whisper_server.transcribe_audio(audio_file_path) + # Return transcription result as JSON response + return JSONResponse(content=result) + + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + + except Exception as e: + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + + +@app.post("/v1/audio/translations") +async def openai_speech_to_text_translate(request: Request): + """Function to convert speech to text using whisper.cpp with translation""" + try: + # Retrieve callback function to notify request start + request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request + form = await request.form() + # Extract uploaded audio file from form + file = form.get("file") + # Read the content of the uploaded file into bytes + contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data + model = str(form.get("model")) + + # Save the current audio config to restore later + prev_audio_config = request.app.state.audio_config + # Remove "language" parameter if it exists for this model's config + if request.app.state.audio_config["models"][model]["parameters"].get("language", False): + del request.app.state.audio_config["models"][model]["parameters"]["language"] + # Enable translation mode by setting "translate" parameter + request.app.state.audio_config["models"][model]["parameters"]["translate"] = "" + # Retrieve updated audio configuration + audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started + request_runner_start_callback(model, True) + # Initialize WhisperServer with updated audio configuration and model + whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription with translation on the WAV audio file + result = whisper_server.transcribe_audio(audio_file_path) + + # Restore previous audio config after request processed + request.app.state.audio_config = prev_audio_config + + # Return transcription and translation result as JSON response + return JSONResponse(content=result) + + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + + except Exception as e: + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + + logging.info("Updated dynamic routing handlers for /v1/chat/completions, /v1/completions, /v1/embeddings to support conditional streaming.") @@ -999,7 +1104,8 @@ class FastAPIProxyThread(QThread): # Renamed class def __init__(self, all_models_config: Dict[str, Dict[str, Any]], # Renamed models_config - runtimes_config: Dict[str, Dict[str, Any]], # Added runtimes_config + runtimes_config: Dict[str, Dict[str, Any]], # Added runtimes_config, + audio_config, is_model_running_callback: Callable[[str], bool], get_runner_port_callback: Callable[[str], Optional[int]], request_runner_start_callback: Callable[[str], asyncio.Future], # Callback now returns Future @@ -1009,6 +1115,7 @@ def __init__(self, super().__init__() self.all_models_config = all_models_config # Store all_models_config self.runtimes_config = runtimes_config # Store runtimes_config + self.audio_config = audio_config self.is_model_running_callback = is_model_running_callback self.get_runner_port_callback = get_runner_port_callback self.request_runner_start_callback = request_runner_start_callback # Store the callback @@ -1128,6 +1235,11 @@ async def run_async(self): app.state.request_runner_start_callback = self.request_runner_start_callback # Pass the new callback app.state.prompt_logging_enabled = self.prompt_logging_enabled # Set prompt logging flag on state app.state.prompts_logger = self.prompts_logger # Set prompts logger on state + + # Audio global variables + app.state.audio_config = self.audio_config + app.state.running_whisper = None + # Extract metadata for all models and store it in app.state.models_metadata # Note: get_all_models_lmstudio_format expects the main models config (all_models_config) app.state.models_metadata = gguf_metadata.get_all_models_lmstudio_format( diff --git a/llama_runner/main_window.py b/llama_runner/main_window.py index afb18e2..93f7296 100644 --- a/llama_runner/main_window.py +++ b/llama_runner/main_window.py @@ -37,7 +37,9 @@ def __init__(self): # Load settings from config with defaults self.prompt_logging_enabled = self.config.get('logging', {}).get('prompt_logging_enabled', False) self.llama_runtimes = self.config.get("llama-runtimes", {}) + self.audio_config = self.config.get("audio") self.default_runtime = self.config.get("default_runtime", "llama-server") + self.audio_models = self.audio_config.get("models", {}) self.models = self.config.get("models", {}) self.concurrent_runners_limit = self.config.get("concurrentRunners", 1) if not isinstance(self.concurrent_runners_limit, int) or self.concurrent_runners_limit < 1: @@ -63,7 +65,11 @@ def __init__(self): self.model_metadata_cache[model_name] = metadata else: logging.warning(f"Model '{model_name}' has no 'model_path' in config. Skipping metadata caching.") - + for model_name, model_config in self.audio_models.items(): + model_path = model_config.get("model_path") + if model_path: + logging.warning(f"Model '{model_name}' has no 'model_path' in config. Skipping metadata caching.") + self.fastapi_proxy_thread: Optional[FastAPIProxyThread] = None self.ollama_proxy_thread: Optional[OllamaProxyThread] = None @@ -114,6 +120,14 @@ def __init__(self): self.model_status_widgets[model_name] = status_widget status_widget.start_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.request_runner_start(name)) status_widget.stop_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.stop_llama_runner(name)) + for model_name in self.audio_models.keys(): + self.model_list_widget.addItem(model_name) + #model_metadata = self.model_metadata_cache.get(model_name) + status_widget = ModelStatusWidget(model_name) + self.model_status_stack.addWidget(status_widget) + self.model_status_widgets[model_name] = status_widget + status_widget.start_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.request_runner_start(name, True)) + status_widget.stop_button.clicked.connect(lambda checked, name=model_name: self.llama_runner_manager.stop_whisper_server(name)) self.model_list_widget.currentItemChanged.connect(self.on_model_selection_changed) @@ -136,13 +150,14 @@ def __init__(self): self.llama_runner_manager = LlamaRunnerManager( models=self.models, llama_runtimes=self.llama_runtimes, + audio_config=self.audio_config, default_runtime=self.default_runtime, model_status_widgets=self.model_status_widgets, # runner_port_ready_for_proxy and runner_stopped_for_proxy are now owned by LlamaRunnerManager parent=self, ) self.llama_runner_manager.set_concurrent_runners_limit(self.concurrent_runners_limit) - + # --- Start the FastAPI Proxy (for LM Studio) automatically if enabled --- if self.lmstudio_proxy_enabled: self.start_fastapi_proxy() @@ -248,6 +263,7 @@ def on_model_selection_changed(self, current_item, previous_item): self.model_status_stack.setCurrentWidget(self.no_model_selected_widget) else: self.model_status_stack.setCurrentWidget(self.no_model_selected_widget) + # Runner management methods moved to LlamaRunnerManager @@ -264,6 +280,7 @@ def start_fastapi_proxy(self): self.fastapi_proxy_thread = FastAPIProxyThread( all_models_config=self.models, runtimes_config=self.llama_runtimes, + audio_config=self.audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, @@ -293,7 +310,9 @@ def start_ollama_proxy(self): self.ollama_proxy_thread = OllamaProxyThread( all_models_config=self.models, runtimes_config=self.llama_runtimes, + audio_config=self.audio_config, is_model_running_callback=self.llama_runner_manager.is_llama_runner_running, + is_model_whisper_running=self.llama_runner_manager.is_whisper_runner_running, get_runner_port_callback=self.llama_runner_manager.get_runner_port, request_runner_start_callback=self.llama_runner_manager.request_runner_start, prompt_logging_enabled=self.prompt_logging_enabled, diff --git a/llama_runner/ollama_proxy_thread.py b/llama_runner/ollama_proxy_thread.py index 0a75e55..e78a936 100644 --- a/llama_runner/ollama_proxy_thread.py +++ b/llama_runner/ollama_proxy_thread.py @@ -20,8 +20,20 @@ chatRequestFromOllama, chatResponseToOllama ) +from llama_runner.whisper_cpp_runner import WhisperServer +from io import BytesIO +from fastapi import UploadFile as FastAPIUploadFile +from fastapi.middleware.cors import CORSMiddleware + # --- Create our own FastAPI app instance --- app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) # --- End create app instance --- # Define standalone handlers that access state via app.state @@ -604,9 +616,6 @@ async def list_models(request: Request): raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing list models request") - -# --- End handlers for Ollama API endpoints --- - @app.post("/api/show") async def show_model_info(request: Request): """Handles Ollama /api/show requests.""" @@ -713,7 +722,6 @@ async def list_openai_models(request: Request): logging.error(f"Error handling /v1/models: {e}\n{traceback.format_exc()}") raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing list models request") -# --- End handlers for OpenAI compatible API endpoints (v1) --- @app.post("/v1/completions") async def openai_completions(request: Request): @@ -743,8 +751,8 @@ async def openai_chat_completions(request: Request): """Handles OpenAI /v1/chat/completions requests.""" try: request_body = await request.json() + print(request_body) # No conversion needed here, assuming the incoming request is already OpenAI format - async def chat_completion_response_stream(): # The dynamic router handles the forwarding and streaming async for chunk in _dynamic_route_runner_request_generator(request, target_path="/v1/chat/completions", request_body=request_body): @@ -790,6 +798,102 @@ async def openai_embeddings(request: Request): raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing embeddings request") +@app.post("/v1/audio/transcriptions") +async def openai_speech_to_text(request: Request): + """Function to convert speech to text using whisper.cpp""" + try: + # Retrieve callback function to notify request start + request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request + form = await request.form() + # Extract uploaded audio file from form + file = form.get("file") + # Read the content of the uploaded file into bytes + contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data + model = str(form.get("model")) + # Retrieve audio configuration from app state + audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started + request_runner_start_callback(model, True) + + # Initialize WhisperServer with audio configuration and model + whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription on the WAV audio file + result = whisper_server.transcribe_audio(audio_file_path) + # Return transcription result as JSON response + return JSONResponse(content=result) + + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + + except Exception as e: + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + + + +@app.post("/v1/audio/translations") +async def openai_speech_to_text_translate(request: Request): + """Function to convert speech to text using whisper.cpp with translation""" + try: + # Retrieve callback function to notify request start + request_runner_start_callback = request.app.state.request_runner_start_callback + # Parse form data from the incoming request + form = await request.form() + # Extract uploaded audio file from form + file = form.get("file") + # Read the content of the uploaded file into bytes + contents = await file.read() + # Create a FastAPI UploadFile object from bytes content, preserving filename + fastapi_file = FastAPIUploadFile(filename=file.filename, file=BytesIO(contents)) + # Extract model name from the form data + model = str(form.get("model")) + + # Save the current audio config to restore later + prev_audio_config = request.app.state.audio_config + # Remove "language" parameter if it exists for this model's config + if request.app.state.audio_config["models"][model]["parameters"].get("language", False): + del request.app.state.audio_config["models"][model]["parameters"]["language"] + # Enable translation mode by setting "translate" parameter + request.app.state.audio_config["models"][model]["parameters"]["translate"] = "" + # Retrieve updated audio configuration + audio_config = request.app.state.audio_config + + # Notify that processing of this model's request has started + request_runner_start_callback(model, True) + # Initialize WhisperServer with updated audio configuration and model + whisper_server = WhisperServer(audio_config, model) + # Convert the uploaded audio file to WAV format + audio_file_path = whisper_server.convert_to_wav(fastapi_file) + # Perform transcription with translation on the WAV audio file + result = whisper_server.transcribe_audio(audio_file_path) + + # Restore previous audio config after request processed + request.app.state.audio_config = prev_audio_config + + # Return transcription and translation result as JSON response + return JSONResponse(content=result) + + except json.JSONDecodeError: + # Handle cases where request body is not valid JSON + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON request body") + + except Exception as e: + # Log unexpected errors and return HTTP 500 error + logging.error(f"Error handling /v1/audio/transcriptions: {e}\n{traceback.format_exc()}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error processing transcription request") + +# --- End handlers for OpenAI compatible API endpoints (v1) --- + + class OllamaProxyThread(QThread): """ QThread to run the FastAPI proxy emulating the Ollama API in a separate thread. @@ -802,7 +906,9 @@ class OllamaProxyThread(QThread): def __init__(self, all_models_config: Dict[str, Dict[str, Any]], # Renamed models_config to all_models_config runtimes_config: Dict[str, Dict[str, Any]], # Renamed models_config to runtimes_config + audio_config, is_model_running_callback: Callable[[str], bool], + is_model_whisper_running, get_runner_port_callback: Callable[[str], Optional[int]], request_runner_start_callback: Callable[[str], asyncio.Future], prompt_logging_enabled: bool, # Add prompt logging flag @@ -810,8 +916,10 @@ def __init__(self, super().__init__() self.all_models_config = all_models_config # Store all_models_config self.runtimes_config = runtimes_config # Store runtimes_config + self.audio_config = audio_config self.is_model_running_callback = is_model_running_callback self.get_runner_port_callback = get_runner_port_callback + self.is_model_whisper_running = is_model_whisper_running self.request_runner_start_callback = request_runner_start_callback self.prompt_logging_enabled = prompt_logging_enabled # Store the flag self.prompts_logger = prompts_logger # Store the logger instance @@ -903,11 +1011,15 @@ async def run_async(self): app.state.all_models_config = self.all_models_config # Pass all_models_config app.state.runtimes_config = self.runtimes_config # Pass runtimes_config app.state.is_model_running_callback = self.is_model_running_callback + app.state.is_model_whisper_running = self.is_model_whisper_running app.state.get_runner_port_callback = self.get_runner_port_callback app.state.request_runner_start_callback = self.request_runner_start_callback app.state.prompt_logging_enabled = self.prompt_logging_enabled # Set prompt logging flag on state app.state.prompts_logger = self.prompts_logger # Set prompts logger on state - + + # Audio global variables + app.state.audio_config = self.audio_config + # Use port 11434 as required for Ollama emulation uvicorn_config = uvicorn.Config(app, host="127.0.0.1", port=11434, reload=False) self._uvicorn_server = uvicorn.Server(uvicorn_config) diff --git a/llama_runner/whisper_cpp_runner.py b/llama_runner/whisper_cpp_runner.py new file mode 100644 index 0000000..a77cb97 --- /dev/null +++ b/llama_runner/whisper_cpp_runner.py @@ -0,0 +1,161 @@ +import subprocess +import requests +import time +import logging +import os +from fastapi import UploadFile +from typing import Optional, Dict, Any, Union + + +class WhisperServer: + """ + Manage the startup of the whisper server and interaction with it. + """ + + def __init__(self, audio_config: Dict[str, Any], model_name: str): + """ + Initialize the server with audio configuration and model name. + + :param audio_config: audio config (normalized) + :param model_name: model name from audio_config['models'] + """ + self.audio_config = audio_config + self.model_name = model_name + + # Get model and runtime config + models = audio_config.get('models', {}) + model_conf = models.get(model_name, {}) + + self.runtime_name = model_conf.get('runtime', 'default') + runtimes = audio_config.get('runtimes', {}) + runtime_conf = runtimes.get(self.runtime_name, {}) + self.runtime_path = runtime_conf.get('runtime') + + if not self.runtime_path: + raise ValueError(f"Runtime path for '{self.runtime_name}' not defined in audio config.") + + self.model_path = model_conf.get("model_path") + if not self.model_path: + raise ValueError(f"Model path for '{self.model_name}' not defined in audio config.") + + # Compose launch command + self.cmd = [ + self.runtime_path, + '--model', self.model_path, + ] + + parameters = model_conf.get("parameters", {}) + if isinstance(parameters, dict): + for option, value in parameters.items(): + self.cmd.extend([f'--{option}', str(value)]) + + # Check if host and port exist, if not add with default values + default_host = 'localhost' + default_port = '9000' # string, since command list elements are strings + + if '--host' not in self.cmd: + self.cmd.extend(['--host', default_host]) + if '--port' not in self.cmd: + self.cmd.extend(['--port', default_port]) + + # Remove '' in self.cmd if exists + for i in range(len(self.cmd)): + if self.cmd[i] == '': + self.cmd.pop(i) + + + # Extract host and port from the command list + def get_cmd_param(cmd_list, param_name, default): + try: + idx = cmd_list.index(param_name) + return cmd_list[idx + 1] + except (ValueError, IndexError): + return default + + self.host = get_cmd_param(self.cmd, '--host', default_host) + self.port = get_cmd_param(self.cmd, '--port', default_port) + self.base_url = f'http://{self.host}:{self.port}' + + self.process: Optional[subprocess.Popen] = None + + def start_server(self, wait_seconds: float = 5.0) -> None: + """Start the whisper server with current parameters.""" + logging.info(f"Starting whisper-server with command: {' '.join(self.cmd)}") + self.process = subprocess.Popen(self.cmd) + print(f"Whisper-server started on {self.host}:{self.port} with model {self.model_name}") + time.sleep(wait_seconds) + + def stop_server(self) -> None: + """Stop the server if it is running.""" + if self.process and self.process.poll() is None: + self.process.terminate() + try: + self.process.wait(timeout=10) + except subprocess.TimeoutExpired: + self.process.kill() + self.process.wait() + print("Whisper-server stopped") + else: + print("Whisper-server is not running or already stopped.") + + def transcribe_audio(self, audio_path: str) -> Union[Dict[str, Any], None]: + """Send an audio file to the server for transcription and return the result.""" + url = f"{self.base_url}/inference" + data = {"response_format": "json"} + + try: + with open(audio_path, 'rb') as audio_file: + files = {'file': audio_file} + response = requests.post(url, files=files, data=data) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + logging.error(f"Error transcribing audio: {e}") + return None + + def convert_to_wav(self, input_file: UploadFile, output_path: Optional[str] = None) -> str: + """ + Convert incoming audio file to WAV (16kHz, mono, PCM s16le). + + :param input_file: Uploaded audio file + :param output_path: Path to save WAV file. Defaults to ~/.llama-runner/temp.wav + :return: Path to saved WAV file + """ + if output_path is None: + output_path = os.path.expanduser("~/.llama-runner/temp.wav") + + input_tmp_dir = os.path.dirname(output_path) + input_tmp_path = os.path.join(input_tmp_dir, "temp_input") + + os.makedirs(input_tmp_dir, exist_ok=True) + + try: + with open(input_tmp_path, "wb") as f: + f.write(input_file.file.read()) + finally: + input_file.file.close() + + cmd = [ + "ffmpeg", + "-y", + "-i", input_tmp_path, + "-ar", "16000", + "-ac", "1", + "-c:a", "pcm_s16le", + output_path + ] + + try: + subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except subprocess.CalledProcessError as e: + error_msg = e.stderr.decode(errors='ignore') + raise RuntimeError(f"Error during audio conversion: {error_msg}") + finally: + if os.path.exists(input_tmp_path): + os.remove(input_tmp_path) + + return output_path + + + def get_port(self): + return int(self.port)