diff --git a/app/main.py b/app/main.py index fd4b1f3..3b52cdd 100755 --- a/app/main.py +++ b/app/main.py @@ -27,16 +27,19 @@ # Parse command line arguments parser = argparse.ArgumentParser(description='FastAPI Model Serving Application') -parser.add_argument('--device', type=str, default='cuda', +# Defaults read from the environment so the same image runs unmodified across +# providers (Cloud Run injects PORT; RunPod/VM set DEVICE etc.). Explicit CLI +# flags still override these defaults. +parser.add_argument('--device', type=str, default=os.getenv('DEVICE', 'cuda'), help='Device to run the models on (e.g., cpu, cuda, mps)') -parser.add_argument('--host', type=str, default='0.0.0.0', +parser.add_argument('--host', type=str, default=os.getenv('HOST', '0.0.0.0'), help='Host to run the server on') -parser.add_argument('--port', type=int, default=8888, +parser.add_argument('--port', type=int, default=int(os.getenv('PORT', '6050')), help='Port to run the server on') -parser.add_argument('--reload', action='store_true', +parser.add_argument('--reload', action='store_true', help='Enable auto-reload') -parser.add_argument('--workers', type=int, default=1, - help='Number of worker processes') +parser.add_argument('--workers', type=int, default=int(os.getenv('WORKERS', '1')), + help='Number of worker processes (keep at 1 per GPU; scale via replicas)') args = parser.parse_args() if __name__ == "__main__": @@ -60,10 +63,14 @@ async def startup_event(): app.state.device = args.device try: - # Load model configuration + # Load model configuration. ${MODEL_BASE} in the config is expanded from + # the environment so the same config points at any model store — + # /datasets (VM mount), a provider volume, or an https:// object-store + # URL (URLs are fetched + cached by checkpoint_utils at load time). 
+ os.environ.setdefault('MODEL_BASE', '/datasets') config_path = os.path.join(os.path.dirname(__file__), 'model_config.json') with open(config_path, 'r') as f: - config = json.load(f) + config = json.loads(os.path.expandvars(f.read())) logger.info(f"Loading models on device: {args.device}") diff --git a/app/model_config.json b/app/model_config.json index 5905379..011d870 100755 --- a/app/model_config.json +++ b/app/model_config.json @@ -3,7 +3,7 @@ { "model_id": "msv3", "model_type": "yolo-ultralytics", - "model_path": "/datasets/detect.yolov11.msv3.pt", + "model_path": "${MODEL_BASE}/detect.yolov11.msv3.pt", "imgsz": 640, "conf": 0.5 }, @@ -18,19 +18,19 @@ "model_id": "miewid-msv4.1", "model_type": "miewid", "imgsz": 440, - "checkpoint_path": "/datasets/miew_id.msv4_1_main.bin", + "checkpoint_path": "${MODEL_BASE}/miew_id.msv4_1_main.bin", "version": 4.1 }, { "model_id": "miewid-trout", "model_type": "miewid", "imgsz": 440, - "checkpoint_path": "/datasets/miewid_trout.bin" + "checkpoint_path": "${MODEL_BASE}/miewid_trout.bin" }, { "model_id": "efficientnet-classifier", "model_type": "efficientnetv2", - "checkpoint_path": "/datasets/vplabeler-msv3.pt", + "checkpoint_path": "${MODEL_BASE}/vplabeler-msv3.pt", "img_size": 512, "threshold": 0.5 } diff --git a/app/models/yolo_ultralytics.py b/app/models/yolo_ultralytics.py index 2544a7e..a736867 100644 --- a/app/models/yolo_ultralytics.py +++ b/app/models/yolo_ultralytics.py @@ -3,6 +3,7 @@ from PIL import Image from ultralytics import YOLO from .base_model import BaseModel +from ..utils.checkpoint_utils import get_checkpoint_path import logging logger = logging.getLogger(__name__) @@ -24,8 +25,11 @@ def load(self, model_path: str, device: str, **kwargs) -> None: - imgsz: Default image size for inference - conf: Default confidence threshold """ - logger.info(f"Loading YOLO model from {model_path} on device {device}") - self.model = YOLO(model_path) + # Resolve URLs (and validate local paths) the same way the other model 
+ # types do, so the detector weight can live in any object store. + local_model_path = get_checkpoint_path(model_path) + logger.info(f"Loading YOLO model from {local_model_path} on device {device}") + self.model = YOLO(local_model_path) self.model.to(device) # Store model info diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 0000000..ed68a40 --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,76 @@ +# Deploying the ML detector service on on-demand GPU (provider-independent) + +This service is **stateless** and GPU-bound, which makes it a clean fit for +serverless / on-demand GPU platforms: you pay per-second only while a GPU is +actually processing, and the platform autoscales across many GPUs during bursts +and back down to a warm baseline (or zero) when idle. **The platform is your +load balancer** — you do not run nginx/HAProxy yourself. + +The goal of this directory is to stay **provider-independent**: one OCI image +(built from the repo's existing `docker/dockerfile`), serving plain HTTP, runs +identically on RunPod, Cloud Run, or a plain VM. Each provider gets only a thin +config file here. + +## The portability contract (what the container guarantees) + +The image makes zero provider-specific assumptions. It is configured entirely +through environment variables: + +| Env var | Default | Purpose | +|---|---|---| +| `PORT` | `6050` | HTTP port. Cloud Run injects this; RunPod/VM can set it. (`app/main.py`) | +| `HOST` | `0.0.0.0` | Bind address. | +| `DEVICE` | `cuda` | `cuda` / `cpu` / `mps`. | +| `WORKERS` | `1` | **Keep at 1 per GPU.** Scale with replicas, not workers (see below). | +| `MODEL_BASE` | `/datasets` | Prefix for model weights in `app/model_config.json`. Can be a filesystem path **or** an `https://` object-store prefix. | + +**Model storage is the one thing to decide per environment.** `MODEL_BASE` is +expanded into `model_config.json` at startup. Because every model loader (incl. 
+the YOLO detector) resolves paths through `checkpoint_utils.get_checkpoint_path`, +weights can be: + +- a **mounted volume** — `MODEL_BASE=/datasets` (VM) or `/runpod-volume/models` (RunPod network volume), or +- an **object-store URL** — `MODEL_BASE=https://storage.googleapis.com/your-bucket/models` (fetched + cached to `/tmp/checkpoints` at boot). + +The URL option is the most portable: identical config everywhere, no +provider-specific volume wiring. + +## Two load-balancing knobs (the same idea on every platform) + +1. **Concurrency per replica = 2** — matches the in-process semaphore + `MAX_CONCURRENT_PREDICTIONS` in `predict_router.py` / `pipeline_router.py`. + The platform sends the 3rd concurrent request to another GPU replica. +2. **min / max replicas** — `min=1` keeps a GPU warm (fast first response); + `max=N` is your burst ceiling. `min=0` = true scale-to-zero (cheapest, but + the first request after idle pays the cold start). + +> **Do not raise `WORKERS`.** Multiple uvicorn workers on one GPU each load a +> full copy of all 5 models into the same VRAM (OOM risk) while the GPU still +> executes serially — no throughput gain. One worker per GPU; scale via replicas. + +## Cold start (know this before choosing min=0) + +The service eagerly loads 5 models at startup (`startup_event`), and the health +check allows a 90s `start_period`. Real cold start = image pull + load all +models into VRAM (tens of seconds). Mitigations: keep `min`/`activeWorkers >= 1` +(chosen here), and put weights on fast storage near the GPU region. + +## Per-provider files + +- **Cloud Run** — `cloudrun/service.yaml` (declarative) or `cloudrun/deploy.sh` + (imperative). NVIDIA L4, `min-instances=1`, `concurrency=2`, `timeout=300`. +- **RunPod** — `runpod/endpoint.json`. HTTP **load-balancing** serverless + endpoint (not the queue/handler model, which would be RunPod-specific code), + `activeWorkers=1`, `concurrencyPerWorker=2`, network volume or URL for models. 
+ +Both consume the **same image**. To move providers, you rebuild nothing — you +just apply the other config file and point `MODEL_BASE` at that environment's +model store. + +## Quick local parity check + +```bash +# Runs the same way the platforms invoke it (env-driven, 1 worker, CPU): +PORT=6050 DEVICE=cpu WORKERS=1 MODEL_BASE=/datasets python3 -m app.main +curl -f http://localhost:6050/health +``` diff --git a/deploy/cloudrun/deploy.sh b/deploy/cloudrun/deploy.sh new file mode 100644 index 0000000..c7de4c5 --- /dev/null +++ b/deploy/cloudrun/deploy.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Imperative Cloud Run deploy (alternative to `gcloud run services replace service.yaml`). +# Builds the existing Dockerfile, pushes to Artifact Registry, deploys with an L4 GPU. +# +# Prereqs: gcloud auth, an Artifact Registry repo named "ml-service", and local Docker authorized for it (gcloud auth configure-docker "${REGION}-docker.pkg.dev"). +set -euo pipefail + +PROJECT_ID="${PROJECT_ID:?set PROJECT_ID}" +REGION="${REGION:-us-central1}" +BUCKET="${MODEL_BUCKET:?set MODEL_BUCKET (gs bucket holding model weights)}" +IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/ml-service/ml-detector:latest" + +# Build with the repo's existing Dockerfile (no Cloud-Run-specific image). `gcloud builds submit` has no -f flag, so build and push with Docker; separate statements so `set -e` aborts on a failed build. +docker build -f docker/dockerfile -t "${IMAGE}" . +docker push "${IMAGE}" 
+ +gcloud run deploy ml-detector \ + --project "${PROJECT_ID}" \ + --region "${REGION}" \ + --image "${IMAGE}" \ + --gpu 1 --gpu-type nvidia-l4 \ + --cpu 8 --memory 32Gi \ + --concurrency 2 \ + --min-instances 1 \ + --max-instances 10 \ + --no-cpu-throttling \ + --timeout 300 \ + --port 6050 \ + --set-env-vars "DEVICE=cuda,WORKERS=1,MODEL_BASE=https://storage.googleapis.com/${BUCKET}/models" + +# For true scale-to-zero (cheapest, cold-start on first hit): --min-instances 0 diff --git a/deploy/cloudrun/service.yaml b/deploy/cloudrun/service.yaml new file mode 100644 index 0000000..481f2f7 --- /dev/null +++ b/deploy/cloudrun/service.yaml @@ -0,0 +1,50 @@ +# Cloud Run service for the ML detector service (NVIDIA L4, scale-to-zero capable). +# +# Deploy: gcloud run services replace deploy/cloudrun/service.yaml --region=us-central1 +# Or use: deploy/cloudrun/deploy.sh (imperative equivalent) +# +# Portability note: this runs the SAME image as RunPod / the VM. The only +# Cloud-Run-specific surface is this file. The container reads PORT from the +# environment (Cloud Run injects it) and pulls models from ${MODEL_BASE}. +apiVersion: serving.knative.dev/v1 +kind: Service +metadata: + name: ml-detector + labels: + cloud.googleapis.com/location: us-central1 +spec: + template: + metadata: + annotations: + # --- Warm baseline + autoscale (your chosen "always fast" mode) --- + autoscaling.knative.dev/minScale: "1" # keep 1 GPU warm; set "0" for true scale-to-zero + autoscaling.knative.dev/maxScale: "10" # burst ceiling across GPUs + run.googleapis.com/cpu-throttling: "false" # don't throttle between requests (model stays resident) + run.googleapis.com/startup-cpu-boost: "true" + spec: + # Match the in-process semaphore (MAX_CONCURRENT_PREDICTIONS = 2). + # Cloud Run routes the 3rd concurrent request to another instance. 
+ containerConcurrency: 2 + timeoutSeconds: 300 # /pipeline runs long (detect -> classify -> extract -> orient) + containers: + - image: REGION-docker.pkg.dev/PROJECT_ID/ml-service/ml-detector:latest + ports: + - containerPort: 6050 # Cloud Run sets the PORT env var from this value + env: + # PORT is reserved on Cloud Run (derived from containerPort above); + # listing it under env makes the deploy fail, so it is not set here. + - name: DEVICE + value: "cuda" + - name: WORKERS + value: "1" # one worker per GPU; scale via instances, not workers + - name: MODEL_BASE + # Object-store prefix; weights are fetched + cached at startup. + # Use a GCS bucket fronted by https, or mount a GCS volume instead. + value: "https://storage.googleapis.com/YOUR_BUCKET/models" + resources: + limits: + cpu: "8" # L4 requires >=4 CPU; 8 recommended + memory: 32Gi # >=16Gi required; 32Gi recommended for 5 resident models + nvidia.com/gpu: "1" + nodeSelector: + run.googleapis.com/accelerator: nvidia-l4 diff --git a/deploy/runpod/endpoint.json b/deploy/runpod/endpoint.json new file mode 100644 index 0000000..2d9b3c7 --- /dev/null +++ b/deploy/runpod/endpoint.json @@ -0,0 +1,42 @@ +{ + "_comment": "RunPod Serverless endpoint config for the ML detector service. Use an HTTP 'Load Balancing' serverless endpoint so RunPod routes plain HTTP straight to the container on $PORT — the SAME image and HTTP interface as Cloud Run and the VM (no RunPod-specific handler code, which keeps you provider-independent). The classic queue-based serverless model would require a runpod.serverless handler wrapper and is intentionally avoided here.", + + "name": "ml-detector", + "endpointType": "load_balancer", + + "image": "YOUR_REGISTRY/ml-detector:latest", + "containerPort": 6050, + + "env": { + "PORT": "6050", + "DEVICE": "cuda", + "WORKERS": "1", + "_MODEL_BASE_comment": "Point at a network volume mount (e.g. /runpod-volume/models) OR an https:// object-store prefix. 
URLs are fetched + cached at startup by checkpoint_utils.", + "MODEL_BASE": "/runpod-volume/models" + }, + + "networkVolume": { + "_comment": "Attach a Network Volume holding the model weights; it mounts at /runpod-volume. Omit if MODEL_BASE is an https:// URL instead.", + "mountPath": "/runpod-volume" + }, + + "scaling": { + "_comment": "activeWorkers >= 1 keeps a GPU warm (your 'always fast' mode); FlashBoot keeps recently-idle workers hot. Set activeWorkers 0 for true scale-to-zero (cheapest, cold-start on first hit).", + "activeWorkers": 1, + "maxWorkers": 10, + "_concurrency_comment": "Concurrency per worker should match MAX_CONCURRENT_PREDICTIONS (2) in predict_router.py / pipeline_router.py.", + "concurrencyPerWorker": 2, + "idleTimeoutSeconds": 30 + }, + + "gpu": { + "_comment": "Pick by VRAM needed for 5 resident models; L4 / A5000 / A40 are cost-effective. Detection+ID does not need an H100.", + "types": ["NVIDIA L4", "NVIDIA RTX A5000", "NVIDIA A40"] + }, + + "healthcheck": { + "_comment": "Reuse the container's existing /health endpoint (GPU + torch + models-loaded checks).", + "path": "/health", + "port": 6050 + } +} diff --git a/docker/docker-compose.prod.yml b/docker/docker-compose.prod.yml index 79740e1..a1ba879 100644 --- a/docker/docker-compose.prod.yml +++ b/docker/docker-compose.prod.yml @@ -17,7 +17,9 @@ services: - ultralytics_config:/root/.config/Ultralytics environment: - PYTHONPATH=/app - command: python3 -m app.main --host 0.0.0.0 --port 6050 --device ${DEVICE:-cuda} --workers ${WORKERS:-4} + # One worker per GPU: extra workers each load a full copy of all models into + # the same VRAM (OOM risk) with no throughput gain since the GPU runs serially. + command: python3 -m app.main --host 0.0.0.0 --port 6050 --device ${DEVICE:-cuda} --workers ${WORKERS:-1} deploy: resources: reservations: