WildMeOrg · JasonWildMe · Jun 1, 2026
diff --git a/app/main.py b/app/main.py
@@ -27,16 +27,19 @@
 
 # Parse command line arguments
 parser = argparse.ArgumentParser(description='FastAPI Model Serving Application')
-parser.add_argument('--device', type=str, default='cuda', 
+# Defaults read from the environment so the same image runs unmodified across
+# providers (Cloud Run injects PORT; RunPod/VM set DEVICE etc.). Explicit CLI
+# flags still override these defaults.
+parser.add_argument('--device', type=str, default=os.getenv('DEVICE', 'cuda'),
                    help='Device to run the models on (e.g., cpu, cuda, mps)')
-parser.add_argument('--host', type=str, default='0.0.0.0', 
+parser.add_argument('--host', type=str, default=os.getenv('HOST', '0.0.0.0'),
                    help='Host to run the server on')
-parser.add_argument('--port', type=int, default=8888, 
+parser.add_argument('--port', type=int, default=os.getenv('PORT', '6050'),
                    help='Port to run the server on')
-parser.add_argument('--reload', action='store_true', 
+parser.add_argument('--reload', action='store_true',
                    help='Enable auto-reload')
-parser.add_argument('--workers', type=int, default=1, 
-                   help='Number of worker processes')
+parser.add_argument('--workers', type=int, default=os.getenv('WORKERS', '1'),
+                   help='Number of worker processes (keep at 1 per GPU; scale via replicas)')
 args = parser.parse_args()
 
 if __name__ == "__main__":
@@ -60,10 +63,14 @@ async def startup_event():
     app.state.device = args.device
 
     try:
-        # Load model configuration
+        # Load model configuration. ${MODEL_BASE} in the config is expanded from
+        # the environment so the same config points at any model store —
+        # /datasets (VM mount), a provider volume, or an https:// object-store
+        # URL (URLs are fetched + cached by checkpoint_utils at load time).
+        os.environ.setdefault('MODEL_BASE', '/datasets')
         config_path = os.path.join(os.path.dirname(__file__), 'model_config.json')
         with open(config_path, 'r') as f:
-            config = json.load(f)
+            config = json.loads(os.path.expandvars(f.read()))
 
         logger.info(f"Loading models on device: {args.device}")
 

diff --git a/app/model_config.json b/app/model_config.json
@@ -3,7 +3,7 @@
 	{
 		"model_id": "msv3",
 		"model_type": "yolo-ultralytics",
-		"model_path": "/datasets/detect.yolov11.msv3.pt",
+		"model_path": "${MODEL_BASE}/detect.yolov11.msv3.pt",
 		"imgsz": 640,
 		"conf": 0.5
 	},
@@ -18,19 +18,19 @@
 	    "model_id": "miewid-msv4.1",
 	    "model_type": "miewid",
 	    "imgsz": 440,
-	    "checkpoint_path": "/datasets/miew_id.msv4_1_main.bin",
+	    "checkpoint_path": "${MODEL_BASE}/miew_id.msv4_1_main.bin",
 	    "version": 4.1
 	},
 	{
 	    "model_id": "miewid-trout",
 	    "model_type": "miewid",
 	    "imgsz": 440,
-	    "checkpoint_path": "/datasets/miewid_trout.bin"
+	    "checkpoint_path": "${MODEL_BASE}/miewid_trout.bin"
 	},
 	{
 	    "model_id": "efficientnet-classifier",
 	    "model_type": "efficientnetv2",
-	    "checkpoint_path": "/datasets/vplabeler-msv3.pt",
+	    "checkpoint_path": "${MODEL_BASE}/vplabeler-msv3.pt",
 	    "img_size": 512,
 	    "threshold": 0.5
 	}

diff --git a/app/models/yolo_ultralytics.py b/app/models/yolo_ultralytics.py
@@ -3,6 +3,7 @@
 from PIL import Image
 from ultralytics import YOLO
 from .base_model import BaseModel
+from ..utils.checkpoint_utils import get_checkpoint_path
 import logging
 
 logger = logging.getLogger(__name__)
@@ -24,8 +25,11 @@ def load(self, model_path: str, device: str, **kwargs) -> None:
                 - imgsz: Default image size for inference
                 - conf: Default confidence threshold
         """
-        logger.info(f"Loading YOLO model from {model_path} on device {device}")
-        self.model = YOLO(model_path)
+        # Resolve URLs (and validate local paths) the same way the other model
+        # types do, so the detector weight can live in any object store.
+        local_model_path = get_checkpoint_path(model_path)
+        logger.info(f"Loading YOLO model from {local_model_path} on device {device}")
+        self.model = YOLO(local_model_path)
         self.model.to(device)
 
         # Store model info

diff --git a/deploy/README.md b/deploy/README.md
@@ -0,0 +1,76 @@
+# Deploying the ML detector service on on-demand GPU (provider-independent)
+
+This service is **stateless** and GPU-bound, which makes it a clean fit for
+serverless / on-demand GPU platforms: you pay per-second only while a GPU is
+actually processing, and the platform autoscales across many GPUs during bursts
+and back down to a warm baseline (or zero) when idle. **The platform is your
+load balancer** — you do not run nginx/HAProxy yourself.
+
+The goal of this directory is to stay **provider-independent**: one OCI image
+(built from the repo's existing `docker/dockerfile`), serving plain HTTP, runs
+identically on RunPod, Cloud Run, or a plain VM. Each provider gets only a thin
+config file here.
+
+## The portability contract (what the container guarantees)
+
+The image makes zero provider-specific assumptions. It is configured entirely
+through environment variables:
+
+| Env var | Default | Purpose |
+|---|---|---|
+| `PORT` | `6050` | HTTP port. Cloud Run injects this; RunPod/VM can set it. (`app/main.py`) |
+| `HOST` | `0.0.0.0` | Bind address. |
+| `DEVICE` | `cuda` | `cuda` / `cpu` / `mps`. |
+| `WORKERS` | `1` | **Keep at 1 per GPU.** Scale with replicas, not workers (see below). |
+| `MODEL_BASE` | `/datasets` | Prefix for model weights in `app/model_config.json`. Can be a filesystem path **or** an `https://` object-store prefix. |
+
+**Model storage is the one thing to decide per environment.** The `${MODEL_BASE}`
+placeholders in `app/model_config.json` are expanded from the environment at startup.
+Because every model loader (incl. the YOLO detector) resolves paths through
+`checkpoint_utils.get_checkpoint_path`, weights can be:
+
+- a **mounted volume** — `MODEL_BASE=/datasets` (VM) or `/runpod-volume/models` (RunPod network volume), or
+- an **object-store URL** — `MODEL_BASE=https://storage.googleapis.com/your-bucket/models` (fetched + cached to `/tmp/checkpoints` at boot).
+
+The URL option is the most portable: identical config everywhere, no
+provider-specific volume wiring.
+
+## Two load-balancing knobs (the same idea on every platform)
+
+1. **Concurrency per replica = 2** — matches the in-process semaphore
+   `MAX_CONCURRENT_PREDICTIONS` in `predict_router.py` / `pipeline_router.py`.
+   The platform sends the 3rd concurrent request to another GPU replica.
+2. **min / max replicas** — `min=1` keeps a GPU warm (fast first response);
+   `max=N` is your burst ceiling. `min=0` = true scale-to-zero (cheapest, but
+   the first request after idle pays the cold start).
+
+> **Do not raise `WORKERS`.** Multiple uvicorn workers on one GPU each load a
+> full copy of all 5 models into the same VRAM (OOM risk) while the GPU still
+> executes serially — no throughput gain. One worker per GPU; scale via replicas.
+
+## Cold start (know this before choosing min=0)
+
+The service eagerly loads 5 models at startup (`startup_event`), and the health
+check allows a 90s `start_period`. Real cold start = image pull + weight download (when
+`MODEL_BASE` is a URL) + loading all models into VRAM (tens of seconds). Mitigations:
+keep `min`/`activeWorkers >= 1` (chosen here), and put weights on fast storage near the GPU region.
+
+## Per-provider files
+
+- **Cloud Run** — `cloudrun/service.yaml` (declarative) or `cloudrun/deploy.sh`
+  (imperative). NVIDIA L4, `min-instances=1`, `concurrency=2`, `timeout=300`.
+- **RunPod** — `runpod/endpoint.json`. HTTP **load-balancing** serverless
+  endpoint (not the queue/handler model, which would be RunPod-specific code),
+  `activeWorkers=1`, `concurrencyPerWorker=2`, network volume or URL for models.
+
+Both consume the **same image**. To move providers, you rebuild nothing — you
+just apply the other config file and point `MODEL_BASE` at that environment's
+model store.
+
+## Quick local parity check
+
+```bash
+# Runs the same way the platforms invoke it (env-driven, 1 worker, CPU):
+PORT=6050 DEVICE=cpu WORKERS=1 MODEL_BASE=/datasets python3 -m app.main
+curl -f http://localhost:6050/health
+```
diff --git a/deploy/cloudrun/deploy.sh b/deploy/cloudrun/deploy.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Imperative Cloud Run deploy (alternative to `gcloud run services replace service.yaml`).
+# Builds the existing Dockerfile with Cloud Build, pushes to Artifact Registry,
+# and deploys with an L4 GPU.
+#
+# Prereqs: gcloud auth + an Artifact Registry repo named "ml-service".
+set -euo pipefail
+
+PROJECT_ID="${PROJECT_ID:?set PROJECT_ID}"
+REGION="${REGION:-us-central1}"
+BUCKET="${MODEL_BUCKET:?set MODEL_BUCKET (gs bucket holding model weights)}"
+IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/ml-service/ml-detector:latest"
+
+# Build with the repo's existing Dockerfile (no Cloud-Run-specific image).
+# `gcloud builds submit --tag` only builds a Dockerfile at the source root and has
+# no -f flag, so pass a generated build config that points at docker/dockerfile.
+BUILD_DIR="$(mktemp -d)"
+trap 'rm -rf "${BUILD_DIR}"' EXIT
+cat > "${BUILD_DIR}/cloudbuild.yaml" <<EOF
+steps:
+  - name: gcr.io/cloud-builders/docker
+    args: ["build", "-f", "docker/dockerfile", "-t", "${IMAGE}", "."]
+images:
+  - "${IMAGE}"
+EOF
+gcloud builds submit --config "${BUILD_DIR}/cloudbuild.yaml" --project "${PROJECT_ID}" .
+
+# PORT is reserved on Cloud Run (injected from --port); setting it via
+# --set-env-vars is rejected, so only the app-specific variables are passed.
+gcloud run deploy ml-detector \
+  --project "${PROJECT_ID}" \
+  --region "${REGION}" \
+  --image "${IMAGE}" \
+  --gpu 1 --gpu-type nvidia-l4 \
+  --cpu 8 --memory 32Gi \
+  --concurrency 2 \
+  --min-instances 1 \
+  --max-instances 10 \
+  --no-cpu-throttling \
+  --timeout 300 \
+  --port 6050 \
+  --set-env-vars "DEVICE=cuda,WORKERS=1,MODEL_BASE=https://storage.googleapis.com/${BUCKET}/models"
+
+# For true scale-to-zero (cheapest, cold-start on first hit): --min-instances 0
diff --git a/deploy/cloudrun/service.yaml b/deploy/cloudrun/service.yaml
@@ -0,0 +1,50 @@
+# Cloud Run service for the ML detector service (NVIDIA L4, scale-to-zero capable).
+#
+# Deploy:   gcloud run services replace deploy/cloudrun/service.yaml --region=us-central1
+# Or use:   deploy/cloudrun/deploy.sh  (imperative equivalent)
+#
+# Portability note: this runs the SAME image as RunPod / the VM. The only
+# Cloud-Run-specific surface is this file. The container reads PORT from the
+# environment (Cloud Run injects it) and pulls models from ${MODEL_BASE}.
+apiVersion: serving.knative.dev/v1
+kind: Service
+metadata:
+  name: ml-detector
+  labels:
+    cloud.googleapis.com/location: us-central1
+spec:
+  template:
+    metadata:
+      annotations:
+        # --- Warm baseline + autoscale (your chosen "always fast" mode) ---
+        autoscaling.knative.dev/minScale: "1"     # keep 1 GPU warm; set "0" for true scale-to-zero
+        autoscaling.knative.dev/maxScale: "10"     # burst ceiling across GPUs
+        run.googleapis.com/cpu-throttling: "false" # don't throttle between requests (model stays resident)
+        run.googleapis.com/startup-cpu-boost: "true"
+    spec:
+      # Match the in-process semaphore (MAX_CONCURRENT_PREDICTIONS = 2).
+      # Cloud Run routes the 3rd concurrent request to another instance.
+      containerConcurrency: 2
+      timeoutSeconds: 300          # /pipeline runs long (detect -> classify -> extract -> orient)
+      containers:
+        - image: REGION-docker.pkg.dev/PROJECT_ID/ml-service/ml-detector:latest
+          ports:
+            - containerPort: 6050  # Cloud Run injects PORT=6050 from this value
+          env:
+            # PORT is reserved on Cloud Run: setting it here is rejected at
+            # deploy time. It is derived from containerPort above.
+            - name: DEVICE
+              value: "cuda"
+            - name: WORKERS
+              value: "1"           # one worker per GPU; scale via instances, not workers
+            - name: MODEL_BASE
+              # Object-store prefix; weights are fetched + cached at startup.
+              # Use a GCS bucket fronted by https, or mount a GCS volume (below).
+              value: "https://storage.googleapis.com/YOUR_BUCKET/models"
+          resources:
+            limits:
+              cpu: "8"             # L4 requires >=4 CPU; 8 recommended
+              memory: 32Gi         # >=16Gi required; 32Gi recommended for 5 resident models
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        run.googleapis.com/accelerator: nvidia-l4
diff --git a/deploy/runpod/endpoint.json b/deploy/runpod/endpoint.json
@@ -0,0 +1,42 @@
+{
+  "_comment": "RunPod Serverless endpoint config for the ML detector service. Use an HTTP 'Load Balancing' serverless endpoint so RunPod routes plain HTTP straight to the container on $PORT — the SAME image and HTTP interface as Cloud Run and the VM (no RunPod-specific handler code, which keeps you provider-independent). The classic queue-based serverless model would require a runpod.serverless handler wrapper and is intentionally avoided here.",
+
+  "name": "ml-detector",
+  "endpointType": "load_balancer",
+
+  "image": "YOUR_REGISTRY/ml-detector:latest",
+  "containerPort": 6050,
+
+  "env": {
+    "PORT": "6050",
+    "DEVICE": "cuda",
+    "WORKERS": "1",
+    "_MODEL_BASE_comment": "Point at a network volume mount (e.g. /runpod-volume/models) OR an https:// object-store prefix. URLs are fetched + cached at startup by checkpoint_utils.",
+    "MODEL_BASE": "/runpod-volume/models"
+  },
+
+  "networkVolume": {
+    "_comment": "Attach a Network Volume holding the model weights; it mounts at /runpod-volume. Omit if MODEL_BASE is an https:// URL instead.",
+    "mountPath": "/runpod-volume"
+  },
+
+  "scaling": {
+    "_comment": "activeWorkers >= 1 keeps a GPU warm (your 'always fast' mode); FlashBoot keeps recently-idle workers hot. Set activeWorkers 0 for true scale-to-zero (cheapest, cold-start on first hit).",
+    "activeWorkers": 1,
+    "maxWorkers": 10,
+    "_concurrency_comment": "Concurrency per worker should match MAX_CONCURRENT_PREDICTIONS (2) in predict_router.py / pipeline_router.py.",
+    "concurrencyPerWorker": 2,
+    "idleTimeoutSeconds": 30
+  },
+
+  "gpu": {
+    "_comment": "Pick by VRAM needed for 5 resident models; L4 / A5000 / A40 are cost-effective. Detection+ID does not need an H100.",
+    "types": ["NVIDIA L4", "NVIDIA RTX A5000", "NVIDIA A40"]
+  },
+
+  "healthcheck": {
+    "_comment": "Reuse the container's existing /health endpoint (GPU + torch + models-loaded checks).",
+    "path": "/health",
+    "port": 6050
+  }
+}
diff --git a/docker/docker-compose.prod.yml b/docker/docker-compose.prod.yml
@@ -17,7 +17,9 @@ services:
       - ultralytics_config:/root/.config/Ultralytics
     environment:
       - PYTHONPATH=/app
-    command: python3 -m app.main --host 0.0.0.0 --port 6050 --device ${DEVICE:-cuda} --workers ${WORKERS:-4}
+    # One worker per GPU: extra workers each load a full copy of all models into
+    # the same VRAM (OOM risk) with no throughput gain since the GPU runs serially.
+    command: python3 -m app.main --host 0.0.0.0 --port 6050 --device ${DEVICE:-cuda} --workers ${WORKERS:-1}
     deploy:
       resources:
         reservations: