Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,19 @@

# Parse command line arguments
parser = argparse.ArgumentParser(description='FastAPI Model Serving Application')
parser.add_argument('--device', type=str, default='cuda',
# Defaults read from the environment so the same image runs unmodified across
# providers (Cloud Run injects PORT; RunPod/VM set DEVICE etc.). Explicit CLI
# flags still override these defaults.
parser.add_argument('--device', type=str, default=os.getenv('DEVICE', 'cuda'),
help='Device to run the models on (e.g., cpu, cuda, mps)')
parser.add_argument('--host', type=str, default='0.0.0.0',
parser.add_argument('--host', type=str, default=os.getenv('HOST', '0.0.0.0'),
help='Host to run the server on')
parser.add_argument('--port', type=int, default=8888,
parser.add_argument('--port', type=int, default=int(os.getenv('PORT', '6050')),
help='Port to run the server on')
parser.add_argument('--reload', action='store_true',
parser.add_argument('--reload', action='store_true',
help='Enable auto-reload')
parser.add_argument('--workers', type=int, default=1,
help='Number of worker processes')
parser.add_argument('--workers', type=int, default=int(os.getenv('WORKERS', '1')),
help='Number of worker processes (keep at 1 per GPU; scale via replicas)')
args = parser.parse_args()

if __name__ == "__main__":
Expand All @@ -60,10 +63,14 @@ async def startup_event():
app.state.device = args.device

try:
# Load model configuration
# Load model configuration. ${MODEL_BASE} in the config is expanded from
# the environment so the same config points at any model store —
# /datasets (VM mount), a provider volume, or an https:// object-store
# URL (URLs are fetched + cached by checkpoint_utils at load time).
os.environ.setdefault('MODEL_BASE', '/datasets')
config_path = os.path.join(os.path.dirname(__file__), 'model_config.json')
with open(config_path, 'r') as f:
config = json.load(f)
config = json.loads(os.path.expandvars(f.read()))

logger.info(f"Loading models on device: {args.device}")

Expand Down
8 changes: 4 additions & 4 deletions app/model_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
{
"model_id": "msv3",
"model_type": "yolo-ultralytics",
"model_path": "/datasets/detect.yolov11.msv3.pt",
"model_path": "${MODEL_BASE}/detect.yolov11.msv3.pt",
"imgsz": 640,
"conf": 0.5
},
Expand All @@ -18,19 +18,19 @@
"model_id": "miewid-msv4.1",
"model_type": "miewid",
"imgsz": 440,
"checkpoint_path": "/datasets/miew_id.msv4_1_main.bin",
"checkpoint_path": "${MODEL_BASE}/miew_id.msv4_1_main.bin",
"version": 4.1
},
{
"model_id": "miewid-trout",
"model_type": "miewid",
"imgsz": 440,
"checkpoint_path": "/datasets/miewid_trout.bin"
"checkpoint_path": "${MODEL_BASE}/miewid_trout.bin"
},
{
"model_id": "efficientnet-classifier",
"model_type": "efficientnetv2",
"checkpoint_path": "/datasets/vplabeler-msv3.pt",
"checkpoint_path": "${MODEL_BASE}/vplabeler-msv3.pt",
"img_size": 512,
"threshold": 0.5
}
Expand Down
8 changes: 6 additions & 2 deletions app/models/yolo_ultralytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from PIL import Image
from ultralytics import YOLO
from .base_model import BaseModel
from ..utils.checkpoint_utils import get_checkpoint_path
import logging

logger = logging.getLogger(__name__)
Expand All @@ -24,8 +25,11 @@ def load(self, model_path: str, device: str, **kwargs) -> None:
- imgsz: Default image size for inference
- conf: Default confidence threshold
"""
logger.info(f"Loading YOLO model from {model_path} on device {device}")
self.model = YOLO(model_path)
# Resolve URLs (and validate local paths) the same way the other model
# types do, so the detector weight can live in any object store.
local_model_path = get_checkpoint_path(model_path)
logger.info(f"Loading YOLO model from {local_model_path} on device {device}")
self.model = YOLO(local_model_path)
self.model.to(device)

# Store model info
Expand Down
76 changes: 76 additions & 0 deletions deploy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Deploying the ML detector service on on-demand GPU (provider-independent)

This service is **stateless** and GPU-bound, which makes it a clean fit for
serverless / on-demand GPU platforms: you pay per-second only while a GPU is
actually processing, and the platform autoscales across many GPUs during bursts
and back down to a warm baseline (or zero) when idle. **The platform is your
load balancer** — you do not run nginx/HAProxy yourself.

The goal of this directory is to stay **provider-independent**: one OCI image
(built from the repo's existing `docker/dockerfile`), serving plain HTTP, runs
identically on RunPod, Cloud Run, or a plain VM. Each provider gets only a thin
config file here.

## The portability contract (what the container guarantees)

The image makes zero provider-specific assumptions. It is configured entirely
through environment variables:

| Env var | Default | Purpose |
|---|---|---|
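| `PORT` | `6050` | HTTP port. Cloud Run injects this from the service's container port (`PORT` is a reserved variable there, so do not set it explicitly); RunPod/VM can set it. (`app/main.py`) |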
| `HOST` | `0.0.0.0` | Bind address. |
| `DEVICE` | `cuda` | `cuda` / `cpu` / `mps`. |
| `WORKERS` | `1` | **Keep at 1 per GPU.** Scale with replicas, not workers (see below). |
| `MODEL_BASE` | `/datasets` | Prefix for model weights in `app/model_config.json`. Can be a filesystem path **or** an `https://` object-store prefix. |

**Model storage is the one thing to decide per environment.** `MODEL_BASE` is
expanded into `model_config.json` at startup. Because every model loader (incl.
the YOLO detector) resolves paths through `checkpoint_utils.get_checkpoint_path`,
weights can be:

- a **mounted volume** — `MODEL_BASE=/datasets` (VM) or `/runpod-volume/models` (RunPod network volume), or
- an **object-store URL** — `MODEL_BASE=https://storage.googleapis.com/your-bucket/models` (fetched + cached to `/tmp/checkpoints` at boot).

The URL option is the most portable: identical config everywhere, no
provider-specific volume wiring.

## Two load-balancing knobs (the same idea on every platform)

1. **Concurrency per replica = 2** — matches the in-process semaphore
`MAX_CONCURRENT_PREDICTIONS` in `predict_router.py` / `pipeline_router.py`.
The platform sends the 3rd concurrent request to another GPU replica.
2. **min / max replicas** — `min=1` keeps a GPU warm (fast first response);
`max=N` is your burst ceiling. `min=0` = true scale-to-zero (cheapest, but
the first request after idle pays the cold start).

> **Do not raise `WORKERS`.** Multiple uvicorn workers on one GPU each load a
> full copy of all 5 models into the same VRAM (OOM risk) while the GPU still
> executes serially — no throughput gain. One worker per GPU; scale via replicas.

## Cold start (know this before choosing min=0)

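The service eagerly loads 5 models at startup (`startup_event`), and the health
check allows a 90s `start_period`. Real cold start = image pull + load all
models into VRAM (tens of seconds). When `MODEL_BASE` is a URL, add the time to
download the weights: the `/tmp/checkpoints` cache lives inside the instance, so
every new replica fetches them again. Mitigations: keep
`min`/`activeWorkers >= 1` (chosen here), and put weights on fast storage near
the GPU region.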

## Per-provider files

- **Cloud Run** — `cloudrun/service.yaml` (declarative) or `cloudrun/deploy.sh`
(imperative). NVIDIA L4, `min-instances=1`, `concurrency=2`, `timeout=300`.
- **RunPod** — `runpod/endpoint.json`. HTTP **load-balancing** serverless
endpoint (not the queue/handler model, which would be RunPod-specific code),
`activeWorkers=1`, `concurrencyPerWorker=2`, network volume or URL for models.

Both consume the **same image**. To move providers, you rebuild nothing — you
just apply the other config file and point `MODEL_BASE` at that environment's
model store.

## Quick local parity check

```bash
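# Runs the same way the platforms invoke it (env-driven, 1 worker, CPU).
# The server stays in the foreground, so leave it running in this terminal:
PORT=6050 DEVICE=cpu WORKERS=1 MODEL_BASE=/datasets python3 -m app.main

# In a second terminal, once startup has finished:
curl -f http://localhost:6050/health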
```
30 changes: 30 additions & 0 deletions deploy/cloudrun/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Imperative Cloud Run deploy (alternative to `gcloud run services replace service.yaml`).
# Builds the existing Dockerfile, pushes to Artifact Registry, deploys with an L4 GPU.
#
# Prereqs: gcloud auth + an Artifact Registry repo named "ml-service".
#
# Required env: PROJECT_ID, MODEL_BUCKET (bucket holding the model weights).
# Optional env: REGION (default: us-central1).
# Run from the repository root so that docker/dockerfile resolves.
set -euo pipefail

PROJECT_ID="${PROJECT_ID:?set PROJECT_ID}"
REGION="${REGION:-us-central1}"
BUCKET="${MODEL_BUCKET:?set MODEL_BUCKET (gs bucket holding model weights)}"
IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/ml-service/ml-detector:latest"

# Build with the repo's existing Dockerfile (no Cloud-Run-specific image).
# `gcloud builds submit --tag` only builds a Dockerfile at the source root and
# has no -f/--file option, so the non-default path docker/dockerfile is passed
# to `docker build` through a throwaway Cloud Build config instead.
BUILD_DIR="$(mktemp -d)"
trap 'rm -rf "${BUILD_DIR}"' EXIT
cat > "${BUILD_DIR}/cloudbuild.yaml" <<EOF
steps:
  - name: gcr.io/cloud-builders/docker
    args: ["build", "-f", "docker/dockerfile", "-t", "${IMAGE}", "."]
images:
  - "${IMAGE}"
EOF
gcloud builds submit --project "${PROJECT_ID}" --config "${BUILD_DIR}/cloudbuild.yaml" .

# PORT is a reserved Cloud Run variable and is rejected in --set-env-vars;
# `--port 6050` is what makes Cloud Run inject PORT=6050 into the container.
gcloud run deploy ml-detector \
  --project "${PROJECT_ID}" \
  --region "${REGION}" \
  --image "${IMAGE}" \
  --gpu 1 --gpu-type nvidia-l4 \
  --cpu 8 --memory 32Gi \
  --concurrency 2 \
  --min-instances 1 \
  --max-instances 10 \
  --no-cpu-throttling \
  --timeout 300 \
  --port 6050 \
  --set-env-vars "DEVICE=cuda,WORKERS=1,MODEL_BASE=https://storage.googleapis.com/${BUCKET}/models"

# For true scale-to-zero (cheapest, cold-start on first hit): --min-instances 0
50 changes: 50 additions & 0 deletions deploy/cloudrun/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Cloud Run service for the ML detector service (NVIDIA L4, scale-to-zero capable).
#
# Deploy: gcloud run services replace deploy/cloudrun/service.yaml --region=us-central1
# Or use: deploy/cloudrun/deploy.sh (imperative equivalent)
#
# Before deploying, replace the REGION, PROJECT_ID and YOUR_BUCKET placeholders.
#
# Portability note: this runs the SAME image as RunPod / the VM. The only
# Cloud-Run-specific surface is this file. The container reads PORT from the
# environment (Cloud Run injects it) and pulls models from ${MODEL_BASE}.
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: ml-detector
  labels:
    cloud.googleapis.com/location: us-central1
spec:
  template:
    metadata:
      annotations:
        # --- Warm baseline + autoscale (your chosen "always fast" mode) ---
        autoscaling.knative.dev/minScale: "1"       # keep 1 GPU warm; set "0" for true scale-to-zero
        autoscaling.knative.dev/maxScale: "10"      # burst ceiling across GPUs
        run.googleapis.com/cpu-throttling: "false"  # don't throttle between requests (model stays resident)
        run.googleapis.com/startup-cpu-boost: "true"
    spec:
      # Match the in-process semaphore (MAX_CONCURRENT_PREDICTIONS = 2).
      # Cloud Run routes the 3rd concurrent request to another instance.
      containerConcurrency: 2
      timeoutSeconds: 300  # /pipeline runs long (detect -> classify -> extract -> orient)
      containers:
        - image: REGION-docker.pkg.dev/PROJECT_ID/ml-service/ml-detector:latest
          ports:
            # Cloud Run injects PORT with this value. PORT is a reserved
            # variable and must NOT be listed under env, or the deploy is rejected.
            - containerPort: 6050
          env:
            - name: DEVICE
              value: "cuda"
            - name: WORKERS
              value: "1"  # one worker per GPU; scale via instances, not workers
            - name: MODEL_BASE
              # Object-store prefix; weights are fetched + cached at startup.
              # Use a GCS bucket fronted by https, or mount a GCS volume and
              # point MODEL_BASE at its mount path instead.
              value: "https://storage.googleapis.com/YOUR_BUCKET/models"
          resources:
            limits:
              cpu: "8"        # L4 requires >=4 CPU; 8 recommended
              memory: 32Gi    # >=16Gi required; 32Gi recommended for 5 resident models
              nvidia.com/gpu: "1"
      nodeSelector:
        run.googleapis.com/accelerator: nvidia-l4
42 changes: 42 additions & 0 deletions deploy/runpod/endpoint.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"_comment": "RunPod Serverless endpoint config for the ML detector service. Use an HTTP 'Load Balancing' serverless endpoint so RunPod routes plain HTTP straight to the container on $PORT — the SAME image and HTTP interface as Cloud Run and the VM (no RunPod-specific handler code, which keeps you provider-independent). The classic queue-based serverless model would require a runpod.serverless handler wrapper and is intentionally avoided here.",

"name": "ml-detector",
"endpointType": "load_balancer",

"image": "YOUR_REGISTRY/ml-detector:latest",
"containerPort": 6050,

"env": {
"PORT": "6050",
"DEVICE": "cuda",
"WORKERS": "1",
"_MODEL_BASE_comment": "Point at a network volume mount (e.g. /runpod-volume/models) OR an https:// object-store prefix. URLs are fetched + cached at startup by checkpoint_utils.",
"MODEL_BASE": "/runpod-volume/models"
},

"networkVolume": {
"_comment": "Attach a Network Volume holding the model weights; it mounts at /runpod-volume. Omit if MODEL_BASE is an https:// URL instead.",
"mountPath": "/runpod-volume"
},

"scaling": {
"_comment": "activeWorkers >= 1 keeps a GPU warm (your 'always fast' mode); FlashBoot keeps recently-idle workers hot. Set activeWorkers 0 for true scale-to-zero (cheapest, cold-start on first hit).",
"activeWorkers": 1,
"maxWorkers": 10,
"_concurrency_comment": "Concurrency per worker should match MAX_CONCURRENT_PREDICTIONS (2) in predict_router.py / pipeline_router.py.",
"concurrencyPerWorker": 2,
"idleTimeoutSeconds": 30
},

"gpu": {
"_comment": "Pick by VRAM needed for 5 resident models; L4 / A5000 / A40 are cost-effective. Detection+ID does not need an H100.",
"types": ["NVIDIA L4", "NVIDIA RTX A5000", "NVIDIA A40"]
},

"healthcheck": {
"_comment": "Reuse the container's existing /health endpoint (GPU + torch + models-loaded checks).",
"path": "/health",
"port": 6050
}
}
4 changes: 3 additions & 1 deletion docker/docker-compose.prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ services:
- ultralytics_config:/root/.config/Ultralytics
environment:
- PYTHONPATH=/app
command: python3 -m app.main --host 0.0.0.0 --port 6050 --device ${DEVICE:-cuda} --workers ${WORKERS:-4}
# One worker per GPU: extra workers each load a full copy of all models into
# the same VRAM (OOM risk) with no throughput gain since the GPU runs serially.
command: python3 -m app.main --host 0.0.0.0 --port 6050 --device ${DEVICE:-cuda} --workers ${WORKERS:-1}
deploy:
resources:
reservations:
Expand Down