---
# File: config.example.yml
# ollama-queue-proxy configuration
#
# Copy this file to config.yml and fill in your values.
# config.yml is in .gitignore — never commit it with real keys.
#
# WARNING: Default config has NO authentication.
# If exposing beyond localhost, set auth.enabled: true and configure API keys.
# The docker-compose.yml binds to 127.0.0.1 for this reason.
#
# All values can be overridden via environment variables using the OQP_ prefix
# and __ for nesting. Examples:
#   OQP_PROXY__PORT=11435
#   OQP_OLLAMA__HOSTS__0__URL=http://ollama:11434
#   OQP_AUTH__ENABLED=true
#   OQP_ROUTING__STRATEGY=model_aware
#   OQP_EMBEDDING_CACHE__ENABLED=true
#   OQP_KEEP_ALIVE__DEFAULT=5m
# Proxy listener and dispatch settings.
proxy:
  # 0.0.0.0 listens on all interfaces; see the authentication warning in the
  # file header before exposing this beyond localhost.
  host: "0.0.0.0"
  port: 11435
  # Concurrent requests dispatched to Ollama at once.
  # Set this to match Ollama's OLLAMA_NUM_PARALLEL (default 1 in Ollama;
  # the proxy default of 2 assumes you've set OLLAMA_NUM_PARALLEL ≥ 2).
  # Higher values increase throughput but also GPU memory pressure.
  max_concurrent: 2
  # false blocks /api/pull, /api/push, /api/delete, /api/create, /api/copy
  allow_model_management: false
  drain_timeout: 30        # seconds to wait for in-flight requests on shutdown
  max_request_body_mb: 50  # reject bodies larger than this before buffering
# Upstream Ollama hosts and the timing used when talking to them.
ollama:
  hosts:
    # If Ollama is running natively (not in Docker):
    #   Linux:        http://172.17.0.1:11434
    #   Mac/Windows:  http://host.docker.internal:11434
    # If Ollama is in the same Docker Compose stack, use its service name:
    #   http://ollama:11434
    - url: "http://ollama-primary:11434"
      name: "primary"
      # NEW v0.2.0: relative weight for weighted round-robin (default 1)
      weight: 1
      # NEW v0.2.0: seconds between /api/tags polls for routing table
      model_sync_interval: 30
    # - url: "http://ollama-secondary:11434"
    #   name: "secondary"
    #   weight: 1
    #   model_sync_interval: 30
  # NOTE(review): nesting was reconstructed from a flattened copy — confirm
  # that the two keys below belong directly under `ollama` (not per host).
  health_check_interval: 30  # seconds between recovery checks on unhealthy hosts
  request_timeout: 300       # seconds before upstream timeout
# Per-priority queue limits. Each priority (high / normal / low) has its own
# depth cap, staleness timeout, and watermark threshold.
queue:
  high:
    max_depth: 50
    max_wait: 120  # seconds before stale request is dropped
    # fire queue.high_watermark webhook at this % of max_depth
    high_watermark_pct: 80
  normal:
    max_depth: 100
    max_wait: 300
    high_watermark_pct: 80
  low:
    max_depth: 200
    max_wait: 600
    high_watermark_pct: 80
  # NOTE(review): nesting was reconstructed from a flattened copy — confirm
  # that overflow_status_code belongs directly under `queue`.
  overflow_status_code: 503  # 503 or 429
# Optional event notifications sent as HTTP POSTs.
webhooks:
  enabled: false
  # POST target for queue events; fire-and-forget (5s timeout).
  # Note: webhook payloads include client_id. If using a third-party target,
  # be aware that client identifiers will be shared externally.
  url: ""
  # Events to deliver.
  events:
    - queue.full
    - queue.high_watermark
    - queue.drained
    - host.unhealthy
    - host.recovered
# API-key authentication and auth-failure rate limiting.
auth:
  enabled: false  # set true to require API keys on all requests
  # WARNING: auth.enabled: true with no keys configured is a fatal startup error.
  keys: []
  # Example keys (v0.2.0 — includes new max_concurrent field).
  # To use them, replace `keys: []` above with `keys:` and uncomment:
  #   - key: "${ADMIN_API_KEY}"      # or hardcode, but use env vars for real deployments
  #     client_id: "admin"
  #     description: "Admin key — full access"
  #     max_priority: high
  #     management: true             # required to call /queue/pause, /drain, /flush
  #     max_concurrent: 0            # 0 = unlimited (subject to proxy.max_concurrent)
  #   - key: "${OPENWEBUI_API_KEY}"
  #     client_id: "openwebui"
  #     description: "Open WebUI — interactive"
  #     max_priority: high
  #     management: false
  #     max_concurrent: 0
  #   - key: "${MEMSEARCH_API_KEY}"
  #     client_id: "memsearch-watch"
  #     description: "Background embedding re-index jobs"
  #     max_priority: low
  #     management: false
  #     max_concurrent: 2            # cap batch client at 2 concurrent
  # NOTE(review): nesting was reconstructed from a flattened copy — confirm
  # that rate_limit belongs under `auth`.
  rate_limit:
    max_failures: 10    # consecutive auth failures per IP before 429
    window_seconds: 60  # rolling window for failure counting
# Log verbosity and output format.
logging:
  level: "info"
  format: "json"  # json or text
# ---------------------------------------------------------------------------
# NEW v0.2.0 — Client injection
# ---------------------------------------------------------------------------
# Port-based auth bypass for clients that can't send Bearer headers.
# Each injection listener binds to a separate port and injects a fixed client
# identity. The client doesn't need to send any Authorization header.
#
# SECURITY:
# - Injection ports default to 127.0.0.1 (loopback only).
# - Binding to non-loopback addresses requires allow_public_injection: true.
# - If allow_public_injection: true AND auth.enabled: false, the proxy emits a
# startup security warning — any host on the network can consume GPU time.
# - Authorization headers received on injection ports are stripped before
# forwarding to upstream — tokens are never relayed to Ollama.
#
# client_injection:
#   listeners:
#     - listen_port: 11436
#       inject_as: memsearch-watch   # must match an auth.keys[].client_id
#       bind: 127.0.0.1              # default: loopback only
#     - listen_port: 11437
#       inject_as: localllm
#   allow_public_injection: false    # must be true to bind to non-loopback
# ---------------------------------------------------------------------------
# NEW v0.2.0 — Model-aware routing
# ---------------------------------------------------------------------------
# Route requests to the Ollama host that already has the target model loaded.
# Polling uses ollama.hosts[].model_sync_interval (per host, default 30s).
# Weighted round-robin is deterministic: weight=2 means 2 requests per 1 for weight=1.
#
# Startup: probes all hosts via /api/tags once before accepting requests.
# Fails fast if no host responds.
#
# OQP_ROUTING__STRATEGY=model_aware
#
# routing:
#   strategy: model_aware    # model_aware | round_robin (default: round_robin)
#   fallback: any_healthy    # when no host has the model loaded: pick any healthy host
#   model_poll_timeout: 3    # seconds for each /api/tags poll
# ---------------------------------------------------------------------------
# NEW v0.2.0 — Embedding cache
# ---------------------------------------------------------------------------
# Hash-keyed Valkey (or Dragonfly drop-in) cache for /api/embed and /api/embeddings.
# Cache hits bypass the queue and upstream entirely.
#
# Backend: Valkey is recommended. Dragonfly is a supported drop-in (same RESP protocol).
# For a Valkey service in the same Docker Compose stack:
# backend: "redis://valkey:6379/0"
#
# Startup: if enabled: true, the proxy pings the backend. Startup aborts if unreachable.
# Runtime: RESP errors degrade gracefully — logged once/minute, bypass cache, no user failure.
#
# OQP_EMBEDDING_CACHE__ENABLED=true
# OQP_EMBEDDING_CACHE__BACKEND=redis://valkey:6379/0
#
# embedding_cache:
#   enabled: false
#   backend: "redis://valkey:6379/0"
#   ttl: 86400               # seconds (24 hours default)
#   max_entry_bytes: 32768   # skip caching responses larger than this (32 KB)
#   key_prefix: "oqp:embed:"
#   connect_timeout: 2       # seconds; startup aborts if unreachable
# ---------------------------------------------------------------------------
# NEW v0.2.0 — keep_alive defaulting
# ---------------------------------------------------------------------------
# Inject a keep_alive value into request bodies so Ollama doesn't unload models
# between bursty requests. Applies to: /api/generate, /api/chat, /api/embed, /api/embeddings.
#
# override: false — only inject if the client doesn't send keep_alive
# override: true — always replace the client's value with the proxy default
#
# OQP_KEEP_ALIVE__DEFAULT=5m
#
# keep_alive:
#   default: "5m"
#   override: false