Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions deploy/helm/rag/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ spec:
- name: TAVILY_SEARCH_API_KEY
value: {{ (index .Values "llama-stack").secrets.TAVILY_SEARCH_API_KEY | quote }}
{{- end }}
{{- /*
pgvector connection settings for the container environment.
Emitted only when a `pgvector` section is present in the chart values; every
entry is read from `.Values.pgvector.secret` and rendered as a quoted string.
NOTE(review): PGVECTOR_PASSWORD is injected as a literal env value taken from
chart values; consider `valueFrom.secretKeyRef` if a Secret is available (confirm).
*/}}
{{- with .Values.pgvector }}
{{- $conn := .secret }}
- name: PGVECTOR_HOST
  value: {{ quote $conn.host }}
- name: PGVECTOR_PORT
  value: {{ quote $conn.port }}
- name: PGVECTOR_USER
  value: {{ quote $conn.user }}
- name: PGVECTOR_PASSWORD
  value: {{ quote $conn.password }}
- name: PGVECTOR_DB
  value: {{ quote $conn.dbname }}
{{- end }}
{{- if .Values.suggestedQuestions }}
- name: RAG_QUESTION_SUGGESTIONS
valueFrom:
Expand Down
91 changes: 91 additions & 0 deletions deploy/helm/rag/templates/embedding-warmup-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{{/*
Embedding Warmup Job

Runs as a Helm post-install/post-upgrade hook. The container first polls
LlamaStack until its model listing answers, then issues a real embedding
request until the response contains "embeddings". This prevents the race
condition where ingestion pipelines try to embed documents before the
embedding model is ready.

Values read from .Values.global.embeddingWarmup:
  enabled        render the Job unless explicitly set to false (default: enabled)
  model          embedding model id sent in the warmup request (default: all-MiniLM-L6-v2)
  maxRetries     attempts allowed for EACH of the two wait phases (default: 60)
  retryInterval  seconds slept between attempts (default: 5)

The enabled flag is compared against the string "false" instead of being piped
through `default true`: `default` treats a boolean false as empty, so the
piped form would always evaluate to true and the Job could never be disabled.
A missing embeddingWarmup map is replaced with an empty dict so the lookups
below fall back to their defaults rather than failing on a nil map.
*/}}
{{- $warmup := .Values.global.embeddingWarmup | default (dict) }}
{{- if ne (toString $warmup.enabled) "false" }}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "rag.fullname" . }}-embedding-warmup
  labels:
    {{- include "rag.labels" . | nindent 4 }}
    app.kubernetes.io/component: embedding-warmup
  annotations:
    # Run as a post-install/post-upgrade hook; the low weight orders it
    # ahead of hooks with higher weights.
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
spec:
  ttlSecondsAfterFinished: 300
  backoffLimit: 10
  template:
    metadata:
      labels:
        # NOTE(review): these are the chart's selector labels; if a Service in
        # this chart selects on them, it may also match this pod - confirm.
        {{- include "rag.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: embedding-warmup
    spec:
      restartPolicy: OnFailure
      containers:
        - name: warmup
          image: "image-registry.openshift-image-registry.svc:5000/openshift/tools:latest"
          imagePullPolicy: IfNotPresent
          env:
            - name: LLAMASTACK_URL
              value: "http://llamastack:8321"
            - name: EMBEDDING_MODEL
              value: {{ $warmup.model | default "all-MiniLM-L6-v2" | quote }}
            - name: MAX_RETRIES
              value: {{ $warmup.maxRetries | default "60" | quote }}
            - name: RETRY_INTERVAL
              value: {{ $warmup.retryInterval | default "5" | quote }}
          command:
            - /bin/bash
            - -c
            - |
              set -e
              echo "=== Embedding Model Warmup Job ==="
              echo "LlamaStack URL: $LLAMASTACK_URL"
              echo "Embedding Model: $EMBEDDING_MODEL"
              echo "Max Retries: $MAX_RETRIES"
              echo "Retry Interval: ${RETRY_INTERVAL}s"
              echo ""

              # First wait for LlamaStack to be available.
              # --max-time keeps a stalled connection from blocking the loop forever.
              echo "Step 1: Waiting for LlamaStack to be available..."
              retries=0
              until curl -sf --max-time 10 "$LLAMASTACK_URL/v1/models" > /dev/null 2>&1; do
                retries=$((retries + 1))
                if [ "$retries" -ge "$MAX_RETRIES" ]; then
                  echo "ERROR: LlamaStack not available after $MAX_RETRIES retries"
                  exit 1
                fi
                echo "  Waiting for LlamaStack... (attempt $retries/$MAX_RETRIES)"
                sleep "$RETRY_INTERVAL"
              done
              echo "  LlamaStack is available!"
              echo ""

              # Now warm up the embedding model by making an actual embedding request.
              # The attempt counts as successful only if the response body
              # contains "embeddings".
              echo "Step 2: Warming up embedding model..."
              retries=0
              until curl -sf -X POST "$LLAMASTACK_URL/v1/inference/embeddings" \
                -H "Content-Type: application/json" \
                -d "{\"model_id\": \"$EMBEDDING_MODEL\", \"contents\": [\"warmup test\"]}" \
                --max-time 30 \
                | grep -q "embeddings"; do
                retries=$((retries + 1))
                if [ "$retries" -ge "$MAX_RETRIES" ]; then
                  echo "ERROR: Embedding model not ready after $MAX_RETRIES retries"
                  exit 1
                fi
                echo "  Waiting for embedding model to load... (attempt $retries/$MAX_RETRIES)"
                sleep "$RETRY_INTERVAL"
              done
              echo "  Embedding model is ready!"
              echo ""

              echo "=== Warmup Complete ==="
              echo "The embedding model is now loaded and ready for ingestion pipelines."
{{- end }}

7 changes: 7 additions & 0 deletions deploy/helm/rag/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,13 @@ volumeMounts:
global:
  models: {}
  mcp-servers: {}
  # Embedding warmup configuration.
  # Consumed by templates/embedding-warmup-job.yaml, a hook Job that waits for
  # LlamaStack and then sends a test embedding request, so the embedding model
  # is loaded before ingestion pipelines run.
  embeddingWarmup:
    # Toggle read by the warmup Job template.
    enabled: true
    # Model id sent as "model_id" in the warmup embedding request.
    model: "all-MiniLM-L6-v2"
    # The retry budget applies separately to each wait phase (LlamaStack
    # reachability, then the embedding request): 60 * 5s = 5 minutes of sleep
    # per phase, plus the time spent in the requests themselves.
    maxRetries: 60  # Maximum number of attempts per wait phase
    retryInterval: 5  # Seconds slept between attempts

# Hugging Face Token for model downloads
llm-service:
Expand Down
Loading
Loading