rh-ai-quickstart · tpawlows · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025 · Dec 29, 2025
diff --git a/README.md b/README.md
@@ -62,6 +62,9 @@ This QuickStart allows users to explore the capabilities of RAG by:
 ### Minimum hardware requirements 
 - 1 GPU/HPU with 24GB of VRAM for the LLM, refer to the [chart below](#supported-models)
 - 1 GPU/HPU with 24GB of VRAM for the safety/shield model (optional)
+- Xeon deployments: 1 worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer, i.e. Emerald Rapids (EMR) or Granite Rapids (GNR)
+  - for example, AWS instance types m8i.8xlarge, m7i.8xlarge, or r8i.8xlarge
+  - vLLM requires a minimum of 16 vCPUs and 64 GB of RAM to run
 
 ### Minimum software requirements 
 - OpenShift Client CLI - [oc](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/cli_tools/openshift-cli-oc#installing-openshift-cli)
@@ -92,8 +95,8 @@ This QuickStart allows users to explore the capabilities of RAG by:
 | Function    | Model Name                             | Hardware    | AWS
 |-------------|----------------------------------------|-------------|-------------
 | Embedding   | `all-MiniLM-L6-v2`                     | CPU/GPU/HPU |
-| Generation  | `meta-llama/Llama-3.2-3B-Instruct`     | L4/HPU      | g6.2xlarge
-| Generation  | `meta-llama/Llama-3.1-8B-Instruct`     | L4/HPU      | g6.2xlarge
+| Generation  | `meta-llama/Llama-3.2-3B-Instruct`     | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge
+| Generation  | `meta-llama/Llama-3.1-8B-Instruct`     | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge
 | Generation  | `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge
 | Safety      | `meta-llama/Llama-Guard-3-8B`          | L4/HPU      | g6.2xlarge
 
@@ -220,6 +223,15 @@ To install on CPU nodes only:
 make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=cpu
 ```
 
+**Xeon Deployment Example:**
+To install on Xeon nodes only:
+
+```bash
+make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=xeon
+```
+- This assumes that all your worker nodes use Sapphire Rapids (SPR) or newer Intel Xeon processors.
+- If you have heterogeneous worker nodes, work with your cluster administrator to identify and taint the SPR-or-newer nodes, then set `LLM_TOLERATION` and `SAFETY_TOLERATION` to that taint key (as in the GPU and HPU deployments above) so the models can be scheduled on valid nodes.
+
 **Simplified Commands (No Tolerations Needed):**
 
 If you have no tainted nodes (all worker nodes have accelerators), you can use simplified commands:
@@ -233,6 +245,10 @@ make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-gu
 
 # CPU deployment
 make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=cpu
+
+# Xeon deployment
+make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=xeon
+
 ```
 
 When prompted, enter your **[Hugging Face Token](https://huggingface.co/settings/tokens)**.

diff --git a/deploy/helm/Makefile b/deploy/helm/Makefile
@@ -218,7 +218,7 @@ help: ## Show this help message
 	@echo -e "  SAFETY        - Enable specific safety model (e.g., llama-guard-3-8b)"
 	@echo -e "  LLM_TOLERATION - Set toleration for LLM model (e.g., nvidia.com/gpu)"
 	@echo -e "  SAFETY_TOLERATION - Set toleration for safety model"
-	@echo -e "  DEVICE        - Set device type: cpu, gpu, or hpu (default: gpu)"
+	@echo -e "  DEVICE        - Set device type: cpu, gpu, hpu or xeon (default: gpu)"
 	@echo -e "  HF_TOKEN      - Hugging Face token for model downloads"
 	@echo -e "  LLM_URL       - URL for remote LLM service"
 	@echo -e "  SAFETY_URL    - URL for remote safety service"

diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml
@@ -42,6 +42,7 @@ volumeMounts:
 # - Use DEVICE=cpu for CPU-only deployment
 # - Use DEVICE=gpu for NVIDIA GPU deployment (default)
 # - Use DEVICE=hpu for Intel Gaudi HPU deployment (requires Intel Gaudi drivers and setup)
+# - Use DEVICE=xeon for Intel Xeon CPU deployment (optimized for large CPU instances; works on Sapphire Rapids (SPR), Emerald Rapids (EMR), and Granite Rapids (GNR); requires at least 16 vCPUs and 64 GB of RAM to run efficiently)
 
 # global:
 #   models:
@@ -116,7 +117,30 @@ volumeMounts:
 #       - --distributed-executor-backend=mp
 #       - --dtype=auto
 #       - --max-model-len=8000
-
+#     # Example Xeon configurations:
+#     llama-3-2-3b-instruct:
+#       id: meta-llama/Llama-3.2-3B-Instruct
+#       enabled: true
+#       device: "xeon"
+#       args:
+#       - --max-model-len
+#       - "14336"
+#       - --max-num-seqs
+#       - "32"
+#     llama-3-1-8b-instruct:
+#       id: meta-llama/Llama-3.1-8B-Instruct
+#       enabled: true
+#       device: "xeon"
+#       args:
+#       - --max-model-len
+#       - "14336"
+#       - --max-num-seqs
+#       - "32"
+#       - --enable-auto-tool-choice
+#       - --chat-template
+#       - /chat-templates/tool_chat_template_llama3.2_json.jinja
+#       - --tool-call-parser
+#       - llama3_json
 global:
   models: {}
   mcp-servers: {}