diff --git a/README.md b/README.md
index 205dc22..cd5e732 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,9 @@ This QuickStart allows users to explore the capabilities of RAG by:
### Minimum hardware requirements
- 1 GPU/HPU with 24GB of VRAM for the LLM, refer to the [chart below](#supported-models)
- 1 GPU/HPU with 24GB of VRAM for the safety/shield model (optional)
+- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (Emerald Rapids (EMR), Granite Rapids (GNR))
+  - for example: m8i.8xlarge, m7i.8xlarge, r8i.8xlarge
+  - vLLM requires a minimum of 16 vCPUs and 64 GB of RAM to run
### Minimum software requirements
- OpenShift Client CLI - [oc](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/cli_tools/openshift-cli-oc#installing-openshift-cli)
@@ -92,8 +95,8 @@ This QuickStart allows users to explore the capabilities of RAG by:
| Function | Model Name | Hardware | AWS
|-------------|----------------------------------------|-------------|-------------
| Embedding | `all-MiniLM-L6-v2` | CPU/GPU/HPU |
-| Generation | `meta-llama/Llama-3.2-3B-Instruct` | L4/HPU | g6.2xlarge
-| Generation | `meta-llama/Llama-3.1-8B-Instruct` | L4/HPU | g6.2xlarge
+| Generation | `meta-llama/Llama-3.2-3B-Instruct` | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge
+| Generation | `meta-llama/Llama-3.1-8B-Instruct` | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge
| Generation | `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge
| Safety | `meta-llama/Llama-Guard-3-8B` | L4/HPU | g6.2xlarge
@@ -220,6 +223,15 @@ To install on CPU nodes only:
make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=cpu
```
+**Xeon Deployment Example:**
+To install on Xeon nodes only:
+
+```bash
+make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=xeon
+```
+- This assumes that all your worker nodes use Sapphire Rapids (SPR) or newer Intel Xeon processors.
+- If you have heterogeneous worker nodes, work with your cluster administrator to identify SPR+ nodes and use taint keys, similar to the GPU and HPU deployments above, to set `LLM_TOLERATION` and `SAFETY_TOLERATION` to schedule on valid nodes.
+
**Simplified Commands (No Tolerations Needed):**
If you have no tainted nodes (all worker nodes have accelerators), you can use simplified commands:
@@ -233,6 +245,10 @@ make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-gu
# CPU deployment
make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=cpu
+
+# Xeon deployment
+make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=xeon
+
```
When prompted, enter your **[Hugging Face Token](https://huggingface.co/settings/tokens)**.
diff --git a/deploy/helm/Makefile b/deploy/helm/Makefile
index 44f6b10..b80b6ac 100644
--- a/deploy/helm/Makefile
+++ b/deploy/helm/Makefile
@@ -218,7 +218,7 @@ help: ## Show this help message
@echo -e " SAFETY - Enable specific safety model (e.g., llama-guard-3-8b)"
@echo -e " LLM_TOLERATION - Set toleration for LLM model (e.g., nvidia.com/gpu)"
@echo -e " SAFETY_TOLERATION - Set toleration for safety model"
- @echo -e " DEVICE - Set device type: cpu, gpu, or hpu (default: gpu)"
+ @echo -e " DEVICE - Set device type: cpu, gpu, hpu or xeon (default: gpu)"
@echo -e " HF_TOKEN - Hugging Face token for model downloads"
@echo -e " LLM_URL - URL for remote LLM service"
@echo -e " SAFETY_URL - URL for remote safety service"
diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml
index 744fe05..1a36c59 100644
--- a/deploy/helm/rag/values.yaml
+++ b/deploy/helm/rag/values.yaml
@@ -42,6 +42,7 @@ volumeMounts:
# - Use DEVICE=cpu for CPU-only deployment
# - Use DEVICE=gpu for NVIDIA GPU deployment (default)
# - Use DEVICE=hpu for Intel Gaudi HPU deployment (requires Intel Gaudi drivers and setup)
+# - Use DEVICE=xeon for Intel Xeon CPU deployment (optimized for large CPU instances; works on SPR/EMR/GNR; requires at least 16 vCPUs and 64 GB of RAM to run efficiently)
# global:
# models:
@@ -116,7 +117,30 @@ volumeMounts:
# - --distributed-executor-backend=mp
# - --dtype=auto
# - --max-model-len=8000
-
+# # Example Xeon configurations:
+# llama-3-2-3b-instruct:
+# id: meta-llama/Llama-3.2-3B-Instruct
+# enabled: true
+# device: "xeon"
+# args:
+# - --max-model-len
+# - "14336"
+# - --max-num-seqs
+# - "32"
+# llama-3-1-8b-instruct:
+# id: meta-llama/Llama-3.1-8B-Instruct
+# enabled: true
+# device: "xeon"
+# args:
+# - --max-model-len
+# - "14336"
+# - --max-num-seqs
+# - "32"
+# - --enable-auto-tool-choice
+# - --chat-template
+# - /chat-templates/tool_chat_template_llama3.2_json.jinja
+# - --tool-call-parser
+# - llama3_json
global:
models: {}
mcp-servers: {}