From 706d02354a0bae591c3ac96f3f355c0cabd228f0 Mon Sep 17 00:00:00 2001
From: Tomasz Pawlowski <tomasz.pawlowski@intel.com>
Date: Mon, 22 Dec 2025 11:11:55 +0100
Subject: [PATCH 1/5] Add example config of llama-3-2-3b-instruct for Xeon
 deployment

---
 deploy/helm/Makefile        |  2 +-
 deploy/helm/rag/values.yaml | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/deploy/helm/Makefile b/deploy/helm/Makefile
index 44f6b100..b80b6acc 100644
--- a/deploy/helm/Makefile
+++ b/deploy/helm/Makefile
@@ -218,7 +218,7 @@ help: ## Show this help message
 	@echo -e "  SAFETY        - Enable specific safety model (e.g., llama-guard-3-8b)"
 	@echo -e "  LLM_TOLERATION - Set toleration for LLM model (e.g., nvidia.com/gpu)"
 	@echo -e "  SAFETY_TOLERATION - Set toleration for safety model"
-	@echo -e "  DEVICE        - Set device type: cpu, gpu, or hpu (default: gpu)"
+	@echo -e "  DEVICE        - Set device type: cpu, gpu, hpu, or xeon (default: gpu)"
 	@echo -e "  HF_TOKEN      - Hugging Face token for model downloads"
 	@echo -e "  LLM_URL       - URL for remote LLM service"
 	@echo -e "  SAFETY_URL    - URL for remote safety service"
diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml
index 744fe059..31e1256e 100644
--- a/deploy/helm/rag/values.yaml
+++ b/deploy/helm/rag/values.yaml
@@ -42,6 +42,7 @@ volumeMounts:
 # - Use DEVICE=cpu for CPU-only deployment
 # - Use DEVICE=gpu for NVIDIA GPU deployment (default)
 # - Use DEVICE=hpu for Intel Gaudi HPU deployment (requires Intel Gaudi drivers and setup)
+# - Use DEVICE=xeon for Intel Xeon CPU deployment (optimized for large CPU instances; works on SPR/EMR/GNR; requires at least 16 vCPUs and 64 GB of RAM to run efficiently)
 
 # global:
 #   models:
@@ -116,6 +117,22 @@ volumeMounts:
 #       - --distributed-executor-backend=mp
 #       - --dtype=auto
 #       - --max-model-len=8000
+#     Example of xeon configuration:
+#     llama-3-2-3b-instruct:
+#       id: meta-llama/Llama-3.2-3B-Instruct
+#       enabled: true
+#       device: "xeon"
+#       limits:
+#         cpu: "32"
+#         memory: 64Gi
+#       requests:
+#         cpu: "16"
+#         memory: 32Gi
+#       args:
+#       - --max-model-len
+#       - "14336"
+#       - --max-num-seqs
+#       - "32"
 
 global:
   models: {}

From b752d00c4c09a833a90dd51ee14bab89e880c0fa Mon Sep 17 00:00:00 2001
From: Tomasz Pawlowski <tomasz.pawlowski@intel.com>
Date: Mon, 22 Dec 2025 12:03:35 +0100
Subject: [PATCH 2/5] Add Xeon section to README.md and update values min
 requirements

---
 README.md                   | 19 +++++++++++++++++--
 deploy/helm/rag/values.yaml |  4 ++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 205dc220..7d29ae08 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,8 @@ This QuickStart allows users to explore the capabilities of RAG by:
 ### Minimum hardware requirements 
 - 1 GPU/HPU with 24GB of VRAM for the LLM, refer to the [chart below](#supported-models)
 - 1 GPU/HPU with 24GB of VRAM for the safety/shield model (optional)
+- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (EMR/GNR), e.g. r7i.8xlarge, m8i.8xlarge
+  - vLLM requires a minimum of 16 vCPUs and 64 GB of RAM to run
 
 ### Minimum software requirements 
 - OpenShift Client CLI - [oc](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/cli_tools/openshift-cli-oc#installing-openshift-cli)
@@ -92,8 +94,8 @@ This QuickStart allows users to explore the capabilities of RAG by:
 | Function    | Model Name                             | Hardware    | AWS
 |-------------|----------------------------------------|-------------|-------------
 | Embedding   | `all-MiniLM-L6-v2`                     | CPU/GPU/HPU |
-| Generation  | `meta-llama/Llama-3.2-3B-Instruct`     | L4/HPU      | g6.2xlarge
-| Generation  | `meta-llama/Llama-3.1-8B-Instruct`     | L4/HPU      | g6.2xlarge
+| Generation  | `meta-llama/Llama-3.2-3B-Instruct`     | L4/HPU<br>Xeon | g6.2xlarge<br>m8i.8xlarge
+| Generation  | `meta-llama/Llama-3.1-8B-Instruct`     | L4/HPU<br>Xeon | g6.2xlarge<br>m8i.8xlarge
 | Generation  | `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge
 | Safety      | `meta-llama/Llama-Guard-3-8B`          | L4/HPU      | g6.2xlarge
 
@@ -220,6 +222,15 @@ To install on CPU nodes only:
 make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=cpu
 ```
 
+**Xeon Deployment Example:**
+To install on Xeon nodes only:
+
+```bash
+make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=xeon
+```
+- This assumes that all your worker nodes use Sapphire Rapids (SPR) or newer Intel Xeon processors.
+- If you have heterogeneous worker nodes, work with your cluster administrator to identify SPR+ nodes and use taint keys, similar to the GPU and HPU deployments above, to set `LLM_TOLERATION` and `SAFETY_TOLERATION` to schedule on valid nodes.
+
 **Simplified Commands (No Tolerations Needed):**
 
 If you have no tainted nodes (all worker nodes have accelerators), you can use simplified commands:
@@ -233,6 +244,10 @@ make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-gu
 
 # CPU deployment
 make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=cpu
+
+# Xeon deployment
+make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=xeon
+
 ```
 
 When prompted, enter your **[Hugging Face Token](https://huggingface.co/settings/tokens)**.
diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml
index 31e1256e..5e8c839c 100644
--- a/deploy/helm/rag/values.yaml
+++ b/deploy/helm/rag/values.yaml
@@ -117,7 +117,7 @@ volumeMounts:
 #       - --distributed-executor-backend=mp
 #       - --dtype=auto
 #       - --max-model-len=8000
-#     Example of xeon configuration:
+#     # Example Xeon configuration:
 #     llama-3-2-3b-instruct:
 #       id: meta-llama/Llama-3.2-3B-Instruct
 #       enabled: true
@@ -127,7 +127,7 @@ volumeMounts:
 #         memory: 64Gi
 #       requests:
 #         cpu: "16"
-#         memory: 32Gi
+#         memory: 64Gi
 #       args:
 #       - --max-model-len
 #       - "14336"

From a1e105ce349afe3bd9225383ec2c82f4db084acf Mon Sep 17 00:00:00 2001
From: Tomasz Pawlowski <tomasz.pawlowski@intel.com>
Date: Mon, 22 Dec 2025 12:09:20 +0100
Subject: [PATCH 3/5] Minor README update

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7d29ae08..38e3c60f 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,8 @@ This QuickStart allows users to explore the capabilities of RAG by:
 ### Minimum hardware requirements 
 - 1 GPU/HPU with 24GB of VRAM for the LLM, refer to the [chart below](#supported-models)
 - 1 GPU/HPU with 24GB of VRAM for the safety/shield model (optional)
-- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (EMR/GNR), e.g. r7i.8xlarge, m8i.8xlarge
+- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (EMR/GNR)
+  - for example: m8i.8xlarge, m7i.8xlarge, r8i.8xlarge
   - vLLM requires a minimum of 16 vCPUs and 64 GB of RAM to run
 
 ### Minimum software requirements 

From 6692abc032c91c7e80aea8bf8e86c52b34d93fdf Mon Sep 17 00:00:00 2001
From: Tomasz Pawlowski <tomasz.pawlowski@intel.com>
Date: Mon, 29 Dec 2025 16:35:38 +0100
Subject: [PATCH 4/5] add llama-3-1-8b-instruct example for xeon

---
 deploy/helm/rag/values.yaml | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml
index 5e8c839c..1a36c593 100644
--- a/deploy/helm/rag/values.yaml
+++ b/deploy/helm/rag/values.yaml
@@ -117,23 +117,30 @@ volumeMounts:
 #       - --distributed-executor-backend=mp
 #       - --dtype=auto
 #       - --max-model-len=8000
-#     # Example Xeon configuration:
+#     # Example Xeon configurations:
 #     llama-3-2-3b-instruct:
 #       id: meta-llama/Llama-3.2-3B-Instruct
 #       enabled: true
 #       device: "xeon"
-#       limits:
-#         cpu: "32"
-#         memory: 64Gi
-#       requests:
-#         cpu: "16"
-#         memory: 64Gi
 #       args:
 #       - --max-model-len
 #       - "14336"
 #       - --max-num-seqs
 #       - "32"
-
+#     llama-3-1-8b-instruct:
+#       id: meta-llama/Llama-3.1-8B-Instruct
+#       enabled: true
+#       device: "xeon"
+#       args:
+#       - --max-model-len
+#       - "14336"
+#       - --max-num-seqs
+#       - "32"
+#       - --enable-auto-tool-choice
+#       - --chat-template
+#       - /chat-templates/tool_chat_template_llama3.2_json.jinja
+#       - --tool-call-parser
+#       - llama3_json
 global:
   models: {}
   mcp-servers: {}

From 70cd48534c4805386358ad6152a9ddd3bffde2b6 Mon Sep 17 00:00:00 2001
From: Tomasz Pawlowski <tomasz.pawlowski@intel.com>
Date: Thu, 8 Jan 2026 16:18:24 +0100
Subject: [PATCH 5/5] Split HW section in supported models table, add N/A for
 HPU

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 38e3c60f..cd5e732f 100644
--- a/README.md
+++ b/README.md
@@ -95,8 +95,8 @@ This QuickStart allows users to explore the capabilities of RAG by:
 | Function    | Model Name                             | Hardware    | AWS
 |-------------|----------------------------------------|-------------|-------------
 | Embedding   | `all-MiniLM-L6-v2`                     | CPU/GPU/HPU |
-| Generation  | `meta-llama/Llama-3.2-3B-Instruct`     | L4/HPU<br>Xeon | g6.2xlarge<br>m8i.8xlarge
-| Generation  | `meta-llama/Llama-3.1-8B-Instruct`     | L4/HPU<br>Xeon | g6.2xlarge<br>m8i.8xlarge
+| Generation  | `meta-llama/Llama-3.2-3B-Instruct`     | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge
+| Generation  | `meta-llama/Llama-3.1-8B-Instruct`     | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge
 | Generation  | `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge
 | Safety      | `meta-llama/Llama-Guard-3-8B`          | L4/HPU      | g6.2xlarge