From 706d02354a0bae591c3ac96f3f355c0cabd228f0 Mon Sep 17 00:00:00 2001 From: Tomasz Pawlowski Date: Mon, 22 Dec 2025 11:11:55 +0100 Subject: [PATCH 1/5] Add example config of llama-3-2-3b-instruct for Xeon deployment --- deploy/helm/Makefile | 2 +- deploy/helm/rag/values.yaml | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/deploy/helm/Makefile b/deploy/helm/Makefile index 44f6b100..b80b6acc 100644 --- a/deploy/helm/Makefile +++ b/deploy/helm/Makefile @@ -218,7 +218,7 @@ help: ## Show this help message @echo -e " SAFETY - Enable specific safety model (e.g., llama-guard-3-8b)" @echo -e " LLM_TOLERATION - Set toleration for LLM model (e.g., nvidia.com/gpu)" @echo -e " SAFETY_TOLERATION - Set toleration for safety model" - @echo -e " DEVICE - Set device type: cpu, gpu, or hpu (default: gpu)" + @echo -e " DEVICE - Set device type: cpu, gpu, hpu or xeon (default: gpu)" @echo -e " HF_TOKEN - Hugging Face token for model downloads" @echo -e " LLM_URL - URL for remote LLM service" @echo -e " SAFETY_URL - URL for remote safety service" diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml index 744fe059..31e1256e 100644 --- a/deploy/helm/rag/values.yaml +++ b/deploy/helm/rag/values.yaml @@ -42,6 +42,7 @@ volumeMounts: # - Use DEVICE=cpu for CPU-only deployment # - Use DEVICE=gpu for NVIDIA GPU deployment (default) # - Use DEVICE=hpu for Intel Gaudi HPU deployment (requires Intel Gaudi drivers and setup) +# - Use DEVICE=xeon for Intel Xeon CPU deployment (optimized for large CPU instances, works on SPR/EMR/GNR, requires min 16vCPU and 32GB RAM to run efficiently) # global: # models: @@ -116,6 +117,22 @@ volumeMounts: # - --distributed-executor-backend=mp # - --dtype=auto # - --max-model-len=8000 +# Example of xeon configuration: +# llama-3-2-3b-instruct: +# id: meta-llama/Llama-3.2-3B-Instruct +# enabled: true +# device: "xeon" +# limits: +# cpu: "32" +# memory: 64Gi +# requests: +# cpu: "16" +# memory: 32Gi +# args: +# - 
--max-model-len +# - "14336" +# - --max-num-seqs +# - "32" global: models: {} From b752d00c4c09a833a90dd51ee14bab89e880c0fa Mon Sep 17 00:00:00 2001 From: Tomasz Pawlowski Date: Mon, 22 Dec 2025 12:03:35 +0100 Subject: [PATCH 2/5] Add Xeon section to README.md and update values min requirements --- README.md | 19 +++++++++++++++++-- deploy/helm/rag/values.yaml | 4 ++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 205dc220..7d29ae08 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,8 @@ This QuickStart allows users to explore the capabilities of RAG by: ### Minimum hardware requirements - 1 GPU/HPU with 24GB of VRAM for the LLM, refer to the [chart below](#supported-models) - 1 GPU/HPU with 24GB of VRAM for the safety/shield model (optional) +- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (EMR/GNR), e.g. r7i.8xlarge, m8i.8xlarge + - vLLM requires a minimum of 16 vCPUs and 64 GB of RAM to run ### Minimum software requirements - OpenShift Client CLI - [oc](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/cli_tools/openshift-cli-oc#installing-openshift-cli) @@ -92,8 +94,8 @@ This QuickStart allows users to explore the capabilities of RAG by: | Function | Model Name | Hardware | AWS |-------------|----------------------------------------|-------------|------------- | Embedding | `all-MiniLM-L6-v2` | CPU/GPU/HPU | -| Generation | `meta-llama/Llama-3.2-3B-Instruct` | L4/HPU | g6.2xlarge -| Generation | `meta-llama/Llama-3.1-8B-Instruct` | L4/HPU | g6.2xlarge +| Generation | `meta-llama/Llama-3.2-3B-Instruct` | L4/HPU
<br>Xeon | g6.2xlarge<br>m8i.8xlarge +| Generation | `meta-llama/Llama-3.1-8B-Instruct` | L4/HPU<br>Xeon | g6.2xlarge<br>
m8i.8xlarge | Generation | `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge | Safety | `meta-llama/Llama-Guard-3-8B` | L4/HPU | g6.2xlarge @@ -220,6 +222,15 @@ To install on CPU nodes only: make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=cpu ``` +**Xeon Deployment Example:** +To install on Xeon nodes only: + +```bash +make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct DEVICE=xeon +``` +- This assumes that all your worker nodes use Sapphire Rapids (SPR) or newer Intel Xeon processors. +- If you have heterogeneous worker nodes, work with your cluster administrator to identify SPR+ nodes and use taint keys, similar to the GPU and HPU deployments above, to set `LLM_TOLERATION` and `SAFETY_TOLERATION` to schedule on valid nodes. + **Simplified Commands (No Tolerations Needed):** If you have no tainted nodes (all worker nodes have accelerators), you can use simplified commands: @@ -233,6 +244,10 @@ make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-gu # CPU deployment make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=cpu + +# Xeon deployment +make install NAMESPACE=llama-stack-rag LLM=llama-3-2-3b-instruct SAFETY=llama-guard-3-8b DEVICE=xeon + ``` When prompted, enter your **[Hugging Face Token](https://huggingface.co/settings/tokens)**. 
diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml index 31e1256e..5e8c839c 100644 --- a/deploy/helm/rag/values.yaml +++ b/deploy/helm/rag/values.yaml @@ -117,7 +117,7 @@ volumeMounts: # - --distributed-executor-backend=mp # - --dtype=auto # - --max-model-len=8000 -# Example of xeon configuration: +# # Example Xeon configuration: # llama-3-2-3b-instruct: # id: meta-llama/Llama-3.2-3B-Instruct # enabled: true @@ -127,7 +127,7 @@ volumeMounts: # memory: 64Gi # requests: # cpu: "16" -# memory: 32Gi +# memory: 64Gi # args: # - --max-model-len # - "14336" From a1e105ce349afe3bd9225383ec2c82f4db084acf Mon Sep 17 00:00:00 2001 From: Tomasz Pawlowski Date: Mon, 22 Dec 2025 12:09:20 +0100 Subject: [PATCH 3/5] Minor README update --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7d29ae08..38e3c60f 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,8 @@ This QuickStart allows users to explore the capabilities of RAG by: ### Minimum hardware requirements - 1 GPU/HPU with 24GB of VRAM for the LLM, refer to the [chart below](#supported-models) - 1 GPU/HPU with 24GB of VRAM for the safety/shield model (optional) -- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (EMR/GNR), e.g. 
r7i.8xlarge, m8i.8xlarge +- Xeon deployments: one worker node with Intel Xeon processors, Sapphire Rapids (SPR) or newer (EMR/GNR) + - for example: m8i.8xlarge, m7i.8xlarge, r8i.8xlarge - vLLM requires a minimum of 16 vCPUs and 64 GB of RAM to run ### Minimum software requirements From 6692abc032c91c7e80aea8bf8e86c52b34d93fdf Mon Sep 17 00:00:00 2001 From: Tomasz Pawlowski Date: Mon, 29 Dec 2025 16:35:38 +0100 Subject: [PATCH 4/5] add llama-3-1-8b-instruct example for xeon --- deploy/helm/rag/values.yaml | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml index 5e8c839c..1a36c593 100644 --- a/deploy/helm/rag/values.yaml +++ b/deploy/helm/rag/values.yaml @@ -117,23 +117,30 @@ volumeMounts: # - --distributed-executor-backend=mp # - --dtype=auto # - --max-model-len=8000 -# # Example Xeon configuration: +# # Example Xeon configurations: # llama-3-2-3b-instruct: # id: meta-llama/Llama-3.2-3B-Instruct # enabled: true # device: "xeon" -# limits: -# cpu: "32" -# memory: 64Gi -# requests: -# cpu: "16" -# memory: 64Gi # args: # - --max-model-len # - "14336" # - --max-num-seqs # - "32" - +# llama-3-1-8b-instruct: +# id: meta-llama/Llama-3.1-8B-Instruct +# enabled: true +# device: "xeon" +# args: +# - --max-model-len +# - "14336" +# - --max-num-seqs +# - "32" +# - --enable-auto-tool-choice +# - --chat-template +# - /chat-templates/tool_chat_template_llama3.2_json.jinja +# - --tool-call-parser +# - llama3_json global: models: {} mcp-servers: {} From 70cd48534c4805386358ad6152a9ddd3bffde2b6 Mon Sep 17 00:00:00 2001 From: Tomasz Pawlowski Date: Thu, 8 Jan 2026 16:18:24 +0100 Subject: [PATCH 5/5] Split HW section in supported models table, add N/A for HPU --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 38e3c60f..cd5e732f 100644 --- a/README.md +++ b/README.md @@ -95,8 +95,8 @@ This QuickStart allows users to explore the 
capabilities of RAG by: | Function | Model Name | Hardware | AWS |-------------|----------------------------------------|-------------|------------- | Embedding | `all-MiniLM-L6-v2` | CPU/GPU/HPU | -| Generation | `meta-llama/Llama-3.2-3B-Instruct` | L4/HPU<br>Xeon | g6.2xlarge<br>m8i.8xlarge -| Generation | `meta-llama/Llama-3.1-8B-Instruct` | L4/HPU<br>Xeon | g6.2xlarge<br>m8i.8xlarge +| Generation | `meta-llama/Llama-3.2-3B-Instruct` | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge +| Generation | `meta-llama/Llama-3.1-8B-Instruct` | L4<br>HPU<br>Xeon | g6.2xlarge<br>N/A<br>m8i.8xlarge | Generation | `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge | Safety | `meta-llama/Llama-Guard-3-8B` | L4/HPU | g6.2xlarge