rh-ai-quickstart · sauagarwa · Feb 25, 2026 · Feb 18, 2026 · mhdawson · Feb 24, 2026
diff --git a/deploy/helm/Makefile b/deploy/helm/Makefile
@@ -181,6 +181,8 @@ helm_llm_service_args = \
 helm_llama_stack_args = \
     $(if $(LLM),--set global.models.$(LLM).enabled=true,) \
     $(if $(SAFETY),--set global.models.$(SAFETY).enabled=true,) \
+    $(if $(LLM_ID),--set global.models.$(LLM).id='$(LLM_ID)',) \
+    $(if $(SAFETY_ID),--set global.models.$(SAFETY).id='$(SAFETY_ID)',) \
     $(if $(LLM_URL),--set global.models.$(LLM).url='$(LLM_URL)',) \
     $(if $(SAFETY_URL),--set global.models.$(SAFETY).url='$(SAFETY_URL)',) \
     $(if $(LLM_API_TOKEN),--set global.models.$(LLM).apiToken='$(LLM_API_TOKEN)',) \
@@ -232,6 +234,8 @@ help: ## Show this help message
 	@echo -e "$(GREEN)Command-Line Parameters (override values file):$(NC)"
 	@echo -e "  LLM           - Enable specific LLM model (e.g., llama-3-2-3b-instruct)"
 	@echo -e "  SAFETY        - Enable specific safety model (e.g., llama-guard-3-8b)"
+	@echo -e "  LLM_ID        - Model ID for LLM (required for remote models)"
+	@echo -e "  SAFETY_ID     - Model ID for safety model (required for remote models)"
 	@echo -e "  LLM_TOLERATION - Set toleration for LLM model (e.g., nvidia.com/gpu)"
 	@echo -e "  SAFETY_TOLERATION - Set toleration for safety model"
 	@echo -e "  DEVICE        - Set device type: cpu, gpu, hpu or xeon (default: gpu)"
@@ -493,6 +497,10 @@ install: ## Install the RAG deployment
 			HELM_ARGS="$$HELM_ARGS --set global.models.$(LLM).tolerations[0].effect=NoSchedule"; \
 			HELM_ARGS="$$HELM_ARGS --set global.models.$(LLM).tolerations[0].operator=Exists"; \
 		fi; \
+		if [ -n "$(LLM_ID)" ]; then \
+			echo -e "$(BLUE)[INFO]$(NC) Setting LLM ID: $(LLM_ID)"; \
+			HELM_ARGS="$$HELM_ARGS --set global.models.$(LLM).id=$(LLM_ID)"; \
+		fi; \
 		if [ -n "$(LLM_URL)" ]; then \
 			echo -e "$(BLUE)[INFO]$(NC) Setting LLM URL: $(LLM_URL)"; \
 			HELM_ARGS="$$HELM_ARGS --set global.models.$(LLM).url=$(LLM_URL)"; \
@@ -511,6 +519,10 @@ install: ## Install the RAG deployment
 			HELM_ARGS="$$HELM_ARGS --set global.models.$(SAFETY).tolerations[0].effect=NoSchedule"; \
 			HELM_ARGS="$$HELM_ARGS --set global.models.$(SAFETY).tolerations[0].operator=Exists"; \
 		fi; \
+		if [ -n "$(SAFETY_ID)" ]; then \
+			echo -e "$(BLUE)[INFO]$(NC) Setting SAFETY ID: $(SAFETY_ID)"; \
+			HELM_ARGS="$$HELM_ARGS --set global.models.$(SAFETY).id=$(SAFETY_ID)"; \
+		fi; \
 		if [ -n "$(SAFETY_URL)" ]; then \
 			echo -e "$(BLUE)[INFO]$(NC) Setting SAFETY URL: $(SAFETY_URL)"; \
 			HELM_ARGS="$$HELM_ARGS --set global.models.$(SAFETY).url='$(SAFETY_URL)'"; \

diff --git a/evaluations/.gitignore b/evaluations/.gitignore
@@ -0,0 +1,7 @@
+# Test artifacts - can be regenerated
+results/conversation_results/*.json
+results/conversation_results/screenshots/
+results/deep_eval_results/
+
+# Playwright browsers
+bin/
diff --git a/evaluations/README.md b/evaluations/README.md
@@ -0,0 +1,114 @@
+# RAG Evaluations
+
+Two-step process: first run UI tests to collect conversations, then run DeepEval to score them.
+
+## Setup
+
+First, deploy the RAG quickstart.
+
+Second, change into the evaluations directory and set up the environment:
+
+```bash
+cd evaluations
+uv sync
+```
+
+Playwright Chromium is auto-installed to `evaluations/bin/` on first run.
+
+## Running both steps together
+
+Set environment variables to configure the evaluator LLM, then run `evaluate.py`:
+
+```bash
+export LLM_URL=http://localhost:8321/v1
+export LLM_API_TOKEN=dummy
+export LLM_ID=llama-4-scout-17b-16e-w4a16
+
+uv run python evaluate.py
+```
+
+## Running steps individually
+
+**Step 1 — Collect conversations** (Playwright drives the RAG UI, saves responses + retrieved chunks to `results/conversation_results/`):
+
+```bash
+# Run all conversation tests
+uv run pytest test_conversations_ui.py
+
+# Run a category of tests (subdirectory under conversations/)
+uv run pytest test_conversations_ui.py --subdir=hr
+uv run pytest test_conversations_ui.py --subdir=legal
+
+# Run a specific test
+uv run pytest test_conversations_ui.py -k "hr_benefits"
+
+# Debug mode (visible browser, slow)
+uv run pytest test_conversations_ui.py -v --headed --slowmo=1000
+```
+
+**Step 2 — Score with DeepEval** (reads Step 1 output, evaluates with an LLM-as-judge):
+
+```bash
+uv run python deep_eval_rag.py
+```
+
+**Key options for Step 2:**
+
+| Flag | Default | Description |
+|---|---|---|
+| `--api-endpoint` | `$LLM_URL` | OpenAI-compatible endpoint |
+| `--api-key` | `$LLM_API_TOKEN` | API key |
+| `--stage` | `both` | `1` (conversational only), `2` (retrieval only), `both` |
+| `--max-concurrent` | `4` | Test cases evaluated simultaneously |
+| `--max-concurrent-calls` | `16` | Max concurrent LLM API calls |
+| `--debug` | off | Verbose HTTP/retry logging |
+
+Results are saved to `results/deep_eval_results/evaluation_results_<timestamp>.json`.
+
+## Validating the metrics
+
+The `bad-conversations/` directory contains conversations with known incorrect responses. Use these to verify that the metrics are correctly identifying bad outputs without needing to run UI tests:
+
+```bash
+uv run python evaluate.py --check
+```
+
+## Metrics
+
+**Stage 1 — Conversational** (per conversation):
+These metrics validate the answer from the agent against "idealized" retrieval
+chunks from the source documents that we've defined as part of the test case.
+They test how well the agent generated the answer from the chunks it received. The
+agent may do a good job despite retrieval being poor.
+- **Response Accuracy** — claims in response are supported by retrieved context
+- **Response Completeness** — response covers key facts from retrieved context
+- **Answer Relevance** — response addresses the actual question asked
+
+**Stage 2 — Retrieval** (per turn with `expected_rag_content`):
+These metrics validate how good the RAG retrieval was. To keep the runtime and the LLM
+capability needed at a reasonable level, we truncate to the first 10 actual RAG chunks
+returned for these checks. A failing chunk count metric shows you when more chunks were
+returned than the number we truncate to.
+- **Chunk Count / Deduplication** — basic retrieval hygiene checks
+- **Chunk Alignment** — actual chunks cover same content as expected chunks
+- **Contextual Precision** — relevant chunks ranked above irrelevant ones
+- **Contextual Relevancy** — each retrieved chunk is relevant to the query
+- **Faithfulness** — response claims are grounded in retrieved context
+
+## Directory structure
+
+```
+evaluations/
+├── conversations/               # Test definitions (JSON)
+├── results/
+│   ├── conversation_results/    # Output from Step 1
+│   └── deep_eval_results/       # Output from Step 2
+├── evaluate.py                  # Runs Step 1 + Step 2 together
+├── test_conversations_ui.py     # Step 1 — Playwright runner
+├── deep_eval_rag.py             # Step 2 — DeepEval scorer
+├── get_rag_metrics.py           # Metric definitions
+└── helpers/
+    ├── custom_llm.py            # OpenAI-compatible LLM wrapper
+    ├── endpoint.py              # RAG UI endpoint detection
+    └── token_counter.py         # Token usage tracking
+```
diff --git a/evaluations/bad-conversations/README.md b/evaluations/bad-conversations/README.md
@@ -0,0 +1,26 @@
+# Known-Bad Conversations
+
+Test fixtures that deliberately trigger metric failures, used to validate that the evaluation framework correctly detects problems.
+
+Run from the `evaluations/` directory (where `evaluate.py` lives) with:
+```bash
+python evaluate.py --check
+```
+
+## Files
+
+| File | Primary Target | Expected Secondary Failures |
+|------|---------------|---------------------------|
+| `fail_response_accuracy_hallucination.json` | Response Accuracy (hallucination) | Chunk Alignment (actual has extra health retreat chunk not in expected) |
+| `fail_response_accuracy_contradiction.json` | Response Accuracy (contradiction) | None (all chunk facts covered, one detail contradicted: "fragile and prone to chipping" vs "Indestructible") |
+| `fail_response_completeness.json` | Response Completeness | None (one accurate claim, just incomplete) |
+| `fail_factual_consistency.json` | Response Accuracy (contradiction) | None (uses correct terminology, only the number is wrong) |
+| `fail_answer_relevance.json` | Answer Relevance | None (accurate and complete per chunks, but answers plan features instead of the cost question asked) |
+| `fail_chunk_count.json` | ChunkCountMetric | None (correct answer, just too many chunks retrieved) |
+| `fail_chunk_deduplication.json` | ChunkDeduplicationMetric | None |
+| `fail_chunk_alignment.json` | Chunk Alignment | Faithfulness (response grounded in expected chunks; actual chunks are different health content) |
+| `fail_contextual_recall.json` | ContextualRecall | Chunk Alignment (actual partial chunk ≠ full expected chunk) |
+| `fail_contextual_precision.json` | ContextualPrecision | None (all chunks are health-related so Contextual Relevancy passes; best chunk is ranked last so Precision fails) |
+| `fail_contextual_relevancy.json` | ContextualRelevancy | ContextualRecall (actual chunks don't cover expected output) |
+| `fail_faithfulness.json` | FaithfulnessMetric | None (Stage 1 passes — response matches expected chunks) |
+| `hr_benefits_test_fail.json` | Response Accuracy, Response Completeness | None (no actual_rag_content so Stage 2 does not run; response fabricates generic HR benefits and completely ignores the FantaCo-specific chunks) |
diff --git a/evaluations/bad-conversations/fail_answer_relevance.json b/evaluations/bad-conversations/fail_answer_relevance.json
@@ -0,0 +1,30 @@
+{
+  "metadata": {
+    "description": "KNOWN-BAD: User asks how much the health insurance costs per month. The chunks describe plan features (not cost). The assistant accurately and completely covers the plan features but never addresses the cost question. Should fail Answer Relevance; Sanity Check should pass (health insurance chunks for a health insurance question) and Response Completeness should pass (response covers all chunk content)."
+  },
+  "config": {
+    "mode": "direct",
+    "vector_dbs": ["hr-vector-db-v1-0"]
+  },
+  "conversation": [
+    {
+      "role": "user",
+      "content": "How much does FantaCo's health insurance cost per month?",
+      "expected_output": "The retrieved documents describe the plan features but do not mention monthly cost or premium amounts.",
+      "expected_rag_content": {
+        "chunks": [
+          "The \"Fountain of Youth & General Awesomeness\" Healthcare Plan (Because Aging is So Last Century, and Frankly, a Bit Boring): Our comprehensive health, dental, and vision plan is so good, it's rumored to reverse male pattern baldness, grant the ability to see in the dark, and allow you to understand what your pet is really thinking. It includes: Unlimited Unicorn Tear Eye Drops: For sparkling vision, the ability to spot a typo from a mile away, and occasionally seeing rainbows where there are none. Dragon Scale Dental Implants: Indestructible, pearly white, they glow faintly in moonlight, and can be used to open particularly stubborn pickle jars. Griffin Feather Pillows & Comforters: For the most regenerative sleep imaginable. On-site Alchemist & Potion Master: For all your minor transmutation needs, bespoke wellness potions, and emergency glitter infusions. (Warning: Do not request the Philosopher's Stone. HR gets very cross, and the paperwork is a nightmare.)"
+        ]
+      },
+      "actual_rag_content": {
+        "chunks": [
+          "The \"Fountain of Youth & General Awesomeness\" Healthcare Plan (Because Aging is So Last Century, and Frankly, a Bit Boring): Our comprehensive health, dental, and vision plan is so good, it's rumored to reverse male pattern baldness, grant the ability to see in the dark, and allow you to understand what your pet is really thinking. It includes: Unlimited Unicorn Tear Eye Drops: For sparkling vision, the ability to spot a typo from a mile away, and occasionally seeing rainbows where there are none. Dragon Scale Dental Implants: Indestructible, pearly white, they glow faintly in moonlight, and can be used to open particularly stubborn pickle jars. Griffin Feather Pillows & Comforters: For the most regenerative sleep imaginable. On-site Alchemist & Potion Master: For all your minor transmutation needs, bespoke wellness potions, and emergency glitter infusions. (Warning: Do not request the Philosopher's Stone. HR gets very cross, and the paperwork is a nightmare.)"
+        ]
+      }
+    },
+    {
+      "role": "assistant",
+      "content": "FantaCo offers the \"Fountain of Youth & General Awesomeness\" Healthcare Plan. It includes Unlimited Unicorn Tear Eye Drops for sparkling vision, Dragon Scale Dental Implants that are indestructible and glow faintly in moonlight, Griffin Feather Pillows & Comforters for regenerative sleep, and an On-site Alchemist & Potion Master for wellness potions and emergency glitter infusions."
+    }
+  ]
+}
diff --git a/evaluations/bad-conversations/fail_chunk_alignment.json b/evaluations/bad-conversations/fail_chunk_alignment.json
@@ -0,0 +1,32 @@
+{
+  "metadata": {
+    "description": "KNOWN-BAD: actual_rag_content contains health-related chunks (EAP, prescription coverage, wellness check-up) that are all on-topic but completely different content from the expected health insurance chunk. Should fail Chunk Alignment (actual content does not match expected). Faithfulness is an expected secondary failure because the response (correctly grounded in expected chunks) references content not present in the actual chunks."
+  },
+  "config": {
+    "mode": "direct",
+    "vector_dbs": ["hr-vector-db-v1-0"]
+  },
+  "conversation": [
+    {
+      "role": "user",
+      "content": "What are the health insurance benefits offered?",
+      "expected_output": "FantaCo offers the \"Fountain of Youth & General Awesomeness\" Healthcare Plan with Unicorn Tear Eye Drops, Dragon Scale Dental Implants, Griffin Feather Pillows, and an On-site Alchemist.",
+      "expected_rag_content": {
+        "chunks": [
+          "The \"Fountain of Youth & General Awesomeness\" Healthcare Plan (Because Aging is So Last Century, and Frankly, a Bit Boring): Our comprehensive health, dental, and vision plan is so good, it's rumored to reverse male pattern baldness, grant the ability to see in the dark, and allow you to understand what your pet is really thinking. It includes: Unlimited Unicorn Tear Eye Drops: For sparkling vision, the ability to spot a typo from a mile away, and occasionally seeing rainbows where there are none. Dragon Scale Dental Implants: Indestructible, pearly white, they glow faintly in moonlight, and can be used to open particularly stubborn pickle jars. Griffin Feather Pillows & Comforters: For the most regenerative sleep imaginable. On-site Alchemist & Potion Master: For all your minor transmutation needs, bespoke wellness potions, and emergency glitter infusions. (Warning: Do not request the Philosopher's Stone. HR gets very cross, and the paperwork is a nightmare.)"
+        ]
+      },
+      "actual_rag_content": {
+        "chunks": [
+          "Employee Assistance Program (EAP): FantaCo provides all employees with access to confidential counseling services, including sessions with licensed therapists, goblin mediators, and certified emotional support centaurs. Available 24/7 via crystal ball or standard phone.",
+          "Prescription Potion Coverage: All medically necessary potions, elixirs, and tinctures are covered at 80% after meeting your annual cauldron deductible of 50 gold coins.",
+          "Annual Wellness Check-up: Each employee receives a complimentary annual health screening conducted by our resident physician and part-time sorcerer, Dr. Cornelius Willowmere, including a full aura assessment."
+        ]
+      }
+    },
+    {
+      "role": "assistant",
+      "content": "FantaCo offers the \"Fountain of Youth & General Awesomeness\" Healthcare Plan. It includes Unlimited Unicorn Tear Eye Drops for sparkling vision, Dragon Scale Dental Implants that are indestructible and glow faintly in moonlight, Griffin Feather Pillows & Comforters for regenerative sleep, and an On-site Alchemist & Potion Master for wellness potions and emergency glitter infusions."
+    }
+  ]
+}