rh-ai-quickstart · sauagarwa · Apr 13, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/deploy/helm/rag-values.yaml.example b/deploy/helm/rag-values.yaml.example
@@ -180,6 +180,17 @@ llama-stack:
     # OPENAI_API_KEY: "your_openai_key_here"
     # ANTHROPIC_API_KEY: "your_anthropic_key_here"
 
+  # File Processors Configuration
+  #
+  # Available providers:
+  #   inline::pypdf        - PDF extraction (lightweight, uses PyPDF)
+  #
+  fileProcessors:
+    enabled: true
+    providers:
+      - provider_id: pypdf
+        provider_type: inline::pypdf
+
 # Suggested Questions Configuration
 # These questions appear in the chat UI when users select a database
 # The key should match the vector_store_name (identifier) of the database

diff --git a/deploy/helm/rag/Chart.yaml b/deploy/helm/rag/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: rag
 description: A Helm chart for Kubernetes
 type: application
-version: 0.2.40
-appVersion: "0.2.40"
+version: 0.2.41
+appVersion: "0.2.41"
 
 dependencies:
   - name: pgvector

diff --git a/deploy/helm/rag/values.yaml b/deploy/helm/rag/values.yaml
@@ -3,7 +3,7 @@ replicaCount: 1
 image:
   repository: quay.io/rh-ai-quickstart/llamastack-dist-ui
   pullPolicy: Always
-  tag: 0.2.40
+  tag: 0.2.41
 
 service:
   type: ClusterIP
@@ -165,6 +165,19 @@ pgvector:
     host: pgvector
     port: "5432"
 
+  # Create a separate vector database for each ingestion pipeline
+  extraDatabases:
+    - name: hr_vector_db
+      vectordb: true
+    - name: legal_vector_db
+      vectordb: true
+    - name: sales_vector_db
+      vectordb: true
+    - name: procurement_vector_db
+      vectordb: true
+    - name: techsupport_vector_db
+      vectordb: true
+
     # Upload sample files to the minio bucket 
   sampleFileUpload:
     enabled: true
@@ -176,6 +189,12 @@ pgvector:
 
 llama-stack:
   enabled: true
+  fileProcessors:
+    enabled: true
+    providers:
+      - provider_id: pypdf
+        provider_type: inline::pypdf
+
   secrets:
     TAVILY_SEARCH_API_KEY: "Paste-your-key-here"
 

diff --git a/frontend/llama_stack_ui/distribution/ui/modules/local_extractors.py b/frontend/llama_stack_ui/distribution/ui/modules/local_extractors.py
@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import io
+import logging
+import os
+
+from docx import Document
+from openpyxl import load_workbook
+
+logger = logging.getLogger(__name__)
+
+LOCAL_SUPPORTED_EXTENSIONS = [".docx", ".xlsx"]
+PROVIDER_SUPPORTED_EXTENSIONS = [".txt", ".pdf", ".md"]
+
+
+def extract_text_from_docx(file) -> str:
+    """Return the plain text held in a .docx document.
+
+    Paragraph text is gathered first; table cell text is appended after it.
+
+    Args:
+        file: File-like object containing .docx data
+
+    Returns:
+        str: The collected text fragments joined by newlines
+    """
+    document = Document(file)
+    fragments = [paragraph.text for paragraph in document.paragraphs]
+
+    # Walk every table row by row and add each cell's text in order.
+    for table in document.tables:
+        for row in table.rows:
+            fragments += [cell.text for cell in row.cells]
+
+    return "\n".join(fragments)
+
+
+def extract_text_from_xlsx(file) -> str:
+    """Extract all text content from an .xlsx file.
+
+    Each sheet yields a "Sheet: <name>" header followed by tab-separated rows.
+
+    Args:
+        file: File-like object containing .xlsx data
+
+    Returns:
+        str: Extracted text with sheet headers and tab-separated row values
+    """
+    wb = load_workbook(file, read_only=True)
+    parts = []
+    try:
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            parts.append(f"Sheet: {sheet_name}")
+            for row in ws.iter_rows(values_only=True):
+                parts.append(
+                    "\t".join("" if cell is None else str(cell) for cell in row)
+                )
+    finally:
+        # read_only workbooks must be closed explicitly; do so even on error.
+        wb.close()
+    return "\n".join(parts)
+
+
+def extract_text(file, filename: str) -> str:
+    """Dispatch a locally supported file to its text extractor.
+
+    The extractor is chosen from the lower-cased extension of ``filename``.
+
+    Args:
+        file: File-like object with document data
+        filename: Original filename used to determine the file type
+
+    Returns:
+        str: Extracted plain text content
+
+    Raises:
+        ValueError: If the file extension is not locally supported
+    """
+    ext = os.path.splitext(filename)[1].lower()
+
+    # Guard first: anything other than .docx/.xlsx has no local extractor.
+    if ext not in (".docx", ".xlsx"):
+        raise ValueError(f"Unsupported file type for local extraction: {ext}")
+    if ext == ".docx":
+        return extract_text_from_docx(file)
+    return extract_text_from_xlsx(file)
+
+
+def create_text_file_from_extracted_content(
+    content: str, original_filename: str
+) -> io.BytesIO:
+    """Package extracted text as an in-memory .txt upload.
+
+    The returned BytesIO carries .name and .size attributes so that it can
+    be handed to the files.create API endpoint like an uploaded file.
+
+    Args:
+        content: Extracted plain text; encoded as UTF-8
+        original_filename: Source filename; its extension is swapped for .txt
+
+    Returns:
+        io.BytesIO: In-memory buffer holding the UTF-8 encoded text
+    """
+    stem, _ = os.path.splitext(original_filename)
+    payload = content.encode("utf-8")
+    buffer = io.BytesIO(payload)
+    buffer.size = len(payload)
+    buffer.name = f"{stem}.txt"
+    return buffer
diff --git a/frontend/llama_stack_ui/distribution/ui/modules/utils.py b/frontend/llama_stack_ui/distribution/ui/modules/utils.py
@@ -79,6 +79,7 @@ def strip_file_citations(text):
     """
     text = re.sub(r'file<[^>]+>', '', text)
     text = re.sub(r'<\|file-[^|]*\|>', '', text)
+    text = re.sub(r'<\|[0-9a-fA-F-]{8,}\|>', '', text)
     text = re.sub(r'【[^】]*†[^】]*】', '', text)
     text = re.sub(r'  +', ' ', text)
     return text
@@ -92,6 +93,7 @@ def strip_file_citations_streaming(text):
     """
     text = strip_file_citations(text)
     text = re.sub(r'<\|(?:f(?:i(?:l(?:e(?:-[^|]*)?)?)?)?)?\s*$', '', text)
+    text = re.sub(r'<\|[0-9a-fA-F-]*$', '', text)
     text = re.sub(r'\bfile<[^>]*$', '', text)
     text = re.sub(r'【[^】]*$', '', text)
     return text

diff --git a/frontend/llama_stack_ui/distribution/ui/page/playground/chat.py b/frontend/llama_stack_ui/distribution/ui/page/playground/chat.py
@@ -86,9 +86,21 @@ def fetch_models_and_tools():
 
     # Fetch models, excluding guardrail/shield models
     models = client.models.list()
+
+    def _get_model_id(model):
+        return getattr(model, "identifier", None) or model.id
+
+    def _get_model_type(model):
+        for attr in ("model_type", "api_model_type"):
+            val = getattr(model, attr, None)
+            if val is not None:
+                return val
+        meta = getattr(model, "custom_metadata", None) or {}
+        return meta.get("model_type")
+
     model_list = [
-        model.identifier for model in models
-        if model.api_model_type == "llm" and model.identifier not in shields_set
+        _get_model_id(model) for model in models
+        if _get_model_type(model) == "llm" and _get_model_id(model) not in shields_set
     ]
 
     # Fetch and categorize toolgroups

diff --git a/frontend/llama_stack_ui/distribution/ui/page/upload/upload.py b/frontend/llama_stack_ui/distribution/ui/page/upload/upload.py
@@ -11,6 +11,12 @@
 import streamlit as st
 
 from llama_stack_ui.distribution.ui.modules.api import llama_stack_api
+from llama_stack_ui.distribution.ui.modules.local_extractors import (
+    LOCAL_SUPPORTED_EXTENSIONS,
+    PROVIDER_SUPPORTED_EXTENSIONS,
+    create_text_file_from_extracted_content,
+    extract_text,
+)
 from llama_stack_ui.distribution.ui.modules.utils import get_vector_db_name
 
 
@@ -21,6 +27,7 @@ def _init_upload_page_session_state():
         "creation_message": "",
         "selected_vector_db": "",
         "newly_created_vdb": None,
+        "extraction_method": "provider",
     }
     for key, value in defaults.items():
         if key not in st.session_state:
@@ -189,6 +196,9 @@ def _create_vector_database(vdb_name):
 def _show_document_upload_ui(vector_db_name, vector_db_obj=None):
     """Display UI for uploading documents to an existing vector database.
 
+    Shows an extraction method toggle that determines which file types are
+    accepted and how they are processed before ingestion.
+
     Args:
         vector_db_name (str): Name of the selected vector database
         vector_db_obj: The actual vector database object with identifier
@@ -200,44 +210,82 @@ def _show_document_upload_ui(vector_db_name, vector_db_obj=None):
 
     _show_status("upload_status", "upload_message")
 
+    local_label = (
+        "Docling ("
+        + ", ".join(LOCAL_SUPPORTED_EXTENSIONS) + ")"
+    )
+    provider_label = (
+        "LlamaStack Provider ("
+        + ", ".join(PROVIDER_SUPPORTED_EXTENSIONS) + ")"
+    )
+    method_options = [provider_label, local_label]
+
+    selected_label = st.radio(
+        "Extraction method",
+        method_options,
+        key="extraction_method_radio",
+        horizontal=False,
+        help="Local extraction converts .docx/.xlsx to text in the browser. "
+             "LlamaStack Provider sends files directly to the server.",
+    )
+
+    is_local = selected_label == local_label
+    st.session_state["extraction_method"] = "local" if is_local else "provider"
+
+    if is_local:
+        accepted_types = [ext.lstrip(".") for ext in LOCAL_SUPPORTED_EXTENSIONS]
+    else:
+        accepted_types = [ext.lstrip(".") for ext in PROVIDER_SUPPORTED_EXTENSIONS]
+
     upload_key = f"processed_files_{vector_db_name}"
     if upload_key not in st.session_state:
         st.session_state[upload_key] = set()
 
     uploaded_files = st.file_uploader(
         "Browse and select files to upload (files will upload automatically)",
         accept_multiple_files=True,
-        type=["txt", "pdf", "doc", "docx", "md"],
-        key=f"uploader_{vector_db_name}",
+        type=accepted_types,
+        key=f"uploader_{vector_db_name}_{st.session_state['extraction_method']}",
         help=(
-            "Select one or more documents - they will be uploaded "
+            "Select one or more documents — they will be uploaded "
             "automatically to this vector database"
         ),
     )
 
     if uploaded_files:
-        file_set_id = frozenset([f.name + str(f.size) for f in uploaded_files])
+        new_files = [
+            f for f in uploaded_files
+            if f.name + str(f.size) not in st.session_state[upload_key]
+        ]
 
-        if file_set_id not in st.session_state[upload_key]:
-            st.session_state[upload_key].add(file_set_id)
+        if new_files:
+            for f in new_files:
+                st.session_state[upload_key].add(f.name + str(f.size))
 
             if vector_db_obj and hasattr(vector_db_obj, 'id'):
                 vector_db_id = vector_db_obj.id
             else:
                 vector_db_id = vector_db_name
 
             _upload_documents_to_database(
-                vector_db_name, uploaded_files, vector_db_id
+                vector_db_name,
+                new_files,
+                vector_db_id,
+                extraction_method=st.session_state["extraction_method"],
             )
 
-
-def _upload_documents_to_database(vector_db_name, uploaded_files, vector_db_id=None):
+def _upload_documents_to_database(vector_db_name, uploaded_files, vector_db_id=None, extraction_method="provider"):
     """Upload documents to an existing vector database.
 
+    When extraction_method is "local", files are first converted to plain text
+    using the local extractors and the resulting .txt content is uploaded.
+    When "provider", files are sent directly to the LlamaStack server.
+
     Args:
         vector_db_name (str): Name of the target vector database
         uploaded_files: List of uploaded files from Streamlit file uploader
         vector_db_id (str): The actual database identifier for API calls
+        extraction_method (str): "local" for client-side extraction, "provider" for server-side
     """
     try:
         st.session_state["upload_status"] = None
@@ -251,16 +299,37 @@ def _upload_documents_to_database(vector_db_name, uploaded_files, vector_db_id=N
         actual_db_id = vector_db_id or vector_db_name
         uploaded_file_ids = []
 
-        with st.spinner(f"Uploading {len(uploaded_files)} file(s)..."):
+        spinner_msg = (
+            f"Extracting and uploading {len(uploaded_files)} file(s)..."
+            if extraction_method == "local"
+            else f"Uploading {len(uploaded_files)} file(s)..."
+        )
+
+        with st.spinner(spinner_msg):
             for uploaded_file in uploaded_files:
+                original_filename = uploaded_file.name
+
+                if extraction_method == "local":
+                    text_content = extract_text(uploaded_file, original_filename)
+                    file_to_upload = create_text_file_from_extracted_content(
+                        text_content, original_filename
+                    )
+                else:
+                    file_to_upload = uploaded_file
+
                 file_response = llama_stack_api.client.files.create(
-                    file=uploaded_file,
+                    file=file_to_upload,
                     purpose="assistants"
                 )
-                llama_stack_api.client.vector_stores.files.create(
-                    vector_store_id=actual_db_id,
-                    file_id=file_response.id,
-                )
+
+                vs_file_kwargs = {
+                    "vector_store_id": actual_db_id,
+                    "file_id": file_response.id,
+                }
+                if extraction_method == "local":
+                    vs_file_kwargs["attributes"] = {"source": original_filename}
+
+                llama_stack_api.client.vector_stores.files.create(**vs_file_kwargs)
                 uploaded_file_ids.append(file_response.id)
 
         st.session_state["upload_status"] = "success"