diff --git a/configs/config_web_opensearch.yml b/configs/config_web_opensearch.yml new file mode 100644 index 00000000..c4ae5b56 --- /dev/null +++ b/configs/config_web_opensearch.yml @@ -0,0 +1,207 @@ +# This is a Web mode configuration for OpenSearch vector retrieval. +# It has the following features: +# - Web search enabled by default +# - Knowledge retrieval using the built-in OpenSearch backend. +# - Supports self-hosted OpenSearch and Amazon OpenSearch Serverless via SigV4. + +general: + use_uvloop: true + telemetry: + logging: + console: + _type: console + level: INFO + # tracing: + # langsmith: # Optional: LangSmith tracing - requires langsmith API key. Set using `export LANGSMITH_API_KEY=` + # _type: langsmith + # project: nvidia-aiq + + front_end: + _type: aiq_api + runner_class: aiq_api.plugin.AIQAPIWorker + # ========================================================================= + # Knowledge API is automatically enabled when knowledge_retrieval function + # is configured + # ========================================================================= + # Async Job API Settings + # ========================================================================= + # Async job infrastructure database (NAT JobStore + EventStore) + # Used by: /v1/jobs/async routes, SSE streaming, job status persistence + # Requires async driver for SQLite (aiosqlite) or PostgreSQL (asyncpg) + # Environment overrides: + # - NAT_JOB_STORE_DB_URL (direct override) + # - NAT_JOB_STORE_DB_URL_DEV / NAT_JOB_STORE_DB_URL_PROD (via NAT_ENV) + db_url: ${NAT_JOB_STORE_DB_URL:-sqlite+aiosqlite:///./jobs.db} + # Job expiry - how long completed jobs stay in database before cleanup + expiry_seconds: 86400 # 24 hours (min: 600, max: 604800/7 days) + cors: + allow_origin_regex: 'http://localhost(:\d+)?|http://127.0.0.1(:\d+)?' 
+ allow_methods: + - GET + - POST + - DELETE + - OPTIONS + allow_headers: + - "*" + allow_credentials: true + expose_headers: + - "*" + +llms: + nemotron_llm_intent: + _type: nim + model_name: nvidia/nemotron-3-nano-30b-a3b + base_url: "https://integrate.api.nvidia.com/v1" + temperature: 0.5 + top_p: 0.9 + max_tokens: 4096 + num_retries: 5 + chat_template_kwargs: + enable_thinking: true + + nemotron_nano_llm: + _type: nim + model_name: nvidia/nemotron-3-nano-30b-a3b + base_url: "https://integrate.api.nvidia.com/v1" + temperature: 0.1 + top_p: 0.3 + max_tokens: 16384 + num_retries: 5 + chat_template_kwargs: + enable_thinking: true + + gpt_oss_llm: + _type: nim + model_name: openai/gpt-oss-120b + base_url: https://integrate.api.nvidia.com/v1 + temperature: 1.0 + top_p: 1.0 + max_tokens: 256000 + api_key: ${NVIDIA_API_KEY} + max_retries: 10 + + # Nemotron Super is compatible and tested with AIQ but has limited availability + # on the Build API due to high demand. + # Uncomment nemotron_super_llm below if the endpoint is accessible. + # nemotron_super_llm: + # _type: nim + # model_name: nvidia/nemotron-3-super-120b-a12b + # base_url: "https://integrate.api.nvidia.com/v1" + # temperature: 1.0 + # top_p: 1.0 + # max_tokens: 128000 + # num_retries: 5 + # chat_template_kwargs: + # enable_thinking: true + +functions: + # ========================================================================= + # Data Source Registry + # ========================================================================= + # Central registry that controls: + # 1. UI toggles — each source appears as an on/off switch in the frontend + # 2. Per-message filtering — users can select active sources per request + # 3. 
Tool auto-inheritance — agents with no explicit `tools` list receive + # every tool listed here (use `exclude_tools` on agents to specialize) + # + # Source entry fields: + # id, name, description, tools, requires_auth (default: false), + # default_enabled (default: true) + # + # See docs/source/customization/tools-and-sources.md for full details. + # ========================================================================= + data_sources: + _type: data_source_registry + sources: + - id: web_search + name: "Web Search" + description: "Search the web for real-time information." + tools: + - web_search_tool + - advanced_web_search_tool + - id: knowledge_layer + name: "Knowledge Base" + description: "Search uploaded documents and files." + tools: + - knowledge_search + + web_search_tool: + _type: tavily_web_search + max_results: 5 + max_content_length: 1000 + + advanced_web_search_tool: + _type: tavily_web_search + max_results: 2 + advanced_search: true + + # Knowledge Retrieval (see sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md) + knowledge_search: + _type: knowledge_retrieval + backend: opensearch + collection_name: ${COLLECTION_NAME:-test_collection} + top_k: 5 + opensearch_url: ${OPENSEARCH_URL:-http://localhost:9200} + opensearch_auth_type: ${OPENSEARCH_AUTH_TYPE:-none} + opensearch_aws_region: ${AWS_REGION:-us-east-1} + opensearch_aws_service: ${OPENSEARCH_AWS_SERVICE:-aoss} + opensearch_index_prefix: ${OPENSEARCH_INDEX_PREFIX:-aiq} + opensearch_embedding_dim: ${OPENSEARCH_EMBEDDING_DIM:-2048} + opensearch_ingestion_mode: ${OPENSEARCH_INGESTION_MODE:-auto} + opensearch_dask_scheduler_address: ${NAT_DASK_SCHEDULER_ADDRESS:-} + opensearch_dask_file_transfer: ${OPENSEARCH_DASK_FILE_TRANSFER:-bytes} + embed_model: ${AIQ_EMBED_MODEL:-nvidia/llama-nemotron-embed-vl-1b-v2} + embed_base_url: ${AIQ_EMBED_BASE_URL:-https://integrate.api.nvidia.com/v1} + + # Paper Search (optional - requires SERPER_API_KEY) + # Uncomment the block below and set SERPER_API_KEY to enable 
academic paper search. + # paper_search_tool: + # _type: paper_search + # max_results: 5 + # serper_api_key: ${SERPER_API_KEY} + + # ========================================================================= + # Agents — inherit all registry tools; use exclude_tools to specialize + # ========================================================================= + intent_classifier: + _type: intent_classifier + llm: nemotron_llm_intent + # tools: omitted -> inherits all from data_source_registry + # exclude_tools: [] + + clarifier_agent: + _type: clarifier_agent + llm: nemotron_nano_llm # replace with nemotron_super_llm if available + planner_llm: nemotron_nano_llm # replace with nemotron_super_llm if available + # tools: omitted -> inherits all from data_source_registry + # exclude_tools: [] + max_turns: 3 + enable_plan_approval: true + log_response_max_chars: 2000 + verbose: true + + shallow_research_agent: + _type: shallow_research_agent + llm: nemotron_nano_llm + # tools: omitted -> inherits all from data_source_registry + exclude_tools: + - advanced_web_search_tool + max_llm_turns: 10 + max_tool_iterations: 5 + + deep_research_agent: + _type: deep_research_agent + orchestrator_llm: gpt_oss_llm + researcher_llm: nemotron_nano_llm # replace with nemotron_super_llm if available + planner_llm: gpt_oss_llm + # tools: omitted -> inherits all from data_source_registry + exclude_tools: + - web_search_tool + max_loops: 2 + +workflow: + _type: chat_deepresearcher_agent + enable_escalation: true + enable_clarifier: true + use_async_deep_research: true + checkpoint_db: ${AIQ_CHECKPOINT_DB:-./checkpoints.db} diff --git a/deploy/helm/README.md b/deploy/helm/README.md index 7cd16932..4b092219 100644 --- a/deploy/helm/README.md +++ b/deploy/helm/README.md @@ -128,6 +128,45 @@ To see what values the chart supports before installing: helm show values aiq2-web-2.0.0.tgz ``` +### Amazon OpenSearch Serverless + +The backend image can be overridden through values without forking the 
chart: + +```yaml +aiq: + apps: + backend: + image: + repository: / + tag: +``` + +For Amazon OpenSearch Serverless, set the backend workflow config to `configs/config_web_opensearch.yml` and configure +SigV4 through environment values: + +```yaml +aiq: + apps: + backend: + env: + CONFIG_FILE: configs/config_web_opensearch.yml + OPENSEARCH_URL: https://abc123.us-west-2.aoss.amazonaws.com + OPENSEARCH_AUTH_TYPE: sigv4 + OPENSEARCH_AWS_SERVICE: aoss + OPENSEARCH_INDEX_PREFIX: aiq + AWS_REGION: us-west-2 + OPENSEARCH_INGESTION_MODE: auto + OPENSEARCH_DASK_FILE_TRANSFER: bytes +``` + +A complete example is available at +[`deploy/helm/examples/aws-opensearch-serverless-values.yaml`](examples/aws-opensearch-serverless-values.yaml). + +For EKS Pod Identity, associate the IAM role with the backend service account for this release. With the default chart +names, the namespace is `ns-aiq` and the backend service account is `aiq-backend`. EKS Pod Identity associations are +created through EKS, not by annotating the service account. The role also needs OpenSearch Serverless IAM access and a +data access policy for the target collection/index pattern. + ### Verify ```bash diff --git a/deploy/helm/examples/aws-opensearch-serverless-values.yaml b/deploy/helm/examples/aws-opensearch-serverless-values.yaml new file mode 100644 index 00000000..8bf41172 --- /dev/null +++ b/deploy/helm/examples/aws-opensearch-serverless-values.yaml @@ -0,0 +1,32 @@ +# Example values for AI-Q with Amazon OpenSearch Serverless. +# +# Before applying this file: +# - Create an OpenSearch Serverless vector collection and note its data endpoint. +# - Create an AOSS data access policy for the service account IAM role. +# - Create an EKS Pod Identity association that maps the backend service account +# for this release (default name: aiq-backend in namespace ns-aiq) to that role. + +aiq: + apps: + backend: + image: + # Override this when testing a custom image that includes unreleased changes. 
+ repository: nvcr.io/nvidia/blueprint/aiq-agent + tag: "2.0.0" + pullPolicy: IfNotPresent + imagePullSecrets: + - name: ngc-image-pull-secret + secretEnv: + NVIDIA_API_KEY: NVIDIA_API_KEY + env: + CONFIG_FILE: configs/config_web_opensearch.yml + COLLECTION_NAME: default_collection + OPENSEARCH_URL: https://abc123.us-west-2.aoss.amazonaws.com + OPENSEARCH_AUTH_TYPE: sigv4 + OPENSEARCH_AWS_SERVICE: aoss + OPENSEARCH_INDEX_PREFIX: aiq + AWS_REGION: us-west-2 + OPENSEARCH_INGESTION_MODE: auto + OPENSEARCH_DASK_FILE_TRANSFER: bytes + DASK_NWORKERS: "1" + DASK_NTHREADS: "4" diff --git a/docs/source/deployment/aws-opensearch-serverless.md b/docs/source/deployment/aws-opensearch-serverless.md new file mode 100644 index 00000000..ff9b17da --- /dev/null +++ b/docs/source/deployment/aws-opensearch-serverless.md @@ -0,0 +1,477 @@ + + +# Amazon OpenSearch Serverless + +AI-Q can use the built-in OpenSearch knowledge backend with Amazon OpenSearch Serverless vector collections. The backend +uses SigV4 service `aoss`, creates one OpenSearch index per AI-Q collection/session, and supports Dask ingestion workers +by creating the OpenSearch client inside the worker process. + +```{note} +**Migrating from AI-Q v1.0.** On v1.0, OpenSearch support shipped through a custom Docker image +built from [`awslabs/ai-on-eks`](https://github.com/awslabs/ai-on-eks) via `./deploy.sh build`. On +v2.0, OpenSearch is a built-in knowledge backend selected through workflow YAML +(`backend: opensearch`). You no longer need to maintain a custom image build pipeline. +``` + +## Architecture + +```{mermaid} +flowchart LR + user[User / UI] -->|HTTPS| backend[aiq-agent pod<br/>service account: aiq-backend] + backend -->|submit ingest| dask_sched[Dask scheduler] + dask_sched --> dask_worker[Dask worker<br/>same service account] + backend -->|SigV4 retrieval| aoss[(Amazon OpenSearch<br/>Serverless collection)] + dask_worker -->|SigV4 ingest| aoss + pod_identity[EKS Pod Identity<br/>association] -.maps SA to.-> iam[IAM role<br/>aoss:APIAccessAll] + iam -.assumed by.-> backend + iam -.assumed by.-> dask_worker + aoss_dap[AOSS data access policy] -.grants index ops.-> iam +``` + +The backend pod and every Dask worker assume the same IAM role through the EKS Pod Identity +association on the `aiq-backend` service account. Each Dask worker constructs its own OpenSearch +client, so SigV4 signing happens in the worker's process — no signer state is serialized across +the cluster. + +## Prerequisites + +| Item | Version / detail | +|------|------------------| +| AWS account | with permissions to create AOSS collections, IAM roles, and EKS Pod Identity associations | +| AWS CLI | v2.15+ (Pod Identity associations require recent AWS CLI) | +| `kubectl` | v1.29+ | +| `helm` | v3.14+ | +| EKS cluster | v1.29+ with the EKS Pod Identity Agent add-on installed | +| Region | the same region for the EKS cluster and the AOSS collection | +| `nvcr.io` access | NGC API key for pulling `nvcr.io/nvidia/blueprint/aiq-agent` | + +Install the EKS Pod Identity Agent add-on once per cluster: + +```bash +aws eks create-addon \ + --cluster-name <cluster-name> \ + --addon-name eks-pod-identity-agent +``` + +Confirm it is `ACTIVE` before continuing: + +```bash +aws eks describe-addon --cluster-name <cluster-name> --addon-name eks-pod-identity-agent \ + --query 'addon.status' --output text +``` + +Expected: `ACTIVE`. + +## Create the OpenSearch Serverless collection + +AOSS requires an encryption policy and a network policy before the collection can be created. +Replace `<collection-name>` and `<region>` throughout. The examples below use AWS-owned KMS keys +and a public network policy; harden these for production. + +### 1. Encryption policy + +```bash +COLLECTION=<collection-name> +REGION=<region> + +aws opensearchserverless create-security-policy \ + --region "$REGION" \ + --name "${COLLECTION}-enc" \ + --type encryption \ + --policy "{\"Rules\":[{\"ResourceType\":\"collection\",\"Resource\":[\"collection/${COLLECTION}\"]}],\"AWSOwnedKey\":true}" +``` + +### 2. 
Network policy + +```bash +aws opensearchserverless create-security-policy \ + --region "$REGION" \ + --name "${COLLECTION}-net" \ + --type network \ + --policy "[{\"Rules\":[{\"ResourceType\":\"collection\",\"Resource\":[\"collection/${COLLECTION}\"]},{\"ResourceType\":\"dashboard\",\"Resource\":[\"collection/${COLLECTION}\"]}],\"AllowFromPublic\":true}]" +``` + +For private VPC access, replace `AllowFromPublic` with `SourceVPCEs`. See the +[AOSS network policy docs](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-network.html). + +### 3. Create the collection + +```bash +aws opensearchserverless create-collection \ + --region "$REGION" \ + --name "$COLLECTION" \ + --type VECTORSEARCH +``` + +Wait until the collection is `ACTIVE` and capture the data endpoint: + +```bash +aws opensearchserverless batch-get-collection \ + --region "$REGION" --names "$COLLECTION" \ + --query 'collectionDetails[0].[status,collectionEndpoint]' --output text +``` + +Expected output: `ACTIVE https://abc123..aoss.amazonaws.com`. Save the endpoint — it +is the `OPENSEARCH_URL` value used in Helm values. + +## IAM role for the AIQ pod + +Pod Identity assumes an IAM role through `pods.eks.amazonaws.com`. The trust policy for this role +must allow `sts:AssumeRole` and `sts:TagSession` for that principal. + +### 1. Trust policy + +Save as `aiq-trust-policy.json`: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { "Service": "pods.eks.amazonaws.com" }, + "Action": ["sts:AssumeRole", "sts:TagSession"] + } + ] +} +``` + +### 2. Permissions policy + +The role needs `aoss:APIAccessAll` on the collection, plus the AOSS dashboard endpoint if you +want to inspect indexes from the AWS console. 
Save as `aiq-permissions-policy.json` and substitute +your region, account ID, and collection ID: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "aoss:APIAccessAll", + "Resource": "arn:aws:aoss:<region>:<account-id>:collection/<collection-id>" + } + ] +} +``` + +The `<collection-id>` is the suffix returned by `batch-get-collection` under `id` (a 26-character +identifier), not the human-readable name. + +### 3. Create the role + +```bash +aws iam create-role \ + --role-name aiq-opensearch-role \ + --assume-role-policy-document file://aiq-trust-policy.json + +aws iam put-role-policy \ + --role-name aiq-opensearch-role \ + --policy-name aiq-opensearch-access \ + --policy-document file://aiq-permissions-policy.json +``` + +Capture the role ARN — it goes into the Pod Identity association in the "Associate the role with the AIQ service account" section below. + +```bash +aws iam get-role --role-name aiq-opensearch-role --query 'Role.Arn' --output text +``` + +## Grant the role access to AOSS + +AOSS authorizes data plane operations (index create, document write, search) through a +*data access policy* that is separate from IAM. The policy lists IAM principals and the +collections/indexes they can act on. + +Save as `aiq-data-access-policy.json`. 
Substitute your collection name, role ARN, and AIQ index prefix +(`aiq` matches the default `OPENSEARCH_INDEX_PREFIX`): + +```json +[ + { + "Rules": [ + { + "ResourceType": "collection", + "Resource": ["collection/<collection-name>"], + "Permission": ["aoss:DescribeCollectionItems"] + }, + { + "ResourceType": "index", + "Resource": ["index/<collection-name>/aiq*"], + "Permission": [ + "aoss:CreateIndex", + "aoss:DeleteIndex", + "aoss:UpdateIndex", + "aoss:DescribeIndex", + "aoss:ReadDocument", + "aoss:WriteDocument" + ] + } + ], + "Principal": ["arn:aws:iam::<account-id>:role/aiq-opensearch-role"], + "Description": "AIQ backend access to AOSS indexes" + } +] +``` + +```bash +aws opensearchserverless create-access-policy \ + --region "$REGION" \ + --name "${COLLECTION}-aiq" \ + --type data \ + --policy file://aiq-data-access-policy.json +``` + +The index resource pattern `index/<collection-name>/aiq*` covers every AIQ session collection, since +the OpenSearch backend creates indexes named `aiq-<collection-name>` (or `aiq-s_<session-id>` for session +collections). + +## Associate the role with the AIQ service account + +EKS Pod Identity binds an IAM role to a Kubernetes service account. With the default Helm +release names, the namespace is `ns-aiq` and the backend service account is `aiq-backend`. + +```bash +aws eks create-pod-identity-association \ + --cluster-name <cluster-name> \ + --namespace ns-aiq \ + --service-account aiq-backend \ + --role-arn arn:aws:iam::<account-id>:role/aiq-opensearch-role +``` + +The same service account is used by the embedded Dask scheduler and worker, so SigV4 +credentials are available throughout the ingestion pipeline. No service-account annotation is +required — Pod Identity does not use OIDC trust like IRSA. + +## Workflow Config + +Use `configs/config_web_opensearch.yml`: + +```{note} +**Text-only ingestion.** The OpenSearch backend extracts plain text from PDFs, DOCX, and PPTX. It does +not currently support table/image/chart extraction (those flags are LlamaIndex-only). For multimodal, +use the LlamaIndex backend or Foundational RAG. 
+``` + +```yaml +functions: + knowledge_search: + _type: knowledge_retrieval + backend: opensearch + collection_name: ${COLLECTION_NAME:-test_collection} + opensearch_url: ${OPENSEARCH_URL} + opensearch_auth_type: sigv4 + opensearch_aws_region: ${AWS_REGION} + opensearch_aws_service: aoss + opensearch_index_prefix: ${OPENSEARCH_INDEX_PREFIX:-aiq} + opensearch_ingestion_mode: ${OPENSEARCH_INGESTION_MODE:-auto} + opensearch_dask_file_transfer: ${OPENSEARCH_DASK_FILE_TRANSFER:-bytes} +``` + +Session collection names such as `s_` map to physical indexes like `aiq-s_` inside the same Serverless +collection endpoint. The backend stores collection metadata in mapping `_meta` and the TTL cleanup thread deletes +expired session indexes. + +## Helm Values + +Use the example values file as a starting point: + +### Pull secret for `nvcr.io` + +The example values reference `nvcr.io/nvidia/blueprint/aiq-agent`. Create an NGC API key at +[`ngc.nvidia.com`](https://ngc.nvidia.com), then create the pull secret in the release namespace: + +```bash +kubectl create namespace ns-aiq --dry-run=client -o yaml | kubectl apply -f - + +kubectl -n ns-aiq create secret docker-registry ngc-image-pull-secret \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password= +``` + +The secret name `ngc-image-pull-secret` matches the +[`deploy/helm/examples/aws-opensearch-serverless-values.yaml`](../../../deploy/helm/examples/aws-opensearch-serverless-values.yaml) +`imagePullSecrets` entry. Change both if you use a different name. + +### Embedding endpoint + +The OpenSearch ingestor calls an OpenAI-compatible embeddings endpoint to vectorize chunks +before indexing. Two options: + +**Option A: NVIDIA hosted API (default).** The ingestor calls +`https://integrate.api.nvidia.com/v1` and reads `NVIDIA_API_KEY` from the pod environment. 
+Create the shared credentials secret once and the example values' `secretEnv` block injects +`NVIDIA_API_KEY` into the backend container: + +```bash +kubectl -n ns-aiq create secret generic aiq-credentials \ + --from-literal=NVIDIA_API_KEY= +``` + +The chart's `secretEnv` pattern maps env-var names to keys in this shared secret. Add other +keys (database credentials, etc.) to the same secret if your release needs them. + +**Option B: Self-hosted NIM on the same cluster.** Override `AIQ_EMBED_BASE_URL` to point at +your in-cluster NIM service and leave `NVIDIA_API_KEY` empty. Add to `backend.env` in your +values: + +```yaml + AIQ_EMBED_BASE_URL: http://nim-embedqa.ns-nim.svc.cluster.local:8000/v1 + AIQ_EMBED_MODEL: nvidia/llama-nemotron-embed-vl-1b-v2 +``` + +The embedding model dimension must match `OPENSEARCH_EMBEDDING_DIM` in the workflow config +(default `2048` for `nvidia/llama-nemotron-embed-vl-1b-v2`). Mismatched dimensions surface +as `mapper_parsing_exception` on the first ingest. + +```bash +helm upgrade --install aiq deploy/helm/deployment-k8s \ + -n ns-aiq --create-namespace \ + -f deploy/helm/examples/aws-opensearch-serverless-values.yaml +``` + +Override the backend image when testing unreleased code: + +```yaml +aiq: + apps: + backend: + image: + repository: / + tag: +``` + +## Verify the deployment + +### 1. Pod is running and Pod Identity is attached + +```bash +kubectl -n ns-aiq get pods -l app.kubernetes.io/name=aiq-agent +kubectl -n ns-aiq describe pod -l app.kubernetes.io/name=aiq-agent | grep -A2 'AWS_CONTAINER_CREDENTIALS' +``` + +Expected: pod is `Running`, the describe output shows +`AWS_CONTAINER_CREDENTIALS_FULL_URI` injected by the EKS Pod Identity Agent. If that variable +is missing, the Pod Identity association is not in effect — re-check the cluster, namespace, +and service-account triple in the previous section. + +### 2. 
Backend health check + +```bash +kubectl -n ns-aiq port-forward svc/aiq-agent 8000:8000 & +curl -sf http://localhost:8000/health +``` + +Expected: `{"status":"healthy"}` (the `aiq_api` front end exposes a JSON health route at `/health`). + +### 3. Upload a document + +```bash +curl -sf -X POST http://localhost:8000/v1/collections \ + -H 'Content-Type: application/json' \ + -d '{"name":"smoke","description":"smoke test"}' + +curl -sf -X POST http://localhost:8000/v1/collections/smoke/documents \ + -F 'files=@README.md' +``` + +Expected: a `job_id` is returned. Poll `GET /v1/documents/{job_id}/status` until `status` is +`SUCCESS`. If it stalls in `INGESTING`, check the Dask worker logs for SigV4 errors: + +```bash +kubectl -n ns-aiq logs -l app.kubernetes.io/name=aiq-agent --tail=200 | grep -i opensearch +``` + +### 4. Confirm the index appears in AOSS + +```bash +aws opensearchserverless list-collections --region "$REGION" +``` + +```bash +curl -sf "http://localhost:8000/v1/collections" | jq +``` + +Expected: `aiq-smoke` index visible in the AOSS console under the collection's index browser, +and the `smoke` collection listed by the AIQ API. + +```{note} +**AOSS visibility delay.** AOSS is eventually consistent for search after writes. A `_count` immediately +after a successful upload may report `0` for ~5–30 seconds before catching up. If the AIQ status says +`completed` but the AOSS console index browser shows zero docs, wait 30s and refresh — the index will +populate. This is also why the live-test suite includes a polling visibility wait. +``` + +### 5. Run a knowledge query + +```bash +curl -sf -X POST http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{"messages":[{"role":"user","content":"what is in the smoke document"}]}' +``` + +Expected: response includes content from `README.md` with citations. + +## Local Live Test + +For SSO credentials, clear stale environment credentials before running the test. 
Environment credentials take +precedence over `AWS_PROFILE`. + +```bash +unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN AWS_CREDENTIAL_EXPIRATION +aws sso login --profile cs-admin + +AIQ_OPENSEARCH_SERVERLESS_LIVE_TESTS=1 \ +OPENSEARCH_URL=https://abc123.us-west-2.aoss.amazonaws.com \ +AWS_REGION=us-west-2 \ +AWS_PROFILE=cs-admin \ +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_serverless_live.py -rs -vv +``` + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|--------------|-----| +| `403` from AOSS | Missing IAM or data access policy | Grant `aoss:APIAccessAll` and AOSS data access permissions for the index pattern | +| `Credentials were refreshed, but the refreshed credentials are still expired` | Stale exported AWS session credentials override SSO | Unset the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN`, and `AWS_CREDENTIAL_EXPIRATION` variables | +| Empty results immediately after ingest | AOSS search visibility delay | Retry retrieval; live tests wait for document visibility | +| Mapping dimension error | Embedding model dimension does not match index mapping | Set `OPENSEARCH_EMBEDDING_DIM` before creating the index | +| Dask worker stdout is empty during local testing | `DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=warning` (default in `deploy/.env`) silences worker logs. Ingestion still succeeds — verify by counting docs in AOSS, not by tailing the worker. | Override locally with `DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=info` if you need worker logs during development. 
| + +## Cleanup + +```bash +helm uninstall aiq -n ns-aiq +kubectl delete namespace ns-aiq + +aws eks delete-pod-identity-association \ + --cluster-name \ + --association-id + +aws iam delete-role-policy --role-name aiq-opensearch-role --policy-name aiq-opensearch-access +aws iam delete-role --role-name aiq-opensearch-role + +aws opensearchserverless delete-access-policy --type data --name "${COLLECTION}-aiq" +aws opensearchserverless delete-collection --id +aws opensearchserverless delete-security-policy --type network --name "${COLLECTION}-net" +aws opensearchserverless delete-security-policy --type encryption --name "${COLLECTION}-enc" +``` + +Get the Pod Identity `` with: + +```bash +aws eks list-pod-identity-associations \ + --cluster-name --namespace ns-aiq \ + --query 'associations[?serviceAccount==`aiq-backend`].associationId' --output text +``` + +Get the AOSS `` with: + +```bash +aws opensearchserverless batch-get-collection --names "$COLLECTION" \ + --query 'collectionDetails[0].id' --output text +``` diff --git a/docs/source/deployment/index.md b/docs/source/deployment/index.md index ee6e5c99..583f1e56 100644 --- a/docs/source/deployment/index.md +++ b/docs/source/deployment/index.md @@ -27,6 +27,8 @@ All containerized deployments run the same three services: - **[Kubernetes (Helm)](./kubernetes.md)** -- Helm chart deployment for Kubernetes clusters, including NGC image pull secrets, configuration switching, FRAG integration, and troubleshooting. +- **[Amazon OpenSearch Serverless](./aws-opensearch-serverless.md)** -- EKS and OpenSearch Serverless deployment notes for the built-in OpenSearch knowledge backend. + - **[Docker Build System](./docker-build.md)** -- Multi-stage Dockerfile architecture, build targets (dev vs. release), base images, and startup scripts (`entrypoint.py` and `start_web.py`). - **[Observability](./observability.md)** -- Tracing and monitoring with Phoenix, LangSmith, Weave, and OpenTelemetry. 
diff --git a/docs/source/index.md b/docs/source/index.md index 44d0e3e8..1caf0515 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -97,6 +97,7 @@ Docker Build System <./deployment/docker-build.md> Observability <./deployment/observability.md> Production <./deployment/production.md> Kubernetes <./deployment/kubernetes.md> +Amazon OpenSearch Serverless <./deployment/aws-opensearch-serverless.md> ``` ```{toctree} diff --git a/docs/superpowers/plans/2026-05-03-opensearch-eks-reference-deployment.md b/docs/superpowers/plans/2026-05-03-opensearch-eks-reference-deployment.md new file mode 100644 index 00000000..f02e1863 --- /dev/null +++ b/docs/superpowers/plans/2026-05-03-opensearch-eks-reference-deployment.md @@ -0,0 +1,842 @@ +# OpenSearch EKS Reference Deployment Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make the OpenSearch / Amazon OpenSearch Serverless reference deployment on EKS with Pod Identity self-contained inside this repo, so an AWS customer can stand up AIQ 2.0 end-to-end without forking images or hunting through external docs. + +**Architecture:** Pure docs + example values + verification commands. No application code in this plan (gaps/risks are deferred to a follow-up plan). The existing OpenSearch adapter, registration, helm chart, and helm example values stay as they are. We deepen `docs/source/deployment/aws-opensearch-serverless.md` and `deploy/helm/examples/aws-opensearch-serverless-values.yaml` so the doc walks a customer from "have an AWS account" to "AIQ pod is querying AOSS via SigV4 from Dask workers using EKS Pod Identity." + +**Tech Stack:** MyST/Markdown (Sphinx with `myst_parser`, `sphinxmermaid`), YAML (Helm values, NAT workflow config), AWS CLI (`aoss`, `iam`, `eks`), `kubectl`, `helm`. 
+ +**Spec mapping (PR ask #6, awslabs publishing removed):** +- "A reference YAML config" → `configs/config_web_opensearch.yml` (already exists; verified in Task 11). +- "EKS deployment docs using Pod Identity" → `docs/source/deployment/aws-opensearch-serverless.md` (today: 99 lines, thin) is expanded by Tasks 1–10. + +**Working file inventory (read these before starting):** +- `docs/source/deployment/aws-opensearch-serverless.md` — primary doc, expanded throughout. +- `deploy/helm/examples/aws-opensearch-serverless-values.yaml` — example values, gets `imagePullSecrets` and embedding wiring. +- `deploy/helm/README.md` — already cross-links the example file (no further changes here). +- `docs/source/deployment/index.md` — already lists the new doc on line 30 (no further changes here). +- `configs/config_web_opensearch.yml` — workflow config, already env-substitution-driven (no further changes here). +- `sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md` — gets one cross-link added in Task 11. + +--- + +### Task 1: Add v1.0 → v2.0 migration callout at the top of the AOSS doc + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert after the H1 on line 6) + +**Why:** The PR is framed as closing the v1.0/v2.0 gap. AWS readers landing on this page need to see in the first 10 seconds that they no longer need the `./deploy.sh build` custom-image step from the v1.0 reference. Without this, customers will assume the v1.0 fork pattern is still required. + +- [ ] **Step 1: Insert the migration note** + +Open `docs/source/deployment/aws-opensearch-serverless.md`. After the existing line 8 paragraph (the one starting "AI-Q can use the built-in OpenSearch knowledge backend…"), insert: + +```markdown +:::{note} Migrating from AI-Q v1.0 +On v1.0, OpenSearch support shipped through a custom Docker image built from +[`awslabs/ai-on-eks`](https://github.com/awslabs/ai-on-eks) via `./deploy.sh build`. 
On v2.0, +OpenSearch is a built-in knowledge backend selected through workflow YAML +(`backend: opensearch`). You no longer need to fork or rebuild the NVIDIA base images. +::: +``` + +The `:::{note}` syntax is a MyST admonition; this repo's `conf.py` enables `colon_fence` so it renders as a callout box in Sphinx output. + +- [ ] **Step 2: Render the page locally and confirm the callout shows** + +Run from the repo root: + +```bash +cd docs && make html +``` + +Expected: build completes with no warnings about the new file. Open `docs/_build/html/source/deployment/aws-opensearch-serverless.html` in a browser and confirm the green/blue note box appears under the H1. + +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): add v1.0 to v2.0 migration callout to AOSS guide" +``` + +--- + +### Task 2: Add an architecture diagram to the AOSS doc + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert a new `## Architecture` section above `## Workflow Config`) + +**Why:** The non-obvious part of this design is that Dask workers create their own OpenSearch client so SigV4 credentials resolve in the worker's process (Pod Identity, SSO, env profiles). A diagram makes this "aha" visible. Mermaid is already enabled via `sphinxmermaid` in `docs/source/conf.py`. + +- [ ] **Step 1: Insert the architecture section** + +Above the `## Workflow Config` heading, insert: + +````markdown +## Architecture + +```{mermaid} +flowchart LR + user[User / UI] -->|HTTPS| backend[aiq-agent pod<br/>service account: aiq-backend] + backend -->|submit ingest| dask_sched[Dask scheduler] + dask_sched --> dask_worker[Dask worker<br/>same service account] + backend -->|SigV4 retrieval| aoss[(Amazon OpenSearch<br/>Serverless collection)] + dask_worker -->|SigV4 ingest| aoss + pod_identity[EKS Pod Identity<br/>association] -.maps SA to.-> iam[IAM role<br/>aoss:APIAccessAll] + iam -.assumed by.-> backend + iam -.assumed by.-> dask_worker + aoss_dap[AOSS data access policy] -.grants index ops.-> iam +``` + +The backend pod and every Dask worker assume the same IAM role through the EKS Pod Identity +association on the `aiq-backend` service account. Each Dask worker constructs its own OpenSearch +client, so SigV4 signing happens in the worker's process — no signer state is serialized across +the cluster. +```` + +(The triple-backtick `{mermaid}` fence is the MyST/Sphinx mermaid directive.) + +- [ ] **Step 2: Render and confirm the diagram appears** + +Run: + +```bash +cd docs && make html +``` + +Expected: build completes, diagram renders as SVG in the HTML output. + +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): add architecture diagram showing SigV4 in Dask workers" +``` + +--- + +### Task 3: Add a Prerequisites section + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert `## Prerequisites` immediately after `## Architecture`) + +**Why:** The current doc assumes you already have an EKS cluster, AOSS collection, IAM role, and Pod Identity association. New readers fall off here. A prerequisites section sets expectations and lists exact tool versions. 
+ +- [ ] **Step 1: Insert prerequisites** + +```markdown +## Prerequisites + +| Item | Version / detail | +|------|------------------| +| AWS account | with permissions to create AOSS collections, IAM roles, and EKS Pod Identity associations | +| AWS CLI | v2.15+ (Pod Identity associations require recent AWS CLI) | +| `kubectl` | v1.29+ | +| `helm` | v3.14+ | +| EKS cluster | v1.29+ with the EKS Pod Identity Agent add-on installed | +| Region | the same region for the EKS cluster and the AOSS collection | +| `nvcr.io` access | NGC API key for pulling `nvcr.io/nvidia/blueprint/aiq-agent` | + +Install the EKS Pod Identity Agent add-on once per cluster: + +```bash +aws eks create-addon \ + --cluster-name \ + --addon-name eks-pod-identity-agent +``` + +Confirm it is `ACTIVE` before continuing: + +```bash +aws eks describe-addon --cluster-name --addon-name eks-pod-identity-agent \ + --query 'addon.status' --output text +``` + +Expected: `ACTIVE`. +``` + +- [ ] **Step 2: Render and confirm** + +```bash +cd docs && make html +``` + +Expected: no Sphinx warnings, the new section renders as a table plus two code blocks. + +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): list EKS and tooling prerequisites for AOSS deployment" +``` + +--- + +### Task 4: Add an end-to-end AOSS collection creation walkthrough + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert `## Create the OpenSearch Serverless collection` after `## Prerequisites`) + +**Why:** AOSS requires an encryption policy and a network policy *before* the collection can be created. This trips up first-time AOSS users. Today the doc only shows the Pod Identity command and skips collection creation entirely. 
+ +- [ ] **Step 1: Insert the collection creation walkthrough** + +```markdown +## Create the OpenSearch Serverless collection + +AOSS requires an encryption policy and a network policy before the collection can be created. +Replace `` and `` throughout. The examples below use AWS-owned KMS keys +and a public network policy; harden these for production. + +### 1. Encryption policy + +```bash +COLLECTION= +REGION= + +aws opensearchserverless create-security-policy \ + --region "$REGION" \ + --name "${COLLECTION}-enc" \ + --type encryption \ + --policy "{\"Rules\":[{\"ResourceType\":\"collection\",\"Resource\":[\"collection/${COLLECTION}\"]}],\"AWSOwnedKey\":true}" +``` + +### 2. Network policy + +```bash +aws opensearchserverless create-security-policy \ + --region "$REGION" \ + --name "${COLLECTION}-net" \ + --type network \ + --policy "[{\"Rules\":[{\"ResourceType\":\"collection\",\"Resource\":[\"collection/${COLLECTION}\"]},{\"ResourceType\":\"dashboard\",\"Resource\":[\"collection/${COLLECTION}\"]}],\"AllowFromPublic\":true}]" +``` + +For private VPC access, replace `AllowFromPublic` with `SourceVPCEs`. See the +[AOSS network policy docs](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-network.html). + +### 3. Create the collection + +```bash +aws opensearchserverless create-collection \ + --region "$REGION" \ + --name "$COLLECTION" \ + --type VECTORSEARCH +``` + +Wait until the collection is `ACTIVE` and capture the data endpoint: + +```bash +aws opensearchserverless batch-get-collection \ + --region "$REGION" --names "$COLLECTION" \ + --query 'collectionDetails[0].[status,collectionEndpoint]' --output text +``` + +Expected output: `ACTIVE https://abc123..aoss.amazonaws.com`. Save the endpoint — it +is the `OPENSEARCH_URL` value used in Helm values. +``` + +- [ ] **Step 2: Render and confirm three numbered subsections appear** + +```bash +cd docs && make html +``` + +Expected: H3 entries 1, 2, 3 in the right-side TOC. 
+ +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): walk through AOSS encryption, network, and collection creation" +``` + +--- + +### Task 5: Document the IAM role and trust policy for Pod Identity + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert `## IAM role for the AIQ pod` after the collection creation section) + +**Why:** Pod Identity uses a trust policy that names `pods.eks.amazonaws.com`, which is different from IRSA's OIDC trust policy. Customers familiar with IRSA will write the wrong trust policy. The doc must show the exact trust policy. + +- [ ] **Step 1: Insert the IAM role section** + +```markdown +## IAM role for the AIQ pod + +Pod Identity assumes an IAM role through `pods.eks.amazonaws.com`. The trust policy for this role +must allow `sts:AssumeRole` and `sts:TagSession` for that principal. + +### 1. Trust policy + +Save as `aiq-trust-policy.json`: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { "Service": "pods.eks.amazonaws.com" }, + "Action": ["sts:AssumeRole", "sts:TagSession"] + } + ] +} +``` + +### 2. Permissions policy + +The role needs `aoss:APIAccessAll` on the collection. Add `aoss:DashboardsAccessAll` as well if you +also want to inspect indexes through OpenSearch Dashboards. Save as `aiq-permissions-policy.json` and +substitute your region, account ID, and collection ID: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "aoss:APIAccessAll", + "Resource": "arn:aws:aoss:<region>:<account-id>:collection/<collection-id>" + } + ] +} +``` + +The `<collection-id>` is the value returned by `batch-get-collection` under `id` (a 26-character +identifier), not the human-readable name. + +### 3. 
Create the role + +```bash +aws iam create-role \ + --role-name aiq-opensearch-role \ + --assume-role-policy-document file://aiq-trust-policy.json + +aws iam put-role-policy \ + --role-name aiq-opensearch-role \ + --policy-name aiq-opensearch-access \ + --policy-document file://aiq-permissions-policy.json +``` + +Capture the role ARN — it goes into the Pod Identity association in Task 6. + +```bash +aws iam get-role --role-name aiq-opensearch-role --query 'Role.Arn' --output text +``` +``` + +- [ ] **Step 2: Render and verify both JSON code fences highlight as JSON** + +```bash +cd docs && make html +``` + +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): document Pod Identity trust policy and AOSS IAM permissions" +``` + +--- + +### Task 6: Document the AOSS data access policy and Pod Identity association + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` — replace the existing thin `## EKS Pod Identity` section (currently lines ~36–53 of the file) with a deeper version that covers the AOSS data access policy and the association command. + +**Why:** AOSS has a second authorization layer (data access policies) on top of IAM. Customers stop at IAM, hit a 403, and don't know that the *index resource pattern* is the missing piece. The current doc mentions this in passing — the new section makes it explicit and shows the JSON. + +- [ ] **Step 1: Replace the existing `## EKS Pod Identity` section** + +Delete the current `## EKS Pod Identity` block and replace it with: + +```markdown +## Grant the role access to AOSS + +AOSS authorizes data plane operations (index create, document write, search) through a +*data access policy* that is separate from IAM. The policy lists IAM principals and the +collections/indexes they can act on. + +Save as `aiq-data-access-policy.json`. 
Substitute your role ARN and AIQ index prefix +(`aiq` matches the default `OPENSEARCH_INDEX_PREFIX`): + +```json +[ + { + "Rules": [ + { + "ResourceType": "collection", + "Resource": ["collection/"], + "Permission": ["aoss:DescribeCollectionItems"] + }, + { + "ResourceType": "index", + "Resource": ["index//aiq*"], + "Permission": [ + "aoss:CreateIndex", + "aoss:DeleteIndex", + "aoss:UpdateIndex", + "aoss:DescribeIndex", + "aoss:ReadDocument", + "aoss:WriteDocument" + ] + } + ], + "Principal": ["arn:aws:iam:::role/aiq-opensearch-role"], + "Description": "AIQ backend access to AOSS indexes" + } +] +``` + +```bash +aws opensearchserverless create-access-policy \ + --region "$REGION" \ + --name "${COLLECTION}-aiq" \ + --type data \ + --policy file://aiq-data-access-policy.json +``` + +The index resource pattern `index//aiq*` covers every AIQ session collection, since +the OpenSearch backend creates indexes named `aiq-` (or `aiq-s_` for session +collections). + +## Associate the role with the AIQ service account + +EKS Pod Identity binds an IAM role to a Kubernetes service account. With the default Helm +release names, the namespace is `ns-aiq` and the backend service account is `aiq-backend`. + +```bash +aws eks create-pod-identity-association \ + --cluster-name \ + --namespace ns-aiq \ + --service-account aiq-backend \ + --role-arn arn:aws:iam:::role/aiq-opensearch-role +``` + +The same service account is used by the embedded Dask scheduler and worker, so SigV4 +credentials are available throughout the ingestion pipeline. No service-account annotation is +required — Pod Identity does not use OIDC trust like IRSA. +``` + +- [ ] **Step 2: Render and confirm both new H2 sections appear in the page TOC** + +```bash +cd docs && make html +``` + +Expected: side TOC shows "Grant the role access to AOSS" and "Associate the role with the AIQ service account". 
+ +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): expand Pod Identity section with AOSS data access policy" +``` + +--- + +### Task 7: Add `imagePullSecrets` to the example Helm values and document the NGC secret + +**Files:** +- Modify: `deploy/helm/examples/aws-opensearch-serverless-values.yaml` +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (extend the `## Helm Values` section) + +**Why:** The example values point at `nvcr.io/nvidia/blueprint/aiq-agent` but customers have no way to pull from `nvcr.io` without an `imagePullSecret`. This is the most common first-failure mode. Fix it in the example, document the secret creation. + +- [ ] **Step 1: Add `imagePullSecrets` to the example values** + +Edit `deploy/helm/examples/aws-opensearch-serverless-values.yaml` so the `backend` block reads: + +```yaml +aiq: + apps: + backend: + image: + repository: nvcr.io/nvidia/blueprint/aiq-agent + tag: "2.0.0" + pullPolicy: IfNotPresent + imagePullSecrets: + - name: ngc-image-pull-secret + env: + CONFIG_FILE: configs/config_web_opensearch.yml + COLLECTION_NAME: default_collection + OPENSEARCH_URL: https://abc123.us-west-2.aoss.amazonaws.com + OPENSEARCH_AUTH_TYPE: sigv4 + OPENSEARCH_AWS_SERVICE: aoss + OPENSEARCH_INDEX_PREFIX: aiq + AWS_REGION: us-west-2 + OPENSEARCH_INGESTION_MODE: auto + OPENSEARCH_DASK_FILE_TRANSFER: bytes + DASK_NWORKERS: "1" + DASK_NTHREADS: "4" +``` + +Verify the file parses: + +```bash +python -c "import yaml; yaml.safe_load(open('deploy/helm/examples/aws-opensearch-serverless-values.yaml'))" +``` + +Expected: no output (valid YAML). + +- [ ] **Step 2: Add a `### Pull secret for nvcr.io` subsection under `## Helm Values` in the AOSS doc** + +Insert before the existing `helm upgrade --install` block: + +```markdown +### Pull secret for `nvcr.io` + +The example values reference `nvcr.io/nvidia/blueprint/aiq-agent`. 
Create an NGC API key at +[`ngc.nvidia.com`](https://ngc.nvidia.com), then create the pull secret in the release namespace: + +```bash +kubectl create namespace ns-aiq --dry-run=client -o yaml | kubectl apply -f - + +kubectl -n ns-aiq create secret docker-registry ngc-image-pull-secret \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password= +``` + +The secret name `ngc-image-pull-secret` matches the +[`deploy/helm/examples/aws-opensearch-serverless-values.yaml`](../../../deploy/helm/examples/aws-opensearch-serverless-values.yaml) +`imagePullSecrets` entry. Change both if you use a different name. +``` + +- [ ] **Step 3: Render and confirm both files** + +```bash +cd docs && make html +``` + +Expected: Sphinx build clean. Open the AOSS page and confirm the new `### Pull secret for nvcr.io` subsection appears under `## Helm Values`. + +- [ ] **Step 4: Commit** + +```bash +git add deploy/helm/examples/aws-opensearch-serverless-values.yaml docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): add nvcr.io pull secret to example values and AOSS guide" +``` + +--- + +### Task 8: Document the embedding endpoint configuration + +**Files:** +- Modify: `deploy/helm/examples/aws-opensearch-serverless-values.yaml` (add `NVIDIA_API_KEY` wiring through a Kubernetes secret) +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (add `### Embedding endpoint` under `## Helm Values`) + +**Why:** The OpenSearch ingestor needs an embedding endpoint. By default it calls `https://integrate.api.nvidia.com/v1` and requires `NVIDIA_API_KEY`. The example values do not show how to provide that key, so customers will silently 401. Document both the hosted-API path and the NIM-on-EKS override path. 
+ +- [ ] **Step 1: Add the secret env wiring to the example values** + +Add to the `backend.env` block in `deploy/helm/examples/aws-opensearch-serverless-values.yaml`: + +```yaml + envFromSecret: + - name: nvidia-api-key + key: NVIDIA_API_KEY + envVar: NVIDIA_API_KEY +``` + +If the chart's existing schema for secret env wiring uses a different key (verify against +`deploy/helm/deployment-k8s/values.yaml` and the `_helpers.tpl` template before committing — +this repo's chart may use `extraEnvVarsSecret` or similar), update the example to match. +The intent: NVIDIA_API_KEY is sourced from a Kubernetes secret, not hard-coded into values. + +Verify the file still parses: + +```bash +python -c "import yaml; yaml.safe_load(open('deploy/helm/examples/aws-opensearch-serverless-values.yaml'))" +``` + +- [ ] **Step 2: Add the embedding endpoint subsection to the AOSS doc** + +Insert under `## Helm Values`, after the pull secret subsection: + +```markdown +### Embedding endpoint + +The OpenSearch ingestor calls an OpenAI-compatible embeddings endpoint to vectorize chunks +before indexing. Two options: + +**Option A: NVIDIA hosted API (default).** The ingestor calls +`https://integrate.api.nvidia.com/v1` and reads `NVIDIA_API_KEY` from the pod environment. +Create the secret once, then the example values mount it: + +```bash +kubectl -n ns-aiq create secret generic nvidia-api-key \ + --from-literal=NVIDIA_API_KEY= +``` + +**Option B: Self-hosted NIM on the same cluster.** Override `AIQ_EMBED_BASE_URL` to your +NIM service and leave `NVIDIA_API_KEY` empty. Add to `backend.env`: + +```yaml + AIQ_EMBED_BASE_URL: http://nim-embedqa.ns-nim.svc.cluster.local:8000/v1 + AIQ_EMBED_MODEL: nvidia/llama-nemotron-embed-vl-1b-v2 +``` + +The embedding model dimension must match `OPENSEARCH_EMBEDDING_DIM` in the workflow config +(default `2048` for `nvidia/llama-nemotron-embed-vl-1b-v2`). Mismatched dimensions surface +as `mapper_parsing_exception` on the first ingest. 
+``` + +- [ ] **Step 3: Render and verify** + +```bash +cd docs && make html +``` + +- [ ] **Step 4: Commit** + +```bash +git add deploy/helm/examples/aws-opensearch-serverless-values.yaml docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): document hosted-API and NIM-on-EKS embedding setups" +``` + +--- + +### Task 9: Add a verification / smoke test section + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert `## Verify the deployment` after the `## Helm Values` section, before `## Local Live Test`) + +**Why:** After install there's no guidance for "did it actually work". A smoke test (port-forward, health check, upload a doc, search) catches the common failures (Pod Identity not associated, AOSS data policy missing, dimension mismatch) inside ten minutes. + +- [ ] **Step 1: Insert the verification section** + +```markdown +## Verify the deployment + +### 1. Pod is running and Pod Identity is attached + +```bash +kubectl -n ns-aiq get pods -l app.kubernetes.io/name=aiq-agent +kubectl -n ns-aiq describe pod -l app.kubernetes.io/name=aiq-agent | grep -A2 'AWS_CONTAINER_CREDENTIALS' +``` + +Expected: pod is `Running`, the describe output shows +`AWS_CONTAINER_CREDENTIALS_FULL_URI` injected by the EKS Pod Identity Agent. If that variable +is missing, the Pod Identity association is not in effect — re-check the cluster, namespace, +and service-account triple in Task 6. + +### 2. Backend health check + +```bash +kubectl -n ns-aiq port-forward svc/aiq-agent 8000:8000 & +curl -sf http://localhost:8000/health +``` + +Expected: `{"status":"ok"}` (or equivalent — match the health route exposed by the deployed +`aiq_api` front end). + +### 3. 
Upload a document + +```bash +curl -sf -X POST http://localhost:8000/v1/collections \ + -H 'Content-Type: application/json' \ + -d '{"name":"smoke","description":"smoke test"}' + +curl -sf -X POST http://localhost:8000/v1/collections/smoke/documents \ + -F 'files=@README.md' +``` + +Expected: a `job_id` is returned. Poll `GET /v1/documents/{job_id}/status` until `status` is +`SUCCESS`. If it stalls in `INGESTING`, check the Dask worker logs for SigV4 errors: + +```bash +kubectl -n ns-aiq logs -l app.kubernetes.io/name=aiq-agent --tail=200 | grep -i opensearch +``` + +### 4. Confirm the index appears in AOSS + +```bash +aws opensearchserverless list-collections --region "$REGION" +``` + +```bash +curl -sf "http://localhost:8000/v1/collections" | jq +``` + +Expected: `aiq-smoke` index visible in the AOSS console under the collection's index browser, +and the `smoke` collection listed by the AIQ API. + +### 5. Run a knowledge query + +```bash +curl -sf -X POST http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{"messages":[{"role":"user","content":"what is in the smoke document"}]}' +``` + +Expected: response includes content from `README.md` with citations. +``` + +- [ ] **Step 2: Render and verify the H3 anchors render** + +```bash +cd docs && make html +``` + +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): add end-to-end verification and smoke test for AOSS install" +``` + +--- + +### Task 10: Add a teardown section + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (insert `## Cleanup` after `## Troubleshooting`) + +**Why:** AOSS collections cost money. Customers running this for a demo need a one-shot teardown so they don't leave a paid collection running. Also helps reviewers reproduce the demo without leaving artifacts. 
+ +- [ ] **Step 1: Insert cleanup section** + +```markdown +## Cleanup + +```bash +helm uninstall aiq -n ns-aiq +kubectl delete namespace ns-aiq + +aws eks delete-pod-identity-association \ + --cluster-name <cluster-name> \ + --association-id <association-id> + +aws iam delete-role-policy --role-name aiq-opensearch-role --policy-name aiq-opensearch-access +aws iam delete-role --role-name aiq-opensearch-role + +aws opensearchserverless delete-access-policy --type data --name "${COLLECTION}-aiq" +aws opensearchserverless delete-collection --id <collection-id> +aws opensearchserverless delete-security-policy --type network --name "${COLLECTION}-net" +aws opensearchserverless delete-security-policy --type encryption --name "${COLLECTION}-enc" +``` + +Get the Pod Identity `<association-id>` with: + +```bash +aws eks list-pod-identity-associations \ + --cluster-name <cluster-name> --namespace ns-aiq \ + --query 'associations[?serviceAccount==`aiq-backend`].associationId' --output text +``` + +Get the AOSS `<collection-id>` with: + +```bash +aws opensearchserverless batch-get-collection --names "$COLLECTION" \ + --query 'collectionDetails[0].id' --output text +``` +``` + +- [ ] **Step 2: Render** + +```bash +cd docs && make html +``` + +- [ ] **Step 3: Commit** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): add teardown commands for AOSS reference deployment" +``` + +--- + +### Task 11: Cross-link from main READMEs and validate the reference YAML + +**Files:** +- Modify: `sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md` (add a deployment cross-link near the OpenSearch SigV4 example) +- Read-only verification: `configs/config_web_opensearch.yml` (no changes; just confirm it matches the doc) + +**Why:** The AOSS deployment doc is discoverable from `docs/source/deployment/index.md` already, but the knowledge layer setup guide — where customers land first when learning about backends — does not link out to the EKS-specific guide. Add the cross-link. 
+ +- [ ] **Step 1: Add the cross-link in `KNOWLEDGE-LAYER-SETUP.md`** + +Find the AOSS example block (around line 220 of `sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md`, the YAML showing `opensearch_aws_service: aoss`). Immediately after that YAML block, add: + +```markdown +> **Deploying on EKS?** See the +> [Amazon OpenSearch Serverless deployment guide](../../docs/source/deployment/aws-opensearch-serverless.md) +> for the end-to-end EKS Pod Identity setup, AOSS data access policy, Helm values, and +> verification commands. +``` + +- [ ] **Step 2: Validate the reference YAML still loads cleanly** + +The reference YAML config is part of ask #6. Confirm it parses and references the existing +backend identifier: + +```bash +python -c " +import yaml +cfg = yaml.safe_load(open('configs/config_web_opensearch.yml')) +ks = cfg['functions']['knowledge_search'] +assert ks['_type'] == 'knowledge_retrieval' +assert ks['backend'] == 'opensearch' +print('OK') +" +``` + +Expected: prints `OK`. + +- [ ] **Step 3: Commit** + +```bash +git add sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md +git commit -m "docs(knowledge): link OpenSearch backend setup to EKS deployment guide" +``` + +--- + +### Task 12: Final docs build with `-W` (warnings as errors) + +**Files:** none (verification only) + +**Why:** Catches broken cross-references introduced by Task 11's relative link, mermaid syntax errors, MyST admonition typos, and any other Sphinx warnings. This is the equivalent of "run the test suite green" for a docs PR. + +- [ ] **Step 1: Run a strict build** + +```bash +cd docs +make clean +SPHINXOPTS="-W --keep-going -n" make html +``` + +Expected: build exits 0 with no warnings. The `-W` turns warnings into errors; `-n` enables nitpicky mode for cross-references; `--keep-going` reports every warning rather than stopping at the first. + +If warnings appear, fix them in place (typically: bad relative paths, missing TOC entries, malformed mermaid). Re-run until clean. 
+ +- [ ] **Step 2: Open the rendered AOSS page and skim end-to-end** + +Open `docs/_build/html/source/deployment/aws-opensearch-serverless.html` in a browser. Walk +through it in order. Confirm: migration callout, mermaid diagram, prerequisites, AOSS +collection creation, IAM role, AOSS data access policy, Pod Identity association, helm pull +secret, embedding setup, verify steps, troubleshooting, cleanup. The reading flow should be a +straight line from "I have an AWS account" to "I have a working AIQ + AOSS deployment." + +- [ ] **Step 3: Commit any fixes from Step 1 (only if any)** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -m "docs(opensearch): resolve sphinx -W warnings in AOSS deployment guide" +``` + +--- + +## Out of scope (handled in the follow-up plan) + +The follow-up "gaps/risks" plan covers: +1. `_embed_texts` empty-key fallback in `sources/knowledge_layer/src/opensearch/adapter.py:508`. +2. `OpenSearchAwsService` literal coercion edge case in `sources/knowledge_layer/src/register.py:42`. +3. Multimodal extraction parity (or explicit "text-only" callout) for the OpenSearch backend. +4. Committing this entire OpenSearch branch — currently every file is unstaged or untracked, so step zero before any of the above is a clean PR against `develop`. diff --git a/docs/superpowers/plans/2026-05-05-opensearch-gaps-and-risks.md b/docs/superpowers/plans/2026-05-05-opensearch-gaps-and-risks.md new file mode 100644 index 00000000..ab701068 --- /dev/null +++ b/docs/superpowers/plans/2026-05-05-opensearch-gaps-and-risks.md @@ -0,0 +1,468 @@ +# OpenSearch Gaps & Risks Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Close the gaps and risks surfaced by (a) the original PR evaluation and (b) the live-AOSS validation testing, so the `feat/opensearch-aoss` branch is mergeable as a complete v2.0 OpenSearch story. + +**Architecture:** Eight focused tasks. Five are small code or doc fixes for surface-level bugs. One is a research-first task (the `/v1/chat/completions` `conversation_id` silent-drop). Two are PR-readiness tasks (DCO sign-off + final docs build). No new modules; every change lands in files already on the branch. + +**Tech Stack:** Python (adapter + register), Markdown (Sphinx + MyST), bash (verification commands), `git` (rebase --signoff for DCO compliance). + +**Pre-flight context the executing engineer needs:** +- The branch already has 19 commits ahead of `develop`: 6 foundation + 12 from the EKS reference deployment plan + 1 fix for asymmetric NIM embedding models (commit `619228a`). +- Live testing today proved the full stack works against AOSS: local ingest, Dask ingest, file deletion (AOSS-aware bulk-delete), 30-page PDF, retrieval with page citations. +- Three gotchas were observed during testing and are folded into this plan. + +**Items deferred to a future plan:** TTL cleanup live test (needs `AIQ_TTL_CLEANUP_INTERVAL_SECONDS` override and patience), full EKS deploy walkthrough, multi-session concurrency load test. + +**File inventory:** +- `sources/knowledge_layer/src/opensearch/adapter.py` — Tasks 1, 6. +- `sources/knowledge_layer/src/register.py` — Task 5 (potentially). +- `sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md` — Task 2. +- `docs/source/deployment/aws-opensearch-serverless.md` — Tasks 2, 3, 4, 6. +- `tests/knowledge_layer_tests/test_opensearch_adapter.py` — Task 1. +- Branch-wide commit history — Task 7. +- All docs source — Task 8. 
+ +--- + +### Task 1: Fail fast on missing `NVIDIA_API_KEY` for hosted-API ingestion + +**Files:** +- Modify: `sources/knowledge_layer/src/opensearch/adapter.py:508` (ingestor `_embed_texts`) +- Modify: `sources/knowledge_layer/src/opensearch/adapter.py` retriever `_embed_texts` (line ~1342, may have shifted by ~6 lines after the input_type fix) +- Modify: `tests/knowledge_layer_tests/test_opensearch_adapter.py` (add unit test) + +**Why:** Today the adapter calls `OpenAI(api_key=os.environ.get("NVIDIA_API_KEY", ""))` — empty-string fallback. If the env var is unset and the embedding endpoint is the hosted NVIDIA API, the call surfaces as a confusing 401 from `integrate.api.nvidia.com` instead of a clear "missing key" error. Surfaced in the original PR evaluation. NIM-on-EKS users (no key needed) should still work; only the *default* hosted-API path with a missing key should fail loudly. + +- [ ] **Step 1: Write the failing test** + +In `tests/knowledge_layer_tests/test_opensearch_adapter.py`, add: + +```python +def test_ingestor_embed_raises_when_hosted_api_and_missing_key(monkeypatch): + """Hosted NVIDIA API with no key should raise a clear error before HTTP.""" + monkeypatch.delenv("NVIDIA_API_KEY", raising=False) + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + ingestor = OpenSearchIngestor({ + "endpoint": "http://localhost:9200", + "auth_type": "none", + "embed_base_url": "https://integrate.api.nvidia.com/v1", + "start_ttl_cleanup": False, + }) + with pytest.raises(RuntimeError, match="NVIDIA_API_KEY"): + ingestor._embed_texts(["hello world"]) + + +def test_ingestor_embed_allows_local_nim_without_key(monkeypatch): + """Self-hosted NIM with no key should pass through without complaint.""" + monkeypatch.delenv("NVIDIA_API_KEY", raising=False) + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + ingestor = OpenSearchIngestor({ + "endpoint": "http://localhost:9200", + "auth_type": "none", + "embed_base_url": 
"http://nim-embed.ns-nim.svc.cluster.local:8000/v1", + "start_ttl_cleanup": False, + }) + # Patch the OpenAI client so we don't hit the network; the test just + # asserts no early-exit RuntimeError fired. + class _FakeOpenAI: + def __init__(self, base_url, api_key): + self.embeddings = type("E", (), {"create": staticmethod(lambda **kw: type("R", (), {"data": [type("D", (), {"embedding": [0.0] * 4})()]})())})() + monkeypatch.setattr("openai.OpenAI", _FakeOpenAI) + result = ingestor._embed_texts(["hello"]) + assert result == [[0.0, 0.0, 0.0, 0.0]] +``` + +- [ ] **Step 2: Run tests, verify both fail** + +```bash +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_adapter.py::test_ingestor_embed_raises_when_hosted_api_and_missing_key tests/knowledge_layer_tests/test_opensearch_adapter.py::test_ingestor_embed_allows_local_nim_without_key -v +``` + +Expected: both FAIL — the first because no error is raised, the second because the un-patched OpenAI client tries to make a real HTTP call. + +- [ ] **Step 3: Implement the fix in both `_embed_texts` methods** + +Replace the unguarded `os.environ.get("NVIDIA_API_KEY", "")` with a helper that fails on the hosted-API URL pattern only: + +```python +# At module level, near other helpers: +def _resolve_embedding_api_key(embed_base_url: str) -> str: + api_key = os.environ.get("NVIDIA_API_KEY", "") + is_hosted_nvidia = "integrate.api.nvidia.com" in (embed_base_url or "") + if is_hosted_nvidia and not api_key: + raise RuntimeError( + "NVIDIA_API_KEY is required for the hosted NVIDIA embeddings API " + "(embed_base_url contains integrate.api.nvidia.com). Either set " + "NVIDIA_API_KEY or override AIQ_EMBED_BASE_URL to a self-hosted NIM endpoint." 
+ ) + return api_key +``` + +Then in both `_embed_texts` methods: + +```python +client = OpenAI(base_url=self.embed_base_url, api_key=_resolve_embedding_api_key(self.embed_base_url)) +``` + +- [ ] **Step 4: Run tests, verify both pass** + +```bash +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_adapter.py::test_ingestor_embed_raises_when_hosted_api_and_missing_key tests/knowledge_layer_tests/test_opensearch_adapter.py::test_ingestor_embed_allows_local_nim_without_key -v +``` + +Expected: both PASS. + +- [ ] **Step 5: Run the full adapter test suite to confirm no regressions** + +```bash +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_adapter.py -q +``` + +Expected: all green. + +- [ ] **Step 6: Commit with sign-off** + +```bash +git add sources/knowledge_layer/src/opensearch/adapter.py \ + tests/knowledge_layer_tests/test_opensearch_adapter.py +git commit -s -m "fix(opensearch): fail fast when NVIDIA_API_KEY is missing for hosted API" +``` + +--- + +### Task 2: Document the OpenSearch backend as text-only + +**Files:** +- Modify: `sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md` (in the OpenSearch section) +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (one-line callout near the workflow config section) + +**Why:** The LlamaIndex backend supports table/image/chart extraction via `AIQ_EXTRACT_TABLES/IMAGES/CHARTS`. The OpenSearch ingestor currently only does text chunking — `_read_pdf_file` extracts page text, no VLM, no pdfplumber tables. AWS customers reading the doc may assume parity with v1.0 multimodal behavior. Either build it or explicitly call out the gap. This task does the explicit-callout option. + +- [ ] **Step 1: Add a callout note in `KNOWLEDGE-LAYER-SETUP.md`** + +Find the OpenSearch backend section (the one that begins "**OpenSearch (Self-hosted)**" and contains the example YAML). 
Immediately before the AOSS YAML block (the one with `opensearch_aws_service: aoss`), insert: + +```markdown +> **Note: text-only ingestion.** The OpenSearch backend extracts plain text from PDFs, DOCX, and PPTX +> via `pypdf`/`docx2txt`/`python-pptx`. It does **not** currently honor `AIQ_EXTRACT_TABLES`, +> `AIQ_EXTRACT_IMAGES`, or `AIQ_EXTRACT_CHARTS` (those flags are LlamaIndex-only). For multimodal +> ingestion, use the LlamaIndex backend instead, or use Foundational RAG, which +> handles multimodal extraction server-side. +``` + +- [ ] **Step 2: Add the same note in the AOSS deployment doc** + +In `docs/source/deployment/aws-opensearch-serverless.md`, find the `## Workflow Config` section. Immediately after the `Use \`configs/config_web_opensearch.yml\`:` line and before the YAML excerpt, insert: + +````markdown +```{note} +**Text-only ingestion.** The OpenSearch backend extracts plain text from PDFs, DOCX, and PPTX. It does +not currently support table/image/chart extraction (those flags are LlamaIndex-only). For multimodal ingestion, +use the LlamaIndex backend or Foundational RAG. +``` +```` + +- [ ] **Step 3: Render docs and verify** + +```bash +uv run --extra docs sphinx-build -b html docs/source docs/_build/html +``` + +Expected: build succeeds with zero warnings. + +- [ ] **Step 4: Commit with sign-off** + +```bash +git add sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md \ + docs/source/deployment/aws-opensearch-serverless.md +git commit -s -m "docs(opensearch): explicit text-only callout for ingestion path" +``` + +--- + +### Task 3: Fix health-endpoint expected response in the deployment doc + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (the `## Verify the deployment` → `### 2. Backend health check` block) + +**Why:** Live testing showed `/health` returns `{"status":"healthy"}`. Our deployment doc says `Expected: {"status":"ok"}`. Trivial doc inaccuracy, easy fix while we're here.
+ +- [ ] **Step 1: Edit the expected-response line** + +Find this block in `docs/source/deployment/aws-opensearch-serverless.md`: + +```markdown +Expected: `{"status":"ok"}` (or equivalent — match the health route exposed by the deployed +`aiq_api` front end). +``` + +Replace with: + +```markdown +Expected: `{"status":"healthy"}` (the `aiq_api` front end exposes a JSON health route at `/health`). +``` + +- [ ] **Step 2: Render docs** + +```bash +uv run --extra docs sphinx-build -b html docs/source docs/_build/html +``` + +- [ ] **Step 3: Commit with sign-off** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -s -m "docs(opensearch): correct health endpoint expected response shape" +``` + +--- + +### Task 4: Add an AOSS visibility-delay note in the Verify section + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (the `## Verify the deployment` section) + +**Why:** During PDF testing, the `_count` against AOSS returned 0 immediately after a successful bulk write — even though the AIQ status said `completed: true, chunks_created: 30`. After ~10 seconds the count caught up. This is documented AOSS eventual-consistency behavior, but customers running the verification script will hit it the first time and assume something is broken. A short callout in the verify section preempts the support ticket. + +- [ ] **Step 1: Add the visibility note in step 4** + +Find `### 4. Confirm the index appears in AOSS` in `docs/source/deployment/aws-opensearch-serverless.md`. Immediately after the existing prose ending "and the `smoke` collection listed by the AIQ API.", insert: + +```markdown +```{note} +**AOSS visibility delay.** AOSS is eventually consistent for search after writes. A `_count` immediately +after a successful upload may report `0` for ~5–30 seconds before catching up. If the AIQ status says +`completed` but the AOSS console index browser shows zero docs, wait 30s and refresh — the index will +populate. 
This is also why the live-test suite includes a polling visibility wait. +``` +``` + +- [ ] **Step 2: Render docs** + +```bash +uv run --extra docs sphinx-build -b html docs/source docs/_build/html +``` + +- [ ] **Step 3: Commit with sign-off** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -s -m "docs(opensearch): note AOSS visibility delay in verification steps" +``` + +--- + +### Task 5: Triage the `conversation_id` silent-drop on `/v1/chat/completions` + +**Files:** Triage-first; possibly modify `aiq_api` chat-completions route or the workflow config docs depending on findings. + +**Why:** During the PDF retrieval test, sending `{"conversation_id": "papers", ...}` in the chat-completions request body had no effect — the agent always used the YAML default (`COLLECTION_NAME=smoke`). Customers reading the OpenAPI schema will assume `conversation_id` controls collection routing; today it doesn't. Either (a) honor it in the route handler or (b) remove it from the request schema. This task starts with research because the fix path depends on what the field was originally intended for. + +- [ ] **Step 1: Locate the chat-completions request handler** + +```bash +grep -rn 'chat/completions\|conversation_id' src/aiq_api 2>/dev/null || \ +grep -rn 'chat/completions\|conversation_id' .venv/lib/python3.13/site-packages/aiq_api 2>/dev/null +``` + +Find the route registering `POST /v1/chat/completions`. Read it, identify whether `conversation_id` is present in the Pydantic request model and where (or if) it is ever read. + +- [ ] **Step 2: Locate where `Context.conversation_id` is set** + +```bash +grep -rn 'Context.*conversation_id\|set.*conversation_id\|context.*conversation' src/ .venv/lib/python3.13/site-packages/nat 2>/dev/null | head -20 +``` + +Determine the actual mechanism the UI uses (likely a header, cookie, or different route).
+ +- [ ] **Step 3: Decide and document the fix** + +Two acceptable outcomes — pick based on what Steps 1–2 reveal: + +**Outcome A — honor the field.** If `conversation_id` is meant to flow to `Context.conversation_id`, add the wiring in the route handler: + +```python +# Pseudocode — actual path TBD by Step 1 +async def chat_completions(req: ChatCompletionsRequest, ...): + if req.conversation_id: + context.conversation_id = req.conversation_id + # ... existing handler +``` + +Add a unit test if the route has a test harness. + +**Outcome B — remove it from the schema.** If `conversation_id` is vestigial in this route (UI uses a header/cookie/different route), remove the field from the request model so the OpenAPI schema no longer advertises it: + +```python +class ChatCompletionsRequest(BaseModel): + messages: list[Message] + stream: bool = False + # conversation_id removed — see <doc section> for session control via <header/cookie/route>
+``` + +Document the actual session-control mechanism in `KNOWLEDGE-LAYER-SETUP.md` so customers know where to set it. + +- [ ] **Step 4: Run any affected tests** + +```bash +uv run python -m pytest tests/ -q -k chat_completions 2>&1 | tail -20 +``` + +Expected: green. + +- [ ] **Step 5: Commit with sign-off** + +```bash +git add +git commit -s -m "fix(api): honor conversation_id on /v1/chat/completions" +# OR +git commit -s -m "fix(api): drop unused conversation_id from /v1/chat/completions schema" +``` + +If Step 1 reveals this is outside the OpenSearch story (e.g., a long-standing `aiq_api` issue unrelated to v2.0 OpenSearch), STOP and report — it may belong in a separate PR rather than `feat/opensearch-aoss`. + +--- + +### Task 6: Document the Dask-worker logging gotcha + +**Files:** +- Modify: `docs/source/deployment/aws-opensearch-serverless.md` (the `## Architecture` or `## Troubleshooting` section) + +**Why:** During Dask-mode testing, the dask-worker subprocess produced an empty stdout file even though it successfully wrote 3 docs to AOSS — the `DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=warning` setting in `deploy/.env` cascaded into the worker's Python logging config. In EKS this is invisible because pod stdout is separately captured; locally during testing, it makes "did the worker actually do anything?" hard to confirm. A doc note saves the next person 30 minutes. + +- [ ] **Step 1: Add a troubleshooting row** + +Find the `## Troubleshooting` table in `docs/source/deployment/aws-opensearch-serverless.md`. Add a new row before the table closes: + +```markdown +| Dask worker stdout is empty during local testing | `DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=warning` (default in `deploy/.env`) silences worker logs. Ingestion still succeeds — verify by counting docs in AOSS, not by tailing the worker. | Override locally with `DASK_DISTRIBUTED__LOGGING__DISTRIBUTED=info` if you need worker logs during development. 
| +``` + +- [ ] **Step 2: Render docs** + +```bash +uv run --extra docs sphinx-build -b html docs/source docs/_build/html +``` + +- [ ] **Step 3: Commit with sign-off** + +```bash +git add docs/source/deployment/aws-opensearch-serverless.md +git commit -s -m "docs(opensearch): note Dask worker logging is silenced by default env" +``` + +--- + +### Task 7: DCO sign-off rebase across the branch + +**Files:** All commits on `feat/opensearch-aoss` not yet signed (everything except commits made *during* this plan, which use `git commit -s` from Task 1 onward). + +**Why:** CONTRIBUTING.md requires a `Signed-off-by:` trailer on every commit. The 19 commits before this plan's first sign-off don't have it. `git rebase --signoff` adds the trailer to every commit between develop and HEAD; commits already signed are left alone (idempotent). + +- [ ] **Step 1: Verify which commits lack the trailer** + +```bash +git log develop..HEAD --format='%h %s' | while read sha _; do + if [ -z "$(git show -s --format='%(trailers:key=Signed-off-by,valueonly)' "$sha")" ]; then + echo "MISSING: $sha" + fi +done +``` + +Note the count and SHAs. + +- [ ] **Step 2: Rebase with sign-off** + +```bash +git rebase --signoff develop +``` + +This rewrites every commit's SHA but adds `Signed-off-by:` to those missing it. + +- [ ] **Step 3: Verify all commits are signed** + +Re-run Step 1's verification loop. Expected: no `MISSING:` lines. + +- [ ] **Step 4: Spot-check a representative commit** + +```bash +git log -1 --format='%B' HEAD~5 +``` + +Expected: trailer block ends with both `Co-Authored-By: ...` and `Signed-off-by: Felipe Garcia <email>`. + +No commit needed — the rebase already wrote the changes. + +--- + +### Task 8: Final docs build + adapter test suite as a quality gate + +**Files:** none (verification only). + +**Why:** Validates that all code edits in Tasks 1 and 5 didn't break anything, and that all doc edits in Tasks 2, 3, 4, and 6 still build clean under `-W -n`.
Equivalent of a final CI check before declaring the branch PR-ready. + +- [ ] **Step 1: Strict docs build** + +```bash +rm -rf docs/_build +uv run --extra docs sphinx-build -W --keep-going -n -b html docs/source docs/_build/html +``` + +Expected: exit 0, zero warnings. + +- [ ] **Step 2: Adapter unit tests** + +```bash +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_adapter.py -q +``` + +Expected: all green. + +- [ ] **Step 3: Reference YAML still parses** + +```bash +uv run python -c " +import yaml +cfg = yaml.safe_load(open('configs/config_web_opensearch.yml')) +ks = cfg['functions']['knowledge_search'] +assert ks['_type'] == 'knowledge_retrieval' and ks['backend'] == 'opensearch' +print('OK') +" +``` + +Expected: prints `OK`. + +- [ ] **Step 4: Helm example values still parses** + +```bash +uv run python -c "import yaml; yaml.safe_load(open('deploy/helm/examples/aws-opensearch-serverless-values.yaml'))" +``` + +Expected: no output. + +- [ ] **Step 5: Branch summary** + +```bash +git log --oneline develop..HEAD +echo "---" +git diff --shortstat develop..HEAD +``` + +Expected: a clean ordered list of commits, every one with a sign-off (verified in Task 7); the branch is then ready to push to a fork and open a PR per CONTRIBUTING.md. + +No commit needed — this is the final gate. + +--- + +## Out of scope (folded into a future plan) + +1. **Live TTL cleanup test.** Requires `AIQ_TTL_CLEANUP_INTERVAL_SECONDS` override and ~hour wall-clock or test-time injection. Worth a dedicated test fixture but not blocking PR mergeability. +2. **Full EKS deploy walkthrough.** The reference deployment doc is complete; the actual EKS-cluster-up validation is a separate effort the AWS team can drive once the PR lands. +3. **Multi-session concurrency / load.** AIQ's session-bound collection model creates many indexes in parallel under load. Worth a soak test before scaling beyond ~10 concurrent users on a single AOSS collection. Not a v2.0-launch blocker. +4.
**Multimodal extraction parity.** Task 2 documents the gap. Closing it (adding `pdfplumber` tables, VLM image captions to OpenSearch ingestion the way LlamaIndex does it) is a feature-sized effort. +5. **Better Pydantic error for `OpenSearchAwsService`.** The current Pydantic Literal validation message is acceptable; not worth a custom error. +6. **Repo fork + push.** Pre-PR mechanical step — covered conversationally; no plan task needed. diff --git a/pyproject.toml b/pyproject.toml index 14cd90cd..d72ee982 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,11 @@ aiq_data_source_registry = "aiq_agent.common.data_source_registry" testpaths = ["tests", "sources/**/tests"] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "session" +markers = [ + "aws: tests that require AWS resources", + "integration: tests that require external services", + "opensearch_serverless: tests that require Amazon OpenSearch Serverless", +] [tool.ruff] diff --git a/sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md b/sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md index 05285a67..8946f003 100644 --- a/sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md +++ b/sources/knowledge_layer/KNOWLEDGE-LAYER-SETUP.md @@ -9,7 +9,7 @@ A pluggable abstraction for document ingestion and retrieval. Swap backends with - **Collection Management** - create/delete/list collections per session or use case - **File Management** - upload/delete/list files with status tracking (UPLOADING → INGESTING → SUCCESS/FAILED) - **Content Typing** - TEXT, TABLE, CHART, IMAGE enums for frontend rendering -- **Backend Agnostic** - Swap between local (LlamaIndex) and hosted (RAG Blueprint) without core agent code changes +- **Backend Agnostic** - Swap between local (LlamaIndex), OpenSearch, and hosted RAG Blueprint without core agent code changes --- @@ -34,6 +34,7 @@ A pluggable abstraction for document ingestion and retrieval. 
Swap backends with | Backend | Config Name | Mode | Vector Store | Best For | |---------|-------------|------|--------------|----------| | `llamaindex` | `"llamaindex"` | Local Library | ChromaDB | Dev, prototyping, macOS/Linux | +| `opensearch` | `"opensearch"` | Direct Client | OpenSearch k-NN | Self-hosted OpenSearch, Amazon OpenSearch Serverless | | `foundational_rag` | `"foundational_rag"` | Hosted Service | Remote Milvus | Production, multi-user | **Local Library Mode** - Everything runs in your Python process. No external services needed. @@ -43,6 +44,10 @@ A pluggable abstraction for document ingestion and retrieval. Swap backends with - **`foundational_rag`** - Connects to [NVIDIA RAG Blueprint](https://github.com/NVIDIA-AI-Blueprints/rag) via HTTP. - [Deployment Guide](https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/docs/deploy-docker-self-hosted.md) +**OpenSearch Mode** - Stores AIQ collections directly in OpenSearch vector indexes. +- **`opensearch`** - Uses one OpenSearch index per AIQ collection/session. Supports unauthenticated local clusters, + basic auth, and SigV4 for Amazon OpenSearch Service or Amazon OpenSearch Serverless. + --- ## Quick Start @@ -58,6 +63,7 @@ export NVIDIA_API_KEY=nvapi-your-key-here # 2. Install backend (choose one) uv pip install -e "sources/knowledge_layer[llamaindex]" # Recommended for local dev - works on macOS/Linux uv pip install -e "sources/knowledge_layer[foundational_rag]" # Requires deployed server +uv pip install -e "sources/knowledge_layer[opensearch]" # Requires OpenSearch/OpenSearch Serverless ``` > **New to Knowledge Layer?** Start with `llamaindex` - it requires no external services and works on macOS and Linux. 
@@ -89,6 +95,8 @@ functions: rag_url: http://localhost:8081/v1 # foundational_rag only ingest_url: http://localhost:8082/v1 # foundational_rag only timeout: 120 # foundational_rag only + opensearch_url: http://localhost:9200 # opensearch only + opensearch_auth_type: none # opensearch only: none, basic, sigv4 ``` You can also use environment variable substitution in YAML for sensitive values: @@ -174,6 +182,144 @@ functions: > **Separate Docker stacks:** When AI-Q and RAG run as separate Docker Compose stacks, connect the AI-Q backend to the RAG network: `docker network connect nvidia-rag aiq-agent`. See the [Docker Compose README](../../deploy/compose/README.md#networking-when-aiq-and-rag-run-as-separate-compose-stacks) for details. +**OpenSearch (Self-hosted)** +```yaml +functions: + knowledge_search: + _type: knowledge_retrieval + backend: opensearch + collection_name: my_docs + top_k: 5 + opensearch_url: http://localhost:9200 + opensearch_auth_type: none + opensearch_index_prefix: aiq + opensearch_embedding_dim: 2048 + embed_model: nvidia/llama-nemotron-embed-vl-1b-v2 + embed_base_url: https://integrate.api.nvidia.com/v1 +``` + +For self-hosted clusters with basic auth: + +```yaml +functions: + knowledge_search: + _type: knowledge_retrieval + backend: opensearch + collection_name: my_docs + opensearch_url: https://opensearch.example.com:9200 + opensearch_auth_type: basic + opensearch_username: ${OPENSEARCH_USERNAME} + opensearch_password: ${OPENSEARCH_PASSWORD} + opensearch_verify_certs: true +``` + +For Amazon OpenSearch Serverless, use SigV4 with service `aoss`. For Amazon OpenSearch Service domains, use +service `es`. + +> **Note: text-only ingestion.** The OpenSearch backend extracts plain text from PDFs, DOCX, and PPTX +> via `pypdf`/`docx2txt`/`python-pptx`. It does **not** currently honor `AIQ_EXTRACT_TABLES`, +> `AIQ_EXTRACT_IMAGES`, or `AIQ_EXTRACT_CHARTS` (those flags are LlamaIndex-only). 
For multimodal +> ingestion, use the LlamaIndex backend instead, or use Foundational RAG, which +> handles multimodal extraction server-side. + +```yaml +functions: + knowledge_search: + _type: knowledge_retrieval + backend: opensearch + collection_name: my_docs + opensearch_url: https://abc123.us-west-2.aoss.amazonaws.com + opensearch_auth_type: sigv4 + opensearch_aws_region: us-west-2 + opensearch_aws_service: aoss + opensearch_index_prefix: aiq + opensearch_ingestion_mode: auto + opensearch_dask_file_transfer: bytes +``` + +> **Deploying on EKS?** See the +> [Amazon OpenSearch Serverless deployment guide](../../docs/source/deployment/aws-opensearch-serverless.md) +> for the end-to-end EKS Pod Identity setup, AOSS data access policy, Helm values, and +> verification commands. + +OpenSearch creates one physical index per collection using `<index_prefix>-<collection_name>`, sanitized +for OpenSearch index naming rules. The adapter stores collection metadata in mapping `_meta` and stores each text chunk +as one OpenSearch document with a `knn_vector` field. + +For session-isolated web uploads, AI-Q uses the conversation/session collection name, such as `s_<session_id>`. The OpenSearch +adapter maps that session collection to a dynamic index in the same OpenSearch endpoint, for example +`aiq-s_<session_id>`. The TTL cleanup task removes expired OpenSearch indexes based on their collection `_meta.updated_at` +timestamp. + +OpenSearch ingestion runs locally by default. Set `opensearch_ingestion_mode: auto` or `OPENSEARCH_INGESTION_MODE=auto` +to use Dask when `NAT_DASK_SCHEDULER_ADDRESS` is configured, falling back to local ingestion when it is not. Set +`opensearch_ingestion_mode: dask` to require Dask. In Dask mode, each worker constructs its own OpenSearch client, so +AWS SigV4 credentials are resolved in the worker environment. This supports EKS Pod Identity, SSO-backed local workers, +and standard AWS SDK environment/profile credentials.
`opensearch_dask_file_transfer: bytes` sends uploaded file +contents to workers and works without a shared volume; `paths` requires API and worker pods to share the same file path. + +#### Live OpenSearch Integration Tests + +Live tests are opt-in because they create and delete real OpenSearch indexes. They patch embeddings with deterministic +local vectors, so the tests validate OpenSearch indexing/search behavior without requiring `NVIDIA_API_KEY`. + +For an unauthenticated local OpenSearch cluster: + +```bash +AIQ_OPENSEARCH_LIVE_TESTS=1 \ +OPENSEARCH_URL=http://localhost:9200 \ +OPENSEARCH_AUTH_TYPE=none \ +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_live.py +``` + +For a self-hosted cluster with basic auth: + +```bash +AIQ_OPENSEARCH_LIVE_TESTS=1 \ +OPENSEARCH_URL=https://opensearch.example.com:9200 \ +OPENSEARCH_AUTH_TYPE=basic \ +OPENSEARCH_USERNAME=admin \ +OPENSEARCH_PASSWORD=admin \ +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_live.py +``` + +For Amazon OpenSearch Serverless: + +```bash +AIQ_OPENSEARCH_LIVE_TESTS=1 \ +OPENSEARCH_URL=https://abc123.us-west-2.aoss.amazonaws.com \ +OPENSEARCH_AUTH_TYPE=sigv4 \ +OPENSEARCH_AWS_SERVICE=aoss \ +AWS_REGION=us-west-2 \ +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_live.py +``` + +For Amazon OpenSearch Service domains, use `OPENSEARCH_AWS_SERVICE=es`. If you use a development cluster with +self-signed certificates, set `OPENSEARCH_VERIFY_CERTS=false`. + +A dedicated Amazon OpenSearch Serverless suite is also available. 
It always uses SigV4 service `aoss` and expects an +AOSS data endpoint: + +```bash +AIQ_OPENSEARCH_SERVERLESS_LIVE_TESTS=1 \ +OPENSEARCH_URL=https://abc123.us-west-2.aoss.amazonaws.com \ +AWS_REGION=us-west-2 \ +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_serverless_live.py +``` + +If you set the variables on separate lines, export them first: + +```bash +export AIQ_OPENSEARCH_SERVERLESS_LIVE_TESTS=1 +export OPENSEARCH_URL=https://abc123.us-west-2.aoss.amazonaws.com +export AWS_REGION=us-west-2 +uv run python -m pytest tests/knowledge_layer_tests/test_opensearch_serverless_live.py +``` + +This suite validates SigV4 health checks, collection lifecycle, vector ingestion, k-NN retrieval, filtered k-NN +retrieval, and file deletion against OpenSearch Serverless. The AWS principal must have data access permissions for +index creation/deletion and document read/write operations on the target collection. + ### Programmatic Usage ```python @@ -253,6 +399,41 @@ For more details, see the [Docker Compose README](../../deploy/compose/README.md Both LlamaIndex and Foundational RAG support session-based collections (`s_`) created by the UI. Each browser session gets its own isolated collection. +#### How collection routing works + +When a request arrives the `knowledge_search` tool reads `Context.conversation_id` and uses it as the +collection name, falling back to the static `collection_name` from YAML config when the context value +is absent. + +`Context.conversation_id` is populated by the `nat` framework **from the `conversation-id` HTTP +request header** (see `SessionManager.set_metadata_from_http_request` in the `nat` package). +The AI-Q UI sets this header automatically for every WebSocket and HTTP request it sends, which is why +session-isolated uploads work seamlessly through the UI. 
+ +**Known limitation — `/v1/chat/completions` JSON body field is ignored.** +The OpenAI-compatible `POST /v1/chat/completions` endpoint uses `nat.data_models.api_server.ChatRequest` +as its request body model. `ChatRequest` does **not** declare a `conversation_id` field; the model uses +`extra="allow"`, so any `conversation_id` key in the JSON body is silently accepted and then discarded. +The framework never reads it back into the context. + +Consequence: callers that send `{"messages": [...], "conversation_id": "my-collection"}` in the body +will have that value silently dropped, and the tool will fall back to the configured `collection_name` +default instead of routing to `my-collection`. + +**Workaround (until upstream `nat` is patched):** pass the collection name as the +`conversation-id` HTTP header instead of a JSON body field: + +```bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "conversation-id: my-collection" \ + -d '{"messages": [{"role": "user", "content": "..."}], "stream": false}' +``` + +This is tracked as a known gap. The permanent fix requires adding `conversation_id` to `ChatRequest` +and wiring it into `Context.conversation_id` inside the `nat` framework — a change that belongs in the +upstream `nat` / `aiq_api` repository, not in this repo. + ### TTL Cleanup Collections inactive for 24 hours are auto-deleted based on `last_indexed` timestamp. Background thread runs hourly. 
@@ -853,6 +1034,19 @@ Configuration values are resolved in the following order (highest to lowest prio | `AIQ_SUMMARY_DB` | All | Summary database URL (SQLite or PostgreSQL) | | `RAG_SERVER_URL` | foundational_rag | Query server URL (port 8081) | | `RAG_INGEST_URL` | foundational_rag | Ingestion server URL (port 8082) | +| `OPENSEARCH_URL` | opensearch | OpenSearch endpoint URL | +| `OPENSEARCH_AUTH_TYPE` | opensearch | Auth mode: `none`, `basic`, or `sigv4` | +| `OPENSEARCH_USERNAME` | opensearch | Username for basic auth | +| `OPENSEARCH_PASSWORD` | opensearch | Password for basic auth | +| `AWS_REGION` / `AWS_DEFAULT_REGION` | opensearch | AWS region for SigV4 auth | +| `OPENSEARCH_AWS_SERVICE` | opensearch | SigV4 service: `aoss` or `es` | +| `OPENSEARCH_INDEX_PREFIX` | opensearch | Prefix for physical OpenSearch indexes | +| `OPENSEARCH_CA_CERTS` | opensearch | Optional custom CA bundle path | +| `OPENSEARCH_INGESTION_MODE` | opensearch | Ingestion execution: `local`, `dask`, or `auto` | +| `OPENSEARCH_DASK_SCHEDULER_ADDRESS` | opensearch | Dask scheduler for distributed ingestion; falls back to `NAT_DASK_SCHEDULER_ADDRESS` | +| `OPENSEARCH_DASK_FILE_TRANSFER` | opensearch | Dask file transfer mode: `bytes` or `paths` | +| `OPENSEARCH_ALLOW_DOCUMENT_IDS` | opensearch | Override explicit document ID behavior; defaults off for AOSS | +| `OPENSEARCH_BULK_REFRESH` | opensearch | Override bulk refresh behavior; defaults off for AOSS | | `COLLECTION_NAME` | All | Default collection name | --- @@ -866,6 +1060,10 @@ Configuration values are resolved in the following order (highest to lowest prio | Empty retrieval results | Collection empty | Run ingestion first, verify collection name matches | | Job status 404 | Different process/instance | Factory uses singletons - ensure same process | | `milvus-lite` required | Missing dependency | `uv pip install "pymilvus[milvus_lite]"` | +| OpenSearch SigV4 auth fails | Missing AWS credentials or wrong service | Configure AWS 
credentials and use `aoss` for Serverless or `es` for managed domains | +| OpenSearch SSO works in AWS CLI but fails in tests | Expired `AWS_ACCESS_KEY_ID`/`AWS_SESSION_TOKEN` environment variables override `AWS_PROFILE` | `unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN AWS_CREDENTIAL_EXPIRATION`, then run `aws sso login --profile ` | +| OpenSearch mapping dimension error | Embedding dimension does not match index mapping | Set `opensearch_embedding_dim` to the selected embedding model dimension before creating the collection | +| AOSS returns 403 | IAM role or data access policy is incomplete | Grant the pod/user IAM role `aoss:APIAccessAll` and an AOSS data access policy covering `index//*` | | Backend registered twice | Module imported multiple times | Normal - factory logs warning but works fine | ### Debug Registration diff --git a/sources/knowledge_layer/README.md b/sources/knowledge_layer/README.md index df488bdd..53dd9b25 100644 --- a/sources/knowledge_layer/README.md +++ b/sources/knowledge_layer/README.md @@ -12,6 +12,9 @@ uv pip install -e "sources/knowledge_layer[llamaindex]" # With Foundational RAG (hosted production) uv pip install -e "sources/knowledge_layer[foundational_rag]" + +# With OpenSearch (self-hosted or Amazon OpenSearch) +uv pip install -e "sources/knowledge_layer[opensearch]" ``` ## Available Backends @@ -19,6 +22,7 @@ uv pip install -e "sources/knowledge_layer[foundational_rag]" | Backend | Vector Store | Best For | |---------|-------------|----------| | `llamaindex` | ChromaDB | Development, prototyping | +| `opensearch` | OpenSearch k-NN | Self-hosted OpenSearch, Amazon OpenSearch Serverless | | `foundational_rag` | Remote Milvus | Production, multi-user | ## Usage diff --git a/sources/knowledge_layer/pyproject.toml b/sources/knowledge_layer/pyproject.toml index 95f8f6a4..37db8435 100644 --- a/sources/knowledge_layer/pyproject.toml +++ b/sources/knowledge_layer/pyproject.toml @@ -18,7 +18,7 @@ build-backend = 
"setuptools.build_meta" requires = ["setuptools >= 64", "setuptools-scm>=8"] [tool.setuptools] -packages = ["knowledge_layer", "knowledge_layer.llamaindex", "knowledge_layer.foundational_rag"] +packages = ["knowledge_layer", "knowledge_layer.llamaindex", "knowledge_layer.foundational_rag", "knowledge_layer.opensearch"] package-dir = {"knowledge_layer" = "src"} [project] @@ -52,8 +52,16 @@ foundational_rag = [ "docx2txt>=0.8", "python-pptx>=0.6.21", ] +opensearch = [ + "opensearch-py>=2.4.0", + "boto3>=1.28.0", + "openai>=1.0.0", + "pypdf>=4.0.0", + "docx2txt>=0.8", + "python-pptx>=0.6.21", +] all = [ - "knowledge-layer[llamaindex,foundational_rag]", + "knowledge-layer[llamaindex,foundational_rag,opensearch]", ] [project.entry-points."nat.plugins"] diff --git a/sources/knowledge_layer/src/__init__.py b/sources/knowledge_layer/src/__init__.py index ec6b6819..1ea7ed45 100644 --- a/sources/knowledge_layer/src/__init__.py +++ b/sources/knowledge_layer/src/__init__.py @@ -22,6 +22,7 @@ Available Backends: - llamaindex: LlamaIndex + ChromaDB (lightweight, local) - foundational_rag: Hosted NVIDIA RAG Blueprint (production, multi-user) +- opensearch: OpenSearch vector search for self-hosted clusters and Amazon OpenSearch Serverless Note: NAT tool registrations require NAT to be installed. The adapter modules can be used standalone without NAT. diff --git a/sources/knowledge_layer/src/opensearch/__init__.py b/sources/knowledge_layer/src/opensearch/__init__.py new file mode 100644 index 00000000..7c906183 --- /dev/null +++ b/sources/knowledge_layer/src/opensearch/__init__.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +OpenSearch Knowledge Adapter. + +Supports self-hosted OpenSearch and Amazon OpenSearch / OpenSearch Serverless +for vector search using the Knowledge Layer adapter interfaces. +""" + +from .adapter import OpenSearchIngestor +from .adapter import OpenSearchRetriever + +__all__ = ["OpenSearchIngestor", "OpenSearchRetriever"] diff --git a/sources/knowledge_layer/src/opensearch/adapter.py b/sources/knowledge_layer/src/opensearch/adapter.py new file mode 100644 index 00000000..dcde7eb7 --- /dev/null +++ b/sources/knowledge_layer/src/opensearch/adapter.py @@ -0,0 +1,1553 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +OpenSearch adapter for the Knowledge Layer. 
+ +This backend stores one OpenSearch vector index per AIQ collection/session and +supports three authentication modes: +- none: self-hosted development clusters without authentication +- basic: self-hosted clusters with username/password +- sigv4: Amazon OpenSearch Service and Amazon OpenSearch Serverless +""" + +from __future__ import annotations + +import logging +import os +import re +import threading +import time +import uuid +from datetime import UTC +from datetime import datetime +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +from aiq_agent.knowledge.base import BaseIngestor +from aiq_agent.knowledge.base import BaseRetriever +from aiq_agent.knowledge.base import TTLCleanupMixin +from aiq_agent.knowledge.factory import register_ingestor +from aiq_agent.knowledge.factory import register_retriever +from aiq_agent.knowledge.schema import Chunk +from aiq_agent.knowledge.schema import CollectionInfo +from aiq_agent.knowledge.schema import ContentType +from aiq_agent.knowledge.schema import FileInfo +from aiq_agent.knowledge.schema import FileProgress +from aiq_agent.knowledge.schema import FileStatus +from aiq_agent.knowledge.schema import IngestionJobStatus +from aiq_agent.knowledge.schema import JobState +from aiq_agent.knowledge.schema import RetrievalResult + +logger = logging.getLogger(__name__) + +# @environment_variable OPENSEARCH_URL +# @category Knowledge Layer +# @type str +# @default http://localhost:9200 +# @required false +# Base URL for self-hosted OpenSearch, Amazon OpenSearch, or OpenSearch Serverless. +DEFAULT_ENDPOINT = os.environ.get("OPENSEARCH_URL", "http://localhost:9200") + +# @environment_variable OPENSEARCH_AUTH_TYPE +# @category Knowledge Layer +# @type str +# @default none +# @required false +# Auth mode for OpenSearch: none, basic, or sigv4. 
+DEFAULT_AUTH_TYPE = os.environ.get("OPENSEARCH_AUTH_TYPE", "none") + +DEFAULT_INDEX_PREFIX = os.environ.get("OPENSEARCH_INDEX_PREFIX", "aiq") +DEFAULT_AWS_REGION = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION", "us-east-1") +DEFAULT_AWS_SERVICE = os.environ.get("OPENSEARCH_AWS_SERVICE", "aoss") +DEFAULT_EMBED_MODEL = os.environ.get("AIQ_EMBED_MODEL", "nvidia/llama-nemotron-embed-vl-1b-v2") +DEFAULT_EMBED_BASE_URL = os.environ.get("AIQ_EMBED_BASE_URL", "https://integrate.api.nvidia.com/v1") +DEFAULT_VECTOR_FIELD = os.environ.get("OPENSEARCH_VECTOR_FIELD", "embedding") +DEFAULT_TEXT_FIELD = os.environ.get("OPENSEARCH_TEXT_FIELD", "content") +DEFAULT_EMBEDDING_DIM = int(os.environ.get("OPENSEARCH_EMBEDDING_DIM", "2048")) +DEFAULT_TIMEOUT = int(os.environ.get("OPENSEARCH_TIMEOUT", "120")) +DEFAULT_CHUNK_SIZE = int(os.environ.get("OPENSEARCH_CHUNK_SIZE", "1024")) +DEFAULT_CHUNK_OVERLAP = int(os.environ.get("OPENSEARCH_CHUNK_OVERLAP", "128")) +DEFAULT_INGESTION_MODE = os.environ.get("OPENSEARCH_INGESTION_MODE", "local") +DEFAULT_DASK_SCHEDULER_ADDRESS = os.environ.get("OPENSEARCH_DASK_SCHEDULER_ADDRESS") or os.environ.get( + "NAT_DASK_SCHEDULER_ADDRESS" +) +DEFAULT_DASK_FILE_TRANSFER = os.environ.get("OPENSEARCH_DASK_FILE_TRANSFER", "bytes") +DEFAULT_AOSS_DELETE_MAX_BATCHES = int(os.environ.get("OPENSEARCH_AOSS_DELETE_MAX_BATCHES", "100")) +DEFAULT_AOSS_DELETE_BACKOFF_SECONDS = float(os.environ.get("OPENSEARCH_AOSS_DELETE_BACKOFF_SECONDS", "0.25")) + +# Collection TTL settings, aligned with the other knowledge backends. 
+COLLECTION_TTL_HOURS = float(os.environ.get("AIQ_COLLECTION_TTL_HOURS", "24")) +TTL_CLEANUP_INTERVAL_SECONDS = int(os.environ.get("AIQ_TTL_CLEANUP_INTERVAL_SECONDS", "3600")) + +SUMMARY_MAX_INPUT_CHARS = 4000 +DEFAULT_BULK_BATCH_SIZE = 100 +DEFAULT_EMBEDDING_BATCH_SIZE = 16 +SUPPORTED_TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".json", ".yaml", ".yml", ".log"} + + +def _utc_now() -> datetime: + return datetime.now(tz=UTC) + + +def _sanitize_index_part(value: str, fallback: str = "default") -> str: + """Convert a collection/prefix value into an OpenSearch-safe index name part.""" + normalized = re.sub(r"[^a-z0-9._-]+", "-", value.lower()).strip(".-_") + if not normalized: + normalized = fallback + if normalized[0] in ("-", "_", "+"): + normalized = f"x-{normalized.lstrip('-_+')}" + return normalized + + +def _trim_index_name(index_name: str) -> str: + """Ensure the physical index name stays within OpenSearch's length limit.""" + if len(index_name) <= 255: + return index_name + suffix = uuid.uuid5(uuid.NAMESPACE_URL, index_name).hex[:12] + return f"{index_name[:242]}-{suffix}" + + +def _normalize_endpoint(endpoint: str, default_scheme: str) -> str: + endpoint = str(endpoint).strip() + if "://" not in endpoint: + endpoint = f"{default_scheme}://{endpoint}" + return endpoint.rstrip("/") + + +def _score_to_similarity(score: Any) -> float: + """Normalize backend scores into the universal [0, 1] score contract.""" + try: + value = float(score) + except (TypeError, ValueError): + return 0.0 + return max(0.0, min(1.0, value)) + + +def _parse_timestamp(value: Any) -> datetime | None: + if not value: + return None + if isinstance(value, datetime): + return value + if isinstance(value, str): + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError: + return None + return None + + +def _generate_document_summary(text_content: str, file_name: str, llm=None) -> str | None: + """Generate a one-sentence document summary using the configured LangChain 
LLM.""" + if llm is None or not text_content.strip(): + return None + + prompt = ( + "Summarize this uploaded document in one concise sentence for a research assistant. " + "Focus on the document's topic and likely usefulness.\n\n" + f"Document: {file_name}\n\n" + f"Content excerpt:\n{text_content[:SUMMARY_MAX_INPUT_CHARS]}" + ) + + try: + response = llm.invoke(prompt) + summary = getattr(response, "content", response) + summary_text = str(summary).strip() + return summary_text[:500] if summary_text else None + except Exception as e: + logger.warning("Summary generation failed for %s: %s", file_name, e) + return None + + +def _read_text_file(file_path: Path) -> list[tuple[str, int | None, dict[str, Any]]]: + content = file_path.read_text(encoding="utf-8", errors="ignore") + return [(content, None, {"file_type": file_path.suffix.lower().lstrip(".") or "text"})] + + +def _read_pdf_file(file_path: Path) -> list[tuple[str, int | None, dict[str, Any]]]: + try: + from pypdf import PdfReader + except ImportError as e: + raise RuntimeError("PDF ingestion for OpenSearch requires pypdf. Install knowledge-layer[opensearch].") from e + + reader = PdfReader(str(file_path)) + pages: list[tuple[str, int | None, dict[str, Any]]] = [] + for idx, page in enumerate(reader.pages, start=1): + text = page.extract_text() or "" + if text.strip(): + pages.append((text, idx, {"file_type": "pdf"})) + return pages + + +def _read_docx_file(file_path: Path) -> list[tuple[str, int | None, dict[str, Any]]]: + try: + import docx2txt + except ImportError as e: + raise RuntimeError( + "DOCX ingestion for OpenSearch requires docx2txt. Install knowledge-layer[opensearch]." + ) from e + + return [(docx2txt.process(str(file_path)) or "", None, {"file_type": "docx"})] + + +def _read_pptx_file(file_path: Path) -> list[tuple[str, int | None, dict[str, Any]]]: + try: + from pptx import Presentation + except ImportError as e: + raise RuntimeError( + "PPTX ingestion for OpenSearch requires python-pptx. 
Install knowledge-layer[opensearch]." + ) from e + + presentation = Presentation(str(file_path)) + slides: list[tuple[str, int | None, dict[str, Any]]] = [] + for idx, slide in enumerate(presentation.slides, start=1): + texts = [] + for shape in slide.shapes: + if hasattr(shape, "text") and shape.text: + texts.append(shape.text) + content = "\n".join(texts) + if content.strip(): + slides.append((content, idx, {"file_type": "pptx", "slide_number": idx})) + return slides + + +def _read_file_segments(file_path: str) -> list[tuple[str, int | None, dict[str, Any]]]: + path = Path(file_path) + suffix = path.suffix.lower() + if suffix in SUPPORTED_TEXT_EXTENSIONS or not suffix: + return _read_text_file(path) + if suffix == ".pdf": + return _read_pdf_file(path) + if suffix == ".docx": + return _read_docx_file(path) + if suffix == ".pptx": + return _read_pptx_file(path) + raise RuntimeError(f"Unsupported file type for OpenSearch ingestion: {suffix}") + + +def _chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]: + """Chunk text using a word-count approximation that avoids tokenizer dependencies.""" + words = text.split() + if not words: + return [] + + chunk_size = max(1, chunk_size) + chunk_overlap = max(0, min(chunk_overlap, chunk_size - 1)) + step = chunk_size - chunk_overlap + chunks = [] + + for start in range(0, len(words), step): + chunk_words = words[start : start + chunk_size] + if not chunk_words: + break + chunks.append(" ".join(chunk_words)) + if start + chunk_size >= len(words): + break + + return chunks + + +def _resolve_embedding_api_key(embed_base_url: str) -> str: + api_key = os.environ.get("NVIDIA_API_KEY", "") + is_hosted_nvidia = "integrate.api.nvidia.com" in (embed_base_url or "") + if is_hosted_nvidia and not api_key: + raise RuntimeError( + "NVIDIA_API_KEY is required for the hosted NVIDIA embeddings API " + "(embed_base_url contains integrate.api.nvidia.com). 
Either set " + "NVIDIA_API_KEY or override AIQ_EMBED_BASE_URL to a self-hosted NIM endpoint." + ) + return api_key + + +class _OpenSearchConfigMixin: + """Shared OpenSearch configuration and client helpers.""" + + config: dict[str, Any] + _client: Any + _client_lock: threading.RLock + + def _configure_opensearch(self) -> None: + self.auth_type = str(self.config.get("auth_type", DEFAULT_AUTH_TYPE)).lower() + raw_endpoint = self.config.get("endpoint") or self.config.get("opensearch_url") or DEFAULT_ENDPOINT + self.endpoint = _normalize_endpoint(raw_endpoint, "https" if self.auth_type == "sigv4" else "http") + self.username = self.config.get("username") or os.environ.get("OPENSEARCH_USERNAME") + self.password = self.config.get("password") or os.environ.get("OPENSEARCH_PASSWORD") + self.aws_region = self.config.get("aws_region", DEFAULT_AWS_REGION) + self.aws_service = self.config.get("aws_service", DEFAULT_AWS_SERVICE) + self.verify_certs = self.config.get("verify_certs", True) + self.ca_certs = self.config.get("ca_certs") or os.environ.get("OPENSEARCH_CA_CERTS") + self.timeout = self.config.get("timeout", DEFAULT_TIMEOUT) + self.max_retries = self.config.get("max_retries", 3) + self.retry_on_timeout = self.config.get("retry_on_timeout", True) + self.index_prefix = _sanitize_index_part(self.config.get("index_prefix", DEFAULT_INDEX_PREFIX), "aiq") + self.vector_field = self.config.get("vector_field", DEFAULT_VECTOR_FIELD) + self.text_field = self.config.get("text_field", DEFAULT_TEXT_FIELD) + self.embedding_dim = int(self.config.get("embedding_dim", DEFAULT_EMBEDDING_DIM)) + self.engine = self.config.get("engine", "faiss") + self.space_type = self.config.get("space_type", "cosinesimil") + self.m = int(self.config.get("m", 16)) + self.ef_construction = int(self.config.get("ef_construction", 512)) + self.ef_search = int(self.config.get("ef_search", 512)) + self.bulk_batch_size = int(self.config.get("bulk_batch_size", DEFAULT_BULK_BATCH_SIZE)) + 
self.aoss_delete_max_batches = int(self.config.get("aoss_delete_max_batches", DEFAULT_AOSS_DELETE_MAX_BATCHES)) + self.aoss_delete_backoff_seconds = float( + self.config.get("aoss_delete_backoff_seconds", DEFAULT_AOSS_DELETE_BACKOFF_SECONDS) + ) + self.allow_document_ids = self.config.get("allow_document_ids") + if self.allow_document_ids is None: + self.allow_document_ids = not (self.auth_type == "sigv4" and self.aws_service == "aoss") + self.bulk_refresh = self.config.get("bulk_refresh") + if self.bulk_refresh is None: + self.bulk_refresh = False if self.auth_type == "sigv4" and self.aws_service == "aoss" else True + + self._client = None + self._client_lock = threading.RLock() + + def _index_name_for_collection(self, collection_name: str) -> str: + collection_part = _sanitize_index_part(collection_name, "default") + return _trim_index_name(f"{self.index_prefix}-{collection_part}") + + def _create_client(self): + try: + from opensearchpy import OpenSearch + from opensearchpy import RequestsHttpConnection + except ImportError as e: + raise RuntimeError( + "OpenSearch dependencies not installed. Install with: knowledge-layer[opensearch]" + ) from e + + parsed = urlparse(self.endpoint) + use_ssl = parsed.scheme == "https" + client_kwargs: dict[str, Any] = { + "use_ssl": use_ssl, + "verify_certs": self.verify_certs, + "timeout": self.timeout, + "max_retries": self.max_retries, + "retry_on_timeout": self.retry_on_timeout, + "connection_class": RequestsHttpConnection, + } + if self.ca_certs: + client_kwargs["ca_certs"] = self.ca_certs + + if self.auth_type == "basic": + if not self.username or not self.password: + raise RuntimeError("OpenSearch basic auth requires username and password") + client_kwargs["http_auth"] = (self.username, self.password) + elif self.auth_type == "sigv4": + try: + import boto3 + from opensearchpy import AWSV4SignerAuth + except ImportError as e: + raise RuntimeError( + "OpenSearch SigV4 auth requires boto3 and opensearch-py. 
Install with: knowledge-layer[opensearch]" + ) from e + + credentials = boto3.Session(region_name=self.aws_region).get_credentials() + if credentials is None: + raise RuntimeError("No AWS credentials available for OpenSearch SigV4 auth") + client_kwargs["http_auth"] = AWSV4SignerAuth(credentials, self.aws_region, self.aws_service) + elif self.auth_type != "none": + raise RuntimeError("OpenSearch auth_type must be one of: none, basic, sigv4") + + if self.auth_type == "sigv4" and parsed.hostname: + host = { + "host": parsed.hostname, + "port": parsed.port or (443 if use_ssl else 80), + "scheme": parsed.scheme or "https", + } + return OpenSearch(hosts=[host], **client_kwargs) + + return OpenSearch(hosts=[self.endpoint], **client_kwargs) + + def _get_client(self): + with self._client_lock: + if self._client is None: + self._client = self._create_client() + return self._client + + def _index_mapping(self, collection_name: str, description: str | None = None) -> dict[str, Any]: + now = _utc_now().isoformat() + meta = { + "backend": "opensearch", + "collection_name": collection_name, + "description": description, + "created_at": now, + "updated_at": now, + "embedding_model": self.embed_model_name, + "embedding_dim": self.embedding_dim, + } + return { + "settings": { + "index": { + "knn": True, + "knn.algo_param.ef_search": self.ef_search, + } + }, + "mappings": { + "_meta": meta, + "dynamic_templates": [ + { + "metadata_strings": { + "path_match": "metadata.*", + "match_mapping_type": "string", + "mapping": {"type": "keyword", "ignore_above": 1024}, + } + } + ], + "properties": { + "chunk_id": {"type": "keyword"}, + "file_id": {"type": "keyword"}, + "file_name": {"type": "keyword"}, + self.text_field: {"type": "text"}, + self.vector_field: { + "type": "knn_vector", + "dimension": self.embedding_dim, + "method": { + "name": "hnsw", + "space_type": self.space_type, + "engine": self.engine, + "parameters": { + "ef_construction": self.ef_construction, + "m": self.m, + }, + }, 
+ }, + "display_citation": {"type": "keyword"}, + "page_number": {"type": "integer"}, + "content_type": {"type": "keyword"}, + "content_subtype": {"type": "keyword"}, + "file_size": {"type": "long"}, + "metadata": {"type": "object", "enabled": True}, + "created_at": {"type": "date"}, + "updated_at": {"type": "date"}, + }, + }, + } + + def _get_index_meta(self, index_name: str) -> dict[str, Any]: + client = self._get_client() + try: + info = client.indices.get(index=index_name) + index_info = info.get(index_name, {}) if isinstance(info, dict) else {} + return (index_info.get("mappings") or {}).get("_meta") or {} + except Exception: + return {} + + def _put_index_meta(self, index_name: str, meta: dict[str, Any]) -> None: + client = self._get_client() + try: + client.indices.put_mapping(index=index_name, body={"_meta": meta}) + except Exception as e: + logger.debug("Failed to update OpenSearch mapping metadata for %s: %s", index_name, e) + + def _ensure_index(self, collection_name: str, description: str | None = None) -> str: + client = self._get_client() + index_name = self._index_name_for_collection(collection_name) + if client.indices.exists(index=index_name): + return index_name + try: + client.indices.create(index=index_name, body=self._index_mapping(collection_name, description)) + except Exception: + # A concurrent ingestion may have raced us to create the same index. Re-check + # existence; if the index is now present, the other worker won and we proceed. + # Otherwise the create failed for a real reason and the error must propagate. 
+ if not client.indices.exists(index=index_name): + raise + return index_name + + def _update_collection_timestamp(self, collection_name: str) -> None: + index_name = self._index_name_for_collection(collection_name) + meta = self._get_index_meta(index_name) + if not meta: + return + meta["updated_at"] = _utc_now().isoformat() + self._put_index_meta(index_name, meta) + + def _health_check_client(self) -> bool: + client = self._get_client() + if client.ping(): + return True + + if self.auth_type == "sigv4" and self.aws_service == "aoss": + client.transport.perform_request("GET", "/_cat/indices") + return True + + return False + + +@register_ingestor("opensearch") +class OpenSearchIngestor(TTLCleanupMixin, _OpenSearchConfigMixin, BaseIngestor): + """OpenSearch-backed document ingestor.""" + + backend_name = "opensearch" + + def __init__(self, config: dict[str, Any] | None = None): + super().__init__(config) + self._configure_opensearch() + + self.embed_model_name = self.config.get("embed_model", DEFAULT_EMBED_MODEL) + self.embed_base_url = self.config.get("embed_base_url", DEFAULT_EMBED_BASE_URL) + self.embedding_batch_size = int(self.config.get("embedding_batch_size", DEFAULT_EMBEDDING_BATCH_SIZE)) + self.chunk_size = int(self.config.get("chunk_size", DEFAULT_CHUNK_SIZE)) + self.chunk_overlap = int(self.config.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)) + self.generate_summary_enabled = self.config.get("generate_summary", False) + self.summary_llm = self.config.get("summary_llm") + self.ingestion_mode = str(self.config.get("ingestion_mode", DEFAULT_INGESTION_MODE)).lower() + self.dask_scheduler_address = self.config.get("dask_scheduler_address", DEFAULT_DASK_SCHEDULER_ADDRESS) + self.dask_file_transfer = str(self.config.get("dask_file_transfer", DEFAULT_DASK_FILE_TRANSFER)).lower() + + self._jobs: dict[str, IngestionJobStatus] = {} + self._files: dict[str, FileInfo] = {} + self._lock = threading.RLock() + + if self.config.get("start_ttl_cleanup", True): + 
self._start_ttl_cleanup_task(COLLECTION_TTL_HOURS, TTL_CLEANUP_INTERVAL_SECONDS) + logger.info("OpenSearchIngestor initialized: endpoint=%s, auth_type=%s", self.endpoint, self.auth_type) + + def _embed_texts(self, texts: list[str]) -> list[list[float]]: + try: + from openai import OpenAI + except ImportError as e: + raise RuntimeError( + "OpenSearch ingestion requires openai for embeddings. Install knowledge-layer[opensearch]." + ) from e + + client = OpenAI(base_url=self.embed_base_url, api_key=_resolve_embedding_api_key(self.embed_base_url)) + embeddings: list[list[float]] = [] + for start in range(0, len(texts), self.embedding_batch_size): + batch = texts[start : start + self.embedding_batch_size] + response = client.embeddings.create( + model=self.embed_model_name, + input=batch, + extra_body={"input_type": "passage"}, + ) + embeddings.extend([list(item.embedding) for item in response.data]) + return embeddings + + def submit_job( + self, + file_paths: list[str], + collection_name: str, + config: dict[str, Any] | None = None, + ) -> str: + """Submit an ingestion job and return immediately with a polling job ID.""" + job_id = str(uuid.uuid4()) + job_config = {**self.config, **(config or {})} + original_filenames = job_config.get("original_filenames", []) + requested_file_id = job_config.get("file_id") + file_metadata = job_config.get("metadata") or {} + + validated_paths = [path for path in file_paths if os.path.exists(path)] + if not validated_paths: + job = IngestionJobStatus( + job_id=job_id, + status=JobState.FAILED, + submitted_at=_utc_now(), + completed_at=_utc_now(), + total_files=len(file_paths), + processed_files=0, + collection_name=collection_name, + backend=self.backend_name, + error_message="No valid file paths provided", + ) + with self._lock: + self._jobs[job_id] = job + return job_id + + file_details = [] + for i, path in enumerate(validated_paths): + file_name = original_filenames[i] if i < len(original_filenames) else Path(path).name + file_id 
= requested_file_id if requested_file_id and len(validated_paths) == 1 else str(uuid.uuid4()) + file_details.append( + FileProgress( + file_id=file_id, + file_name=file_name, + status=FileStatus.UPLOADING, + progress_percent=0.0, + ) + ) + with self._lock: + self._files[file_id] = FileInfo( + file_id=file_id, + file_name=file_name, + collection_name=collection_name, + status=FileStatus.UPLOADING, + file_size=os.path.getsize(path), + uploaded_at=_utc_now(), + metadata={**file_metadata, "job_id": job_id}, + ) + + job = IngestionJobStatus( + job_id=job_id, + status=JobState.PENDING, + submitted_at=_utc_now(), + total_files=len(validated_paths), + processed_files=0, + collection_name=collection_name, + backend=self.backend_name, + file_details=file_details, + ) + with self._lock: + self._jobs[job_id] = job + + if self._should_use_dask_ingestion(): + try: + self._start_dask_ingestion(job_id, validated_paths, collection_name, job_config) + except Exception as e: + if self.ingestion_mode == "auto": + logger.warning("Falling back to local OpenSearch ingestion because Dask submit failed: %s", e) + self._start_local_ingestion(job_id, validated_paths, collection_name, job_config) + else: + self._mark_job_failed(job_id, f"Dask ingestion submission failed: {e}") + if job_config.get("cleanup_files", False): + self._cleanup_paths(validated_paths) + else: + self._start_local_ingestion(job_id, validated_paths, collection_name, job_config) + return job_id + + def _should_use_dask_ingestion(self) -> bool: + if self.ingestion_mode == "local": + return False + if self.ingestion_mode == "dask": + return True + if self.ingestion_mode == "auto": + return bool(self.dask_scheduler_address) + raise RuntimeError("OpenSearch ingestion_mode must be one of: local, dask, auto") + + def _start_local_ingestion( + self, + job_id: str, + file_paths: list[str], + collection_name: str, + job_config: dict[str, Any], + ) -> None: + thread = threading.Thread( + target=self._run_ingestion, + args=(job_id, 
file_paths, collection_name, job_config), + daemon=True, + ) + thread.start() + + def _create_dask_client(self): + if not self.dask_scheduler_address: + raise RuntimeError( + "Dask ingestion requires OPENSEARCH_DASK_SCHEDULER_ADDRESS or NAT_DASK_SCHEDULER_ADDRESS" + ) + try: + from distributed import Client + except ImportError as e: + raise RuntimeError("Dask ingestion requires the distributed package") from e + return Client(self.dask_scheduler_address, timeout=f"{self.timeout}s") + + def _start_dask_ingestion( + self, + job_id: str, + file_paths: list[str], + collection_name: str, + job_config: dict[str, Any], + ) -> None: + from knowledge_layer.opensearch.distributed import run_opensearch_ingestion_task + + with self._lock: + job = self._jobs[job_id] + job.status = JobState.PROCESSING + job.started_at = _utc_now() + job.metadata["ingestion_mode"] = "dask" + for detail in job.file_details: + detail.status = FileStatus.INGESTING + tracked = self._files.get(detail.file_id) + if tracked: + tracked.status = FileStatus.INGESTING + + payloads = self._build_dask_file_payloads(job_id, file_paths, job_config) + worker_config = self._worker_config(job_config) + client = self._create_dask_client() + try: + future = client.submit( + run_opensearch_ingestion_task, + worker_config, + payloads, + collection_name, + key=f"aiq-opensearch-ingest-{job_id}", + pure=False, + ) + except Exception: + # The monitor thread (which has client.close() in its finally) is only + # started when submit succeeds. If submit raises — scheduler unreachable, + # serialisation error, key conflict — close the just-opened client here + # so the scheduler TCP connection does not leak across auto-mode retries. 
+ close = getattr(client, "close", None) + if close is not None: + try: + close() + except Exception: + logger.debug("Failed to close Dask client after submit error", exc_info=True) + raise + thread = threading.Thread( + target=self._monitor_dask_ingestion, + args=(job_id, future, client, file_paths, job_config), + daemon=True, + ) + thread.start() + + def _worker_config(self, job_config: dict[str, Any]) -> dict[str, Any]: + worker_config = dict(job_config) + worker_config.pop("summary_llm", None) + worker_config["start_ttl_cleanup"] = False + worker_config["generate_summary"] = False + return worker_config + + def _build_dask_file_payloads( + self, + job_id: str, + file_paths: list[str], + job_config: dict[str, Any], + ) -> list[dict[str, Any]]: + with self._lock: + job = self._jobs[job_id] + details = list(job.file_details) + file_metadata = job_config.get("metadata") or {} + payloads = [] + for path, detail in zip(file_paths, details, strict=True): + payload = { + "file_id": detail.file_id, + "file_name": detail.file_name, + "metadata": file_metadata, + } + if self.dask_file_transfer == "bytes": + payload["data"] = Path(path).read_bytes() + payload["suffix"] = Path(path).suffix + elif self.dask_file_transfer == "paths": + payload["path"] = path + else: + raise RuntimeError("OpenSearch dask_file_transfer must be one of: bytes, paths") + payloads.append(payload) + return payloads + + def _monitor_dask_ingestion( + self, + job_id: str, + future: Any, + client: Any, + file_paths: list[str], + job_config: dict[str, Any], + ) -> None: + try: + result = future.result() + self._apply_dask_ingestion_result(job_id, result) + except Exception as e: + logger.exception("OpenSearch Dask ingestion job failed") + self._mark_job_failed(job_id, str(e)) + finally: + close = getattr(client, "close", None) + if close: + close() + if job_config.get("cleanup_files", False): + self._cleanup_paths(file_paths) + + def _apply_dask_ingestion_result(self, job_id: str, result: dict[str, 
Any]) -> None: + file_results = {item.get("file_id"): item for item in result.get("files", [])} + with self._lock: + job = self._jobs[job_id] + for index, detail in enumerate(job.file_details): + item = file_results.get(detail.file_id, {}) + status = FileStatus(item.get("status", FileStatus.FAILED)) + self._mark_file( + job, + index, + status, + chunks_created=int(item.get("chunks_created", 0)), + error=item.get("error_message"), + ) + summary = item.get("summary") + if summary: + from aiq_agent.knowledge import register_summary + + register_summary(job.collection_name, detail.file_name, summary) + tracked = self._files.get(detail.file_id) + if tracked: + tracked.metadata["summary"] = summary + + failed_count = sum(1 for detail in job.file_details if detail.status == FileStatus.FAILED) + job.processed_files = job.total_files + job.completed_at = _utc_now() + job.metadata.update( + { + "index_name": result.get("index_name"), + "total_chunks": result.get("total_chunks", 0), + "embedding_model": result.get("embedding_model", self.embed_model_name), + "ingestion_mode": "dask", + } + ) + if failed_count == job.total_files: + job.status = JobState.FAILED + job.error_message = result.get("error_message") or "All files failed ingestion" + else: + job.status = JobState.COMPLETED + job.error_message = None + self._update_collection_timestamp(job.collection_name) + + def _mark_job_failed(self, job_id: str, error: str) -> None: + with self._lock: + job = self._jobs[job_id] + job.status = JobState.FAILED + job.completed_at = _utc_now() + job.error_message = error + job.processed_files = job.total_files + for index, _ in enumerate(job.file_details): + self._mark_file(job, index, FileStatus.FAILED, error=error) + + def _cleanup_paths(self, file_paths: list[str]) -> None: + for file_path in file_paths: + try: + os.unlink(file_path) + except OSError: + pass + + def get_job_status(self, job_id: str) -> IngestionJobStatus: + with self._lock: + job = self._jobs.get(job_id) + if job is 
def list_collections(self) -> list[CollectionInfo]:
    """List the collections stored under this adapter's index prefix.

    Indices matching ``<index_prefix>-*`` are fetched and only those whose
    ``_meta.backend`` is ``"opensearch"`` are reported. The collection name
    comes from ``_meta.collection_name``, falling back to the index name with
    the prefix removed. If the index lookup raises, the failure is logged at
    debug level and an empty list is returned.
    """
    client = self._get_client()
    pattern = f"{self.index_prefix}-*"
    try:
        indices = client.indices.get(index=pattern)
    except Exception as e:
        logger.debug("Failed to list OpenSearch collections with pattern %s: %s", pattern, e)
        return []

    def _meta_of(info):
        mappings = info.get("mappings") or {}
        return mappings.get("_meta") or {}

    # Lazy pairing keeps the per-index order of work: read meta, filter, build.
    candidates = ((index_name, _meta_of(info)) for index_name, info in indices.items())
    return [
        self._collection_info_from_index(
            meta.get("collection_name") or index_name.removeprefix(f"{self.index_prefix}-"),
            index_name,
            meta,
        )
        for index_name, meta in candidates
        if meta.get("backend") == "opensearch"
    ]
def upload_file(
    self,
    file_path: str,
    collection_name: str,
    metadata: dict[str, Any] | None = None,
) -> FileInfo:
    """Submit a single file for ingestion and return its tracking record.

    Args:
        file_path: Local path of the file to ingest.
        collection_name: Target collection.
        metadata: Optional metadata stored with the file record.

    Returns:
        A deep copy of the tracked ``FileInfo`` with ``metadata["job_id"]`` set.
        A ``FAILED`` record with a "File not found" message is returned when the
        file does not exist, or when no tracking entry was registered for it.
    """
    path = Path(file_path)
    file_id = str(uuid.uuid4())
    if not path.exists():
        return FileInfo(
            file_id=file_id,
            file_name=path.name,
            collection_name=collection_name,
            status=FileStatus.FAILED,
            error_message=f"File not found: {file_path}",
        )

    job_id = self.submit_job(
        [file_path],
        collection_name,
        config={
            "file_id": file_id,
            "original_filenames": [path.name],
            "metadata": metadata or {},
        },
    )
    with self._lock:
        info = self._files.get(file_id)
        if info is None:
            # submit_job re-validates the path and registers no file entry when
            # it has vanished since the check above; report that, not KeyError.
            return FileInfo(
                file_id=file_id,
                file_name=path.name,
                collection_name=collection_name,
                status=FileStatus.FAILED,
                error_message=f"File not found: {file_path}",
            )
        # submit_job registers the entry as UPLOADING and may already have moved
        # it on (a synchronous Dask submit failure, or a fast worker thread).
        # Only promote the initial state so a later status is never overwritten.
        if info.status == FileStatus.UPLOADING:
            info.status = FileStatus.INGESTING
        info.metadata["job_id"] = job_id
        return info.model_copy(deep=True)
def _delete_file_documents_for_aoss(self, index_name: str, query_body: dict[str, Any]) -> int:
    """AOSS doesn't support _delete_by_query; search first, then bulk delete by generated IDs.

    Repeats up to ``aoss_delete_max_batches`` rounds of: search for documents
    matching ``query_body["query"]`` (IDs only, ``bulk_batch_size`` hits per
    round), then issue one bulk request deleting the hits not already handled.

    Args:
        index_name: Index to search and delete from.
        query_body: Search body; only its ``"query"`` entry is used.

    Returns:
        Number of delete actions submitted in bulk requests that reported no errors.

    Raises:
        RuntimeError: If a bulk response reports errors, or if the batch limit
            is reached while matching documents are still being returned.
    """
    client = self._get_client()
    deleted = 0

    # IDs already sent for deletion. A search may keep returning them for a
    # while, so they are neither deleted again nor counted twice.
    seen_ids: set[str] = set()
    # Consecutive rounds in which the search returned only already-seen IDs.
    stale_batches = 0

    for _ in range(self.aoss_delete_max_batches):
        response = client.search(
            index=index_name,
            body={
                "size": self.bulk_batch_size,
                "_source": False,  # only the generated _id of each hit is needed
                "query": query_body["query"],
            },
            request_timeout=self.timeout,
        )
        hits = response.get("hits", {}).get("hits", [])
        if not hits:
            # Nothing matches any more: deletion is complete.
            return deleted

        body = []
        for hit in hits:
            hit_id = hit.get("_id")
            if hit_id and hit_id not in seen_ids:
                seen_ids.add(hit_id)
                body.append({"delete": {"_index": index_name, "_id": hit_id}})

        if not body:
            # Every hit was already submitted for deletion. Wait once and retry;
            # a second such round in a row ends the loop with the current count.
            stale_batches += 1
            if stale_batches >= 2:
                return deleted
            time.sleep(self.aoss_delete_backoff_seconds)
            continue
        stale_batches = 0

        result = client.bulk(body=body, refresh=self.bulk_refresh, request_timeout=self.timeout)
        if isinstance(result, dict) and result.get("errors"):
            raise RuntimeError(f"OpenSearch bulk deletion failed: {result}")

        # Counts submitted actions; per-item results are not inspected beyond
        # the top-level "errors" flag checked above.
        deleted += len(body)
        if self.aoss_delete_backoff_seconds:
            time.sleep(self.aoss_delete_backoff_seconds)

    # The loop only falls through when every round still found new documents.
    raise RuntimeError(
        f"OpenSearch AOSS file deletion exceeded {self.aoss_delete_max_batches} search/delete batches"
    )
def get_file_status(self, file_id: str, collection_name: str) -> FileInfo | None:
    """Return the current status of a file, preferring in-memory tracking.

    If ``file_id`` is tracked, the tracked entry is refreshed first: an entry
    still marked ``INGESTING`` whose linked job has reached ``COMPLETED`` or
    ``FAILED`` is updated to ``SUCCESS`` / ``FAILED`` (copying the job's
    completion time or error message). A deep copy of the entry is returned.

    If the ID is not tracked, the collection's file listing is searched for an
    entry whose ``file_id`` or ``file_name`` equals ``file_id``.

    Returns:
        The matching ``FileInfo``, or ``None`` when nothing matches.
    """
    with self._lock:
        entry = self._files.get(file_id)
        if entry:
            linked_job_id = entry.metadata.get("job_id")
            if entry.status == FileStatus.INGESTING and linked_job_id:
                linked_job = self._jobs.get(linked_job_id)
                if linked_job:
                    if linked_job.status == JobState.COMPLETED:
                        entry.status = FileStatus.SUCCESS
                        entry.ingested_at = linked_job.completed_at
                    elif linked_job.status == JobState.FAILED:
                        entry.status = FileStatus.FAILED
                        entry.error_message = linked_job.error_message
            return entry.model_copy(deep=True)

    # Untracked: fall back to what is actually indexed for the collection.
    return next(
        (
            candidate
            for candidate in self.list_files(collection_name)
            if file_id in (candidate.file_id, candidate.file_name)
        ),
        None,
    )
config.get("original_filenames", []) + total_chunks = 0 + + for i, file_path in enumerate(file_paths): + file_name = original_filenames[i] if i < len(original_filenames) else Path(file_path).name + file_id = job.file_details[i].file_id + try: + documents, summary_text = self._documents_for_file( + file_path, + file_id, + file_name, + config.get("metadata") or {}, + ) + if not documents: + self._mark_file(job, i, FileStatus.FAILED, error="No content extracted") + continue + + embeddings = self._embed_texts([doc[self.text_field] for doc in documents]) + for doc, embedding in zip(documents, embeddings, strict=True): + doc[self.vector_field] = embedding + + self._bulk_index_documents(index_name, documents) + chunks_created = len(documents) + total_chunks += chunks_created + self._mark_file(job, i, FileStatus.SUCCESS, chunks_created=chunks_created) + + if self.generate_summary_enabled: + summary = self.generate_summary(summary_text, file_name) + if summary: + from aiq_agent.knowledge import register_summary + + register_summary(collection_name, file_name, summary) + with self._lock: + if file_id in self._files: + self._files[file_id].metadata["summary"] = summary + + except Exception as e: + logger.exception("OpenSearch ingestion failed for %s", file_path) + self._mark_file(job, i, FileStatus.FAILED, error=str(e)) + + with self._lock: + failed_count = sum(1 for detail in job.file_details if detail.status == FileStatus.FAILED) + job.processed_files = job.total_files + job.completed_at = _utc_now() + job.metadata = { + "index_name": index_name, + "total_chunks": total_chunks, + "embedding_model": self.embed_model_name, + } + if failed_count == job.total_files: + job.status = JobState.FAILED + job.error_message = "All files failed ingestion" + else: + job.status = JobState.COMPLETED + + self._update_collection_timestamp(collection_name) + + except Exception as e: + logger.exception("OpenSearch ingestion job failed") + with self._lock: + job = self._jobs[job_id] + 
def _documents_for_file(
    self,
    file_path: str,
    file_id: str,
    file_name: str,
    file_metadata: dict[str, Any] | None = None,
) -> tuple[list[dict[str, Any]], str]:
    """Turn one file into chunk documents ready for embedding and indexing.

    Each segment yielded by ``_read_file_segments`` is split with
    ``_chunk_text`` (using ``chunk_size`` / ``chunk_overlap``); every chunk
    becomes one document dict with a fresh UUID ``chunk_id``. All documents of
    the file share the same file size and the same creation/update timestamp.

    Args:
        file_path: Path of the file to read.
        file_id: ID stored on every document.
        file_name: Name stored on every document and used in the citation.
        file_metadata: Optional metadata merged into each document's
            ``metadata``; segment metadata and ``chunk_index`` override it.

    Returns:
        ``(documents, summary_text)``. ``summary_text`` is the stripped text of
        the first non-blank segment, truncated to ``SUMMARY_MAX_INPUT_CHARS``,
        or ``""`` when no segment has text.
    """
    size_on_disk = os.path.getsize(file_path)
    timestamp = _utc_now().isoformat()
    base_metadata = file_metadata or {}
    docs: list[dict[str, Any]] = []
    summary_source: str | None = None

    for segment_text, page_number, segment_metadata in _read_file_segments(file_path):
        segment_chunks = _chunk_text(segment_text, self.chunk_size, self.chunk_overlap)

        # Only the first segment that contains text feeds the summary.
        stripped = segment_text.strip()
        if summary_source is None and stripped:
            summary_source = stripped[:SUMMARY_MAX_INPUT_CHARS]

        # The citation depends on the segment only, not on the chunk.
        citation = f"{file_name}, p.{page_number}" if page_number else file_name

        for position, chunk_text in enumerate(segment_chunks):
            docs.append(
                {
                    "chunk_id": str(uuid.uuid4()),
                    "file_id": file_id,
                    "file_name": file_name,
                    self.text_field: chunk_text,
                    "display_citation": citation,
                    "page_number": page_number,
                    "content_type": ContentType.TEXT.value,
                    "content_subtype": None,
                    "file_size": size_on_disk,
                    "metadata": {
                        **base_metadata,
                        **segment_metadata,
                        "chunk_index": position,
                    },
                    "created_at": timestamp,
                    "updated_at": timestamp,
                }
            )

    return docs, (summary_source if summary_source is not None else "")
client.bulk(body=body, refresh=self.bulk_refresh, request_timeout=self.timeout) + if isinstance(result, dict) and result.get("errors"): + raise RuntimeError(f"OpenSearch bulk indexing failed: {result}") + + def _mark_file( + self, + job: IngestionJobStatus, + file_index: int, + status: FileStatus, + chunks_created: int = 0, + error: str | None = None, + ) -> None: + with self._lock: + if file_index < len(job.file_details): + detail = job.file_details[file_index] + detail.status = status + detail.progress_percent = 100.0 + detail.chunks_created = chunks_created + detail.error_message = error + tracked = self._files.get(detail.file_id) + if tracked: + tracked.status = status + tracked.chunk_count = chunks_created + tracked.error_message = error + if status == FileStatus.SUCCESS: + tracked.ingested_at = _utc_now() + job.processed_files = min(job.total_files, file_index + 1) + + def _collection_info_from_index( + self, + collection_name: str, + index_name: str, + meta: dict[str, Any], + ) -> CollectionInfo: + client = self._get_client() + chunk_count = 0 + try: + count_result = client.count(index=index_name) + chunk_count = int(count_result.get("count", 0)) + except Exception: + pass + + files = self.list_files(collection_name) if chunk_count else [] + return CollectionInfo( + name=collection_name, + description=meta.get("description"), + file_count=len(files), + chunk_count=chunk_count, + created_at=_parse_timestamp(meta.get("created_at")), + updated_at=_parse_timestamp(meta.get("updated_at")), + backend=self.backend_name, + metadata={ + "index_name": index_name, + "endpoint": self.endpoint, + "embedding_model": meta.get("embedding_model", self.embed_model_name), + "embedding_dim": meta.get("embedding_dim", self.embedding_dim), + }, + ) + + def _files_from_buckets(self, buckets: list[dict[str, Any]], collection_name: str) -> list[FileInfo]: + files: list[FileInfo] = [] + for bucket in buckets: + # composite bucket keys are dicts ({"file_name": "..."}); legacy terms 
keys are scalars. + key = bucket.get("key") + if isinstance(key, dict): + file_name = key.get("file_name") or "unknown" + else: + file_name = key or "unknown" + chunk_count = int(bucket.get("doc_count", 0)) + top_hits = ((bucket.get("doc") or {}).get("hits") or {}).get("hits") or [] + source = (top_hits[0].get("_source") if top_hits else {}) or {} + content_type_buckets = (bucket.get("content_types") or {}).get("buckets") or [] + content_types = sorted(b.get("key") for b in content_type_buckets if b.get("key")) + files.append( + FileInfo( + file_id=source.get("file_id") or file_name, + file_name=file_name, + collection_name=collection_name, + status=FileStatus.SUCCESS, + file_size=source.get("file_size"), + chunk_count=chunk_count, + uploaded_at=_parse_timestamp(source.get("created_at")), + ingested_at=_parse_timestamp(source.get("updated_at")), + metadata={ + **(source.get("metadata") or {}), + "content_types": content_types, + }, + ) + ) + return files + + def _resolve_file_name(self, file_id: str, collection_name: str) -> str: + with self._lock: + tracked = self._files.get(file_id) + if tracked and tracked.collection_name == collection_name: + return tracked.file_name + for info in self._files.values(): + if info.collection_name == collection_name and info.file_name == file_id: + return info.file_name + return file_id + + +@register_retriever("opensearch") +class OpenSearchRetriever(_OpenSearchConfigMixin, BaseRetriever): + """OpenSearch-backed document retriever.""" + + backend_name = "opensearch" + + def __init__(self, config: dict[str, Any] | None = None): + super().__init__(config) + self._configure_opensearch() + self.embed_model_name = self.config.get("embed_model", DEFAULT_EMBED_MODEL) + self.embed_base_url = self.config.get("embed_base_url", DEFAULT_EMBED_BASE_URL) + self.embedding_batch_size = int(self.config.get("embedding_batch_size", DEFAULT_EMBEDDING_BATCH_SIZE)) + self.default_top_k = int(self.config.get("top_k", 10)) + 
async def retrieve(
    self,
    query: str,
    collection_name: str,
    top_k: int = 10,
    filters: dict[str, Any] | None = None,
) -> RetrievalResult:
    """Run a k-NN search for ``query`` against one collection.

    The query is embedded with ``_embed_texts``, a search body is built with
    ``_build_search_body`` and each hit is converted with ``normalize``; hits
    that normalize to a falsy value are dropped.

    Args:
        query: Natural-language query text.
        collection_name: Collection to search.
        top_k: Number of results requested; a falsy value falls back to
            ``default_top_k``.
        filters: Optional filters passed through to ``_build_search_body``.

    Returns:
        A ``RetrievalResult``. ``success`` is ``False`` when the collection's
        index does not exist or when any step raises; this method does not
        propagate exceptions. ``total_tokens`` is a whitespace word count over
        the returned chunks.
    """
    try:
        os_client = self._get_client()
        target_index = self._index_name_for_collection(collection_name)

        index_present = os_client.indices.exists(index=target_index)
        if not index_present:
            return RetrievalResult(
                chunks=[],
                query=query,
                backend=self.backend_name,
                success=False,
                error_message=f"Collection '{collection_name}' not found",
            )

        query_vector = self._embed_texts([query])[0]
        result_limit = top_k or self.default_top_k
        search_body = self._build_search_body(query_vector, result_limit, filters)
        raw = os_client.search(index=target_index, body=search_body, request_timeout=self.timeout)

        kept = []
        for hit in raw.get("hits", {}).get("hits", []):
            normalized = self.normalize(hit)
            if normalized:
                kept.append(normalized)

        word_total = 0
        for item in kept:
            word_total += len(item.content.split())

        return RetrievalResult(
            chunks=kept,
            total_tokens=word_total,
            query=query,
            backend=self.backend_name,
            success=True,
        )
    except Exception as e:
        logger.error("OpenSearch retrieval failed: %s", e)
        return RetrievalResult(
            chunks=[],
            query=query,
            backend=self.backend_name,
            success=False,
            error_message=f"Retrieval failed: {str(e)[:100]}",
        )
None, + ) -> dict[str, Any]: + knn_body: dict[str, Any] = { + "vector": query_embedding, + "k": top_k, + } + filter_query = self._build_filter_query(filters) + if filter_query: + knn_body["filter"] = filter_query + + return { + "size": top_k, + "_source": { + "excludes": [self.vector_field], + }, + "query": { + "knn": { + self.vector_field: knn_body, + } + }, + } + + def _build_filter_query(self, filters: dict[str, Any] | None) -> dict[str, Any] | None: + if not filters: + return None + if "filter" in filters and isinstance(filters["filter"], dict): + return filters["filter"] + + clauses = [] + for key, value in filters.items(): + if key in ("filter", "filter_expr"): + continue + field_name = key if key in {"file_name", "content_type", "file_id"} else f"metadata.{key}" + clauses.append({"term": {field_name: value}}) + if not clauses: + return None + return {"bool": {"filter": clauses}} + + def normalize(self, raw_result: Any) -> Chunk | None: + if not isinstance(raw_result, dict): + return None + + source = raw_result.get("_source") or {} + content = source.get(self.text_field, "") + file_name = source.get("file_name", "unknown") + page_number = source.get("page_number") + content_type = self._content_type_from_source(source) + display_citation = source.get("display_citation") or self._display_citation(file_name, page_number) + + return Chunk( + chunk_id=source.get("chunk_id") or raw_result.get("_id") or str(uuid.uuid4()), + content=content or "", + score=_score_to_similarity(raw_result.get("_score", 0.0)), + file_name=file_name, + page_number=page_number, + display_citation=display_citation, + content_type=content_type, + content_subtype=source.get("content_subtype"), + structured_data=source.get("structured_data"), + image_storage_uri=source.get("image_storage_uri"), + image_url=source.get("image_url"), + metadata={ + **(source.get("metadata") or {}), + "file_id": source.get("file_id"), + "index": raw_result.get("_index"), + }, + ) + + async def 
health_check(self) -> bool: + try: + return self._health_check_client() + except Exception: + return False + + def _content_type_from_source(self, source: dict[str, Any]) -> ContentType: + raw_type = str(source.get("content_type", "text")).lower() + if raw_type == ContentType.TABLE.value: + return ContentType.TABLE + if raw_type == ContentType.CHART.value: + return ContentType.CHART + if raw_type == ContentType.IMAGE.value: + return ContentType.IMAGE + return ContentType.TEXT + + def _display_citation(self, file_name: str, page_number: Any) -> str: + if page_number: + return f"{file_name}, p.{page_number}" + return file_name diff --git a/sources/knowledge_layer/src/opensearch/distributed.py b/sources/knowledge_layer/src/opensearch/distributed.py new file mode 100644 index 00000000..477549d9 --- /dev/null +++ b/sources/knowledge_layer/src/opensearch/distributed.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Dask worker entry points for OpenSearch ingestion.""" + +from __future__ import annotations + +import logging +import os +import tempfile +from pathlib import Path +from typing import Any + +logger = logging.getLogger(__name__) + + +def run_opensearch_ingestion_task( + config: dict[str, Any], + files: list[dict[str, Any]], + collection_name: str, +) -> dict[str, Any]: + """Run OpenSearch ingestion in a Dask worker process. + + The worker creates its own OpenSearch client so SigV4 credentials are + resolved in the worker environment, including EKS Pod Identity. + """ + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + worker_config = dict(config) + worker_config["start_ttl_cleanup"] = False + worker_config["generate_summary"] = False + worker_config.pop("summary_llm", None) + + ingestor = OpenSearchIngestor(worker_config) + index_name = ingestor._ensure_index(collection_name) + total_chunks = 0 + file_results = [] + + for file_payload in files: + temp_path: str | None = None + file_path = file_payload.get("path") + file_id = file_payload["file_id"] + file_name = file_payload["file_name"] + metadata = file_payload.get("metadata") or {} + + try: + if file_path is None: + suffix = file_payload.get("suffix") or Path(file_name).suffix + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(file_payload["data"]) + temp_path = tmp.name + file_path = temp_path + + documents, _ = ingestor._documents_for_file(file_path, file_id, file_name, metadata) + if not documents: + file_results.append( + { + "file_id": file_id, + "file_name": file_name, + "status": "failed", + "chunks_created": 0, + "error_message": "No content extracted", + } + ) + continue + + embeddings = ingestor._embed_texts([doc[ingestor.text_field] for doc in documents]) + for doc, embedding in zip(documents, embeddings, strict=True): + doc[ingestor.vector_field] = embedding + + ingestor._bulk_index_documents(index_name, documents) + chunks_created = 
len(documents) + total_chunks += chunks_created + file_results.append( + { + "file_id": file_id, + "file_name": file_name, + "status": "success", + "chunks_created": chunks_created, + } + ) + except Exception as e: + logger.exception("OpenSearch Dask ingestion failed for %s", file_name) + file_results.append( + { + "file_id": file_id, + "file_name": file_name, + "status": "failed", + "chunks_created": 0, + "error_message": str(e), + } + ) + finally: + if temp_path: + try: + os.unlink(temp_path) + except OSError: + pass + + ingestor._update_collection_timestamp(collection_name) + failed_count = sum(1 for item in file_results if item["status"] == "failed") + return { + "status": "failed" if failed_count == len(file_results) else "completed", + "files": file_results, + "total_chunks": total_chunks, + "index_name": index_name, + "embedding_model": ingestor.embed_model_name, + "error_message": "All files failed ingestion" if failed_count == len(file_results) else None, + } diff --git a/sources/knowledge_layer/src/register.py b/sources/knowledge_layer/src/register.py index 17dd5324..f10b19b9 100644 --- a/sources/knowledge_layer/src/register.py +++ b/sources/knowledge_layer/src/register.py @@ -38,7 +38,43 @@ # Type-safe backend selection - Pydantic validates at config load time -BackendType = Literal["llamaindex", "foundational_rag"] +BackendType = Literal["llamaindex", "foundational_rag", "opensearch"] +OpenSearchAuthType = Literal["none", "basic", "sigv4"] +OpenSearchAwsService = Literal["aoss", "es"] +OpenSearchIngestionMode = Literal["local", "dask", "auto"] +OpenSearchDaskFileTransfer = Literal["bytes", "paths"] + + +def _env_value(*names: str, default: str | None = None) -> str | None: + for name in names: + value = os.environ.get(name) + if value is not None and value != "": + return value + return default + + +def _env_bool(name: str, default: bool) -> bool: + value = os.environ.get(name) + if value is None: + return default + return value.lower() in {"1", "true", 
"yes", "on"} + + +def _env_optional_bool(name: str) -> bool | None: + value = os.environ.get(name) + if value is None or value == "": + return None + return value.lower() in {"1", "true", "yes", "on"} + + +def _env_int(name: str, default: int) -> int: + value = os.environ.get(name) + return int(value) if value is not None and value != "" else default + + +def _env_float(name: str, default: float) -> float: + value = os.environ.get(name) + return float(value) if value is not None and value != "" else default class KnowledgeRetrievalConfig(FunctionBaseConfig, name="knowledge_retrieval"): @@ -72,6 +108,147 @@ class KnowledgeRetrievalConfig(FunctionBaseConfig, name="knowledge_retrieval"): verify_ssl: bool = Field( default=True, description="Verify SSL certificates (foundational_rag only). Set false for self-signed certs." ) + # OpenSearch-specific options + opensearch_url: str = Field( + default_factory=lambda: _env_value("OPENSEARCH_URL", default="http://localhost:9200"), + description="OpenSearch endpoint URL (OpenSearch only).", + ) + opensearch_auth_type: OpenSearchAuthType = Field( + default_factory=lambda: _env_value("OPENSEARCH_AUTH_TYPE", default="none"), + description="OpenSearch auth mode: none, basic, or sigv4.", + ) + opensearch_username: str | None = Field( + default_factory=lambda: _env_value("OPENSEARCH_USERNAME"), + description="Username for OpenSearch basic auth. Falls back to OPENSEARCH_USERNAME.", + ) + opensearch_password: str | None = Field( + default_factory=lambda: _env_value("OPENSEARCH_PASSWORD"), + description="Password for OpenSearch basic auth. Falls back to OPENSEARCH_PASSWORD.", + ) + opensearch_verify_certs: bool = Field( + default_factory=lambda: _env_bool("OPENSEARCH_VERIFY_CERTS", True), + description="Verify OpenSearch TLS certificates. 
Set false only for trusted development clusters.", + ) + opensearch_ca_certs: str | None = Field( + default_factory=lambda: _env_value("OPENSEARCH_CA_CERTS"), + description="Path to a custom CA bundle for OpenSearch TLS verification.", + ) + opensearch_aws_region: str = Field( + default_factory=lambda: _env_value("AWS_REGION", "AWS_DEFAULT_REGION", default="us-east-1"), + description="AWS region for OpenSearch SigV4 auth.", + ) + opensearch_aws_service: OpenSearchAwsService = Field( + default_factory=lambda: _env_value("OPENSEARCH_AWS_SERVICE", default="aoss"), + description="SigV4 service name: aoss for Amazon OpenSearch Serverless, es for Amazon OpenSearch Service.", + ) + opensearch_index_prefix: str = Field( + default_factory=lambda: _env_value("OPENSEARCH_INDEX_PREFIX", default="aiq"), + description="Prefix for OpenSearch collection indexes.", + ) + opensearch_vector_field: str = Field( + default_factory=lambda: _env_value("OPENSEARCH_VECTOR_FIELD", default="embedding"), + description="Vector field name in OpenSearch documents.", + ) + opensearch_text_field: str = Field( + default_factory=lambda: _env_value("OPENSEARCH_TEXT_FIELD", default="content"), + description="Text field name in OpenSearch documents.", + ) + opensearch_embedding_dim: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_EMBEDDING_DIM", 2048), + gt=0, + description="Embedding vector dimension for OpenSearch knn_vector mappings.", + ) + opensearch_engine: str = Field( + default_factory=lambda: _env_value("OPENSEARCH_ENGINE", default="faiss"), + description="OpenSearch k-NN engine.", + ) + opensearch_space_type: str = Field( + default_factory=lambda: _env_value("OPENSEARCH_SPACE_TYPE", default="cosinesimil"), + description="OpenSearch k-NN space type.", + ) + opensearch_m: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_M", 16), + gt=0, + description="HNSW m parameter for OpenSearch indexes.", + ) + opensearch_ef_construction: int = Field( + default_factory=lambda: 
_env_int("OPENSEARCH_EF_CONSTRUCTION", 512), + gt=0, + description="HNSW ef_construction parameter for OpenSearch indexes.", + ) + opensearch_ef_search: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_EF_SEARCH", 512), + gt=0, + description="OpenSearch ef_search query parameter.", + ) + opensearch_timeout: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_TIMEOUT", 120), + gt=0, + description="OpenSearch request timeout in seconds.", + ) + opensearch_max_retries: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_MAX_RETRIES", 3), + ge=0, + description="OpenSearch client max retries.", + ) + opensearch_bulk_batch_size: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_BULK_BATCH_SIZE", 100), + gt=0, + description="Number of documents per OpenSearch bulk indexing request.", + ) + opensearch_embedding_batch_size: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_EMBEDDING_BATCH_SIZE", 16), + gt=0, + description="Number of texts per embedding request for OpenSearch ingestion.", + ) + opensearch_chunk_size: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_CHUNK_SIZE", 1024), + gt=0, + description="Approximate words per OpenSearch text chunk.", + ) + opensearch_chunk_overlap: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_CHUNK_OVERLAP", 128), + ge=0, + description="Approximate overlapping words between OpenSearch text chunks.", + ) + opensearch_allow_document_ids: bool | None = Field( + default_factory=lambda: _env_optional_bool("OPENSEARCH_ALLOW_DOCUMENT_IDS"), + description="Whether to set explicit document IDs in bulk index requests. Defaults off for AOSS.", + ) + opensearch_bulk_refresh: bool | None = Field( + default_factory=lambda: _env_optional_bool("OPENSEARCH_BULK_REFRESH"), + description="Refresh policy for OpenSearch bulk writes. 
Defaults off for AOSS.", + ) + opensearch_aoss_delete_max_batches: int = Field( + default_factory=lambda: _env_int("OPENSEARCH_AOSS_DELETE_MAX_BATCHES", 100), + gt=0, + description="Maximum search/delete batches for AOSS file deletion.", + ) + opensearch_aoss_delete_backoff_seconds: float = Field( + default_factory=lambda: _env_float("OPENSEARCH_AOSS_DELETE_BACKOFF_SECONDS", 0.25), + ge=0, + description="Backoff between AOSS delete batches to account for eventual search visibility.", + ) + opensearch_ingestion_mode: OpenSearchIngestionMode = Field( + default_factory=lambda: _env_value("OPENSEARCH_INGESTION_MODE", default="local"), + description="OpenSearch ingestion execution mode: local, dask, or auto.", + ) + opensearch_dask_scheduler_address: str | None = Field( + default_factory=lambda: _env_value("OPENSEARCH_DASK_SCHEDULER_ADDRESS", "NAT_DASK_SCHEDULER_ADDRESS"), + description="Dask scheduler address for OpenSearch distributed ingestion.", + ) + opensearch_dask_file_transfer: OpenSearchDaskFileTransfer = Field( + default_factory=lambda: _env_value("OPENSEARCH_DASK_FILE_TRANSFER", default="bytes"), + description="How Dask ingestion workers receive files: bytes or paths.", + ) + embed_model: str = Field( + default_factory=lambda: _env_value("AIQ_EMBED_MODEL", default="nvidia/llama-nemotron-embed-vl-1b-v2"), + description="Embedding model for OpenSearch vector ingestion and retrieval.", + ) + embed_base_url: str = Field( + default_factory=lambda: _env_value("AIQ_EMBED_BASE_URL", default="https://integrate.api.nvidia.com/v1"), + description="OpenAI-compatible embeddings endpoint base URL.", + ) @model_validator(mode="after") def validate_backend_config(self): @@ -91,14 +268,36 @@ def validate_backend_config(self): logger.warning("rag_url is ignored for llamaindex backend") if self.ingest_url != "http://localhost:8082/v1": logger.warning("ingest_url is ignored for llamaindex backend") + if self.opensearch_url != "http://localhost:9200": + 
logger.warning("opensearch_url is ignored for llamaindex backend") elif backend == "foundational_rag": # Foundational RAG uses rag_url/ingest_url, warn if others are set if self.chroma_dir != "/tmp/chroma_data": logger.warning("chroma_dir is ignored for foundational_rag backend") + if self.opensearch_url != "http://localhost:9200": + logger.warning("opensearch_url is ignored for foundational_rag backend") if not self.verify_ssl: logger.warning("SSL verification disabled for foundational_rag. Use only in trusted environments.") + elif backend == "opensearch": + if self.chroma_dir != "/tmp/chroma_data": + logger.warning("chroma_dir is ignored for opensearch backend") + if self.rag_url != "http://localhost:8081/v1": + logger.warning("rag_url is ignored for opensearch backend") + if self.ingest_url != "http://localhost:8082/v1": + logger.warning("ingest_url is ignored for opensearch backend") + if self.opensearch_auth_type == "basic": + has_username = self.opensearch_username or os.environ.get("OPENSEARCH_USERNAME") + has_password = self.opensearch_password or os.environ.get("OPENSEARCH_PASSWORD") + if not has_username or not has_password: + logger.warning( + "OpenSearch basic auth selected but username/password are not fully configured. " + "Set opensearch_username/opensearch_password or OPENSEARCH_USERNAME/OPENSEARCH_PASSWORD." + ) + if not self.opensearch_verify_certs: + logger.warning("TLS verification disabled for opensearch. 
Use only in trusted environments.") + return self @@ -144,8 +343,48 @@ def _setup_backend(config: KnowledgeRetrievalConfig, summary_llm_obj=None) -> tu **summary_config, } + elif backend == "opensearch": + import knowledge_layer.opensearch.adapter # noqa: F401 + + os.environ.setdefault("OPENSEARCH_URL", config.opensearch_url) + backend_config = { + "endpoint": config.opensearch_url, + "auth_type": config.opensearch_auth_type, + "username": config.opensearch_username, + "password": config.opensearch_password, + "verify_certs": config.opensearch_verify_certs, + "ca_certs": config.opensearch_ca_certs, + "aws_region": config.opensearch_aws_region, + "aws_service": config.opensearch_aws_service, + "index_prefix": config.opensearch_index_prefix, + "vector_field": config.opensearch_vector_field, + "text_field": config.opensearch_text_field, + "embedding_dim": config.opensearch_embedding_dim, + "engine": config.opensearch_engine, + "space_type": config.opensearch_space_type, + "m": config.opensearch_m, + "ef_construction": config.opensearch_ef_construction, + "ef_search": config.opensearch_ef_search, + "timeout": config.opensearch_timeout, + "max_retries": config.opensearch_max_retries, + "bulk_batch_size": config.opensearch_bulk_batch_size, + "embedding_batch_size": config.opensearch_embedding_batch_size, + "chunk_size": config.opensearch_chunk_size, + "chunk_overlap": config.opensearch_chunk_overlap, + "allow_document_ids": config.opensearch_allow_document_ids, + "bulk_refresh": config.opensearch_bulk_refresh, + "aoss_delete_max_batches": config.opensearch_aoss_delete_max_batches, + "aoss_delete_backoff_seconds": config.opensearch_aoss_delete_backoff_seconds, + "ingestion_mode": config.opensearch_ingestion_mode, + "dask_scheduler_address": config.opensearch_dask_scheduler_address, + "dask_file_transfer": config.opensearch_dask_file_transfer, + "embed_model": config.embed_model, + "embed_base_url": config.embed_base_url, + **summary_config, + } + else: - raise 
ValueError(f"Unknown backend: {backend}. Use 'llamaindex' or 'foundational_rag'.") + raise ValueError(f"Unknown backend: {backend}. Use 'llamaindex', 'foundational_rag', or 'opensearch'.") os.environ["KNOWLEDGE_RETRIEVER_BACKEND"] = backend os.environ["KNOWLEDGE_INGESTOR_BACKEND"] = backend @@ -237,7 +476,7 @@ async def knowledge_retrieval(config: KnowledgeRetrievalConfig, _builder: Builde This function provides semantic search over documents that have been previously ingested into the knowledge layer. It supports multiple - backends (LlamaIndex, Foundational RAG) and returns formatted results + backends (LlamaIndex, Foundational RAG, OpenSearch) and returns formatted results suitable for LLM consumption. The retriever and ingestor are initialized once when the function is diff --git a/tests/knowledge_layer_tests/run_adapter_compliance.py b/tests/knowledge_layer_tests/run_adapter_compliance.py index 5564fb0f..99ef916e 100755 --- a/tests/knowledge_layer_tests/run_adapter_compliance.py +++ b/tests/knowledge_layer_tests/run_adapter_compliance.py @@ -18,10 +18,12 @@ # Quick mode - registration check only (no files/services needed) python tests/knowledge_layer_tests/run_adapter_compliance.py --backend llamaindex --quick python tests/knowledge_layer_tests/run_adapter_compliance.py --backend foundational_rag --quick + python tests/knowledge_layer_tests/run_adapter_compliance.py --backend opensearch --quick # Full mode - complete ingestion + retrieval test python tests/knowledge_layer_tests/run_adapter_compliance.py --backend llamaindex python tests/knowledge_layer_tests/run_adapter_compliance.py --backend foundational_rag + python tests/knowledge_layer_tests/run_adapter_compliance.py --backend opensearch Exit codes: 0 - All tests passed @@ -92,6 +94,7 @@ def _import_backend(self): backend_imports = { "llamaindex": "knowledge_layer.llamaindex", "foundational_rag": "knowledge_layer.foundational_rag", + "opensearch": "knowledge_layer.opensearch", } module_name = 
backend_imports.get(self.backend.lower()) @@ -451,7 +454,9 @@ def main(): epilog=__doc__, ) - parser.add_argument("--backend", "-b", required=True, help="Backend name (e.g., llamaindex, foundational_rag)") + parser.add_argument( + "--backend", "-b", required=True, help="Backend name (e.g., llamaindex, foundational_rag, opensearch)" + ) parser.add_argument("--config", "-c", default="{}", help="Backend config as JSON string (default: {})") diff --git a/tests/knowledge_layer_tests/test_opensearch_adapter.py b/tests/knowledge_layer_tests/test_opensearch_adapter.py new file mode 100644 index 00000000..55a19a9a --- /dev/null +++ b/tests/knowledge_layer_tests/test_opensearch_adapter.py @@ -0,0 +1,1132 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for the OpenSearch Knowledge Layer adapter.""" + +import asyncio +import threading +import time +from datetime import UTC +from datetime import datetime +from datetime import timedelta +from pathlib import Path +from typing import Any + +import pytest +from knowledge_layer.opensearch import adapter as opensearch_adapter +from knowledge_layer.opensearch.adapter import OpenSearchIngestor +from knowledge_layer.opensearch.adapter import OpenSearchRetriever + +from aiq_agent.knowledge.schema import Chunk +from aiq_agent.knowledge.schema import ContentType +from aiq_agent.knowledge.schema import FileInfo +from aiq_agent.knowledge.schema import FileStatus +from aiq_agent.knowledge.schema import JobState + + +class FakeOpenSearchIndices: + def __init__(self, client: "FakeOpenSearchClient"): + self._client = client + + def exists(self, index: str) -> bool: + return index in self._client.indexes + + def create(self, index: str, body: dict[str, Any]) -> dict[str, Any]: + with self._client.lock: + self._client.indexes[index] = body + self._client.docs.setdefault(index, {}) + return {"acknowledged": True} + + def delete(self, index: str) -> dict[str, Any]: + with self._client.lock: + self._client.indexes.pop(index, None) + self._client.docs.pop(index, None) + return {"acknowledged": True} + + def get(self, index: str) -> dict[str, Any]: + with self._client.lock: + if "*" in index: + prefix = index.rstrip("*") + return { + name: {"mappings": body.get("mappings", {})} + for name, body in self._client.indexes.items() + if name.startswith(prefix) + } + if index not in self._client.indexes: + raise KeyError(index) + return {index: {"mappings": self._client.indexes[index].get("mappings", {})}} + + def put_mapping(self, index: str, body: dict[str, Any]) -> dict[str, Any]: + with self._client.lock: + mappings = self._client.indexes[index].setdefault("mappings", {}) + if "_meta" in body: + mappings["_meta"] = body["_meta"] + return {"acknowledged": True} + + +class 
FakeOpenSearchClient: + def __init__(self): + self.lock = threading.RLock() + self.indexes: dict[str, dict[str, Any]] = {} + self.docs: dict[str, dict[str, dict[str, Any]]] = {} + self.indices = FakeOpenSearchIndices(self) + + def count(self, index: str) -> dict[str, int]: + with self.lock: + return {"count": len(self.docs.get(index, {}))} + + def bulk(self, body: list[dict[str, Any]], refresh: bool = True, request_timeout: int | None = None): + del refresh, request_timeout + with self.lock: + for action, doc in zip(body[0::2], body[1::2], strict=True): + index = action["index"]["_index"] + doc_id = action["index"]["_id"] + self.docs.setdefault(index, {})[doc_id] = doc + return {"errors": False} + + def search(self, index: str, body: dict[str, Any], request_timeout: int | None = None) -> dict[str, Any]: + del request_timeout + with self.lock: + docs = self.docs.get(index, {}) + aggs = (body or {}).get("aggs") or {} + if "by_file" in aggs: + # List-files composite aggregation: group by field, paginate via after_key. + by_file = aggs["by_file"] + composite = by_file.get("composite") or {} + source_field = composite["sources"][0]["file_name"]["terms"]["field"] + page_size = int(composite.get("size", 10)) + after_key = composite.get("after") or {} + after_value = after_key.get("file_name") + source_filter = by_file["aggs"]["doc"]["top_hits"].get("_source") + grouped: dict[str, list[dict[str, Any]]] = {} + for doc_id, doc in docs.items(): + grouped.setdefault(doc.get(source_field, "unknown"), []).append({"_id": doc_id, "_source": doc}) + # composite iterates keys in deterministic (sorted) order. 
+ ordered_keys = sorted(grouped) + if after_value is not None: + ordered_keys = [k for k in ordered_keys if k > after_value] + page_keys = ordered_keys[:page_size] + buckets = [] + for key in page_keys: + group_docs = grouped[key] + ct_counts: dict[str, int] = {} + for hit in group_docs: + ct = hit["_source"].get("content_type") + if ct: + ct_counts[ct] = ct_counts.get(ct, 0) + 1 + top_source = self._filter_source(group_docs[0]["_source"], source_filter) + buckets.append( + { + "key": {"file_name": key}, + "doc_count": len(group_docs), + "doc": {"hits": {"hits": [{"_source": top_source}]}}, + "content_types": {"buckets": [{"key": k, "doc_count": v} for k, v in ct_counts.items()]}, + } + ) + agg_result: dict[str, Any] = {"buckets": buckets} + # Emit after_key only when more pages remain — mirrors real OpenSearch. + if len(page_keys) == page_size and len(ordered_keys) > page_size: + agg_result["after_key"] = {"file_name": page_keys[-1]} + return {"hits": {"hits": []}, "aggregations": {"by_file": agg_result}} + hits = [] + for doc_id, source_doc in docs.items(): + source = self._filter_source(source_doc, body.get("_source")) + hits.append({"_id": doc_id, "_index": index, "_score": 0.87, "_source": source}) + return {"hits": {"hits": hits[: body.get("size", 10)]}} + + def delete_by_query( + self, + index: str, + body: dict[str, Any], + refresh: bool = True, + conflicts: str = "proceed", + ) -> dict[str, int]: + del refresh, conflicts + should_terms = body["query"]["bool"]["should"] + deleted = 0 + with self.lock: + for doc_id, doc in list(self.docs.get(index, {}).items()): + if any(self._matches_term(doc, term.get("term", {})) for term in should_terms): + self.docs[index].pop(doc_id, None) + deleted += 1 + return {"deleted": deleted} + + def ping(self) -> bool: + return True + + def _filter_source(self, source: dict[str, Any], source_filter: Any) -> dict[str, Any]: + if isinstance(source_filter, list): + return {key: source[key] for key in source_filter if key in 
source} + if isinstance(source_filter, dict): + excluded = set(source_filter.get("excludes", [])) + return {key: value for key, value in source.items() if key not in excluded} + return dict(source) + + def _matches_term(self, doc: dict[str, Any], term: dict[str, Any]) -> bool: + for key, value in term.items(): + if doc.get(key) == value: + return True + return False + + +class FakeAossTransport: + def __init__(self): + self.requests: list[tuple[str, str]] = [] + + def perform_request(self, method: str, path: str): + self.requests.append((method, path)) + return "" + + +class FakeAossClient: + def __init__(self): + self.bulk_body: list[dict[str, Any]] | None = None + self.bulk_bodies: list[list[dict[str, Any]]] = [] + self.bulk_refresh: bool | str | None = None + self.search_hits: list[dict[str, Any]] = [] + self.search_responses: list[list[dict[str, Any]]] = [] + self.search_calls = 0 + self.transport = FakeAossTransport() + + def ping(self) -> bool: + return False + + def bulk(self, body: list[dict[str, Any]], refresh: bool = True, request_timeout: int | None = None): + del request_timeout + self.bulk_body = body + self.bulk_bodies.append(body) + self.bulk_refresh = refresh + return {"errors": False} + + def search(self, index: str, body: dict[str, Any], request_timeout: int | None = None) -> dict[str, Any]: + del index, body, request_timeout + self.search_calls += 1 + if self.search_responses: + return {"hits": {"hits": self.search_responses.pop(0)}} + hits = self.search_hits + self.search_hits = [] + return {"hits": {"hits": hits}} + + +class FakeFuture: + def __init__(self, result: dict[str, Any] | None = None, error: Exception | None = None): + self._result = result + self._error = error + + def result(self): + if self._error: + raise self._error + return self._result + + +class FakeDaskClient: + def __init__(self, submit_raises: Exception | None = None): + self.submissions: list[dict[str, Any]] = [] + self.closed = False + self._submit_raises = submit_raises 
+ + def close(self) -> None: + self.closed = True + + def submit(self, fn, config, payloads, collection_name, **kwargs): + if self._submit_raises is not None: + raise self._submit_raises + self.submissions.append( + { + "fn": fn, + "config": config, + "payloads": payloads, + "collection_name": collection_name, + "kwargs": kwargs, + } + ) + return FakeFuture( + { + "status": "completed", + "files": [ + { + "file_id": payload["file_id"], + "file_name": payload["file_name"], + "status": "success", + "chunks_created": 1, + } + for payload in payloads + ], + "total_chunks": len(payloads), + "index_name": "aiq-dask-docs", + "embedding_model": "nvidia/test-embed", + } + ) + + +def test_opensearch_backend_registers_with_factory(): + from aiq_agent.knowledge import factory + from aiq_agent.knowledge.factory import get_ingestor + from aiq_agent.knowledge.factory import get_retriever + + factory._INGESTOR_INSTANCES.pop("opensearch", None) + + ingestor = get_ingestor("opensearch", {"endpoint": "localhost:9200"}) + retriever = get_retriever("opensearch", {"endpoint": "localhost:9200"}) + + assert ingestor.backend_name == "opensearch" + assert retriever.backend_name == "opensearch" + assert ingestor.endpoint == "http://localhost:9200" + assert retriever.endpoint == "http://localhost:9200" + + +def test_aoss_health_check_falls_back_to_cat_indices(): + ingestor = OpenSearchIngestor({"auth_type": "sigv4", "aws_service": "aoss"}) + fake_client = FakeAossClient() + ingestor._client = fake_client + + assert asyncio.run(ingestor.health_check()) + assert fake_client.transport.requests == [("GET", "/_cat/indices")] + + +def test_aoss_bulk_index_omits_document_ids_and_explicit_refresh(): + ingestor = OpenSearchIngestor({"auth_type": "sigv4", "aws_service": "aoss"}) + fake_client = FakeAossClient() + ingestor._client = fake_client + + ingestor._bulk_index_documents( + "aiq-aoss-test", + [ + { + "chunk_id": "chunk-1", + "file_id": "file-1", + "file_name": "doc.txt", + "content": "hello", + 
"embedding": [0.1, 0.2, 0.3, 0.4], + } + ], + ) + + assert fake_client.bulk_body is not None + assert fake_client.bulk_body[0] == {"index": {"_index": "aiq-aoss-test"}} + assert fake_client.bulk_refresh is False + + +def test_aoss_delete_searches_then_bulk_deletes_generated_ids(): + ingestor = OpenSearchIngestor({"auth_type": "sigv4", "aws_service": "aoss", "bulk_batch_size": 2}) + fake_client = FakeAossClient() + fake_client.search_hits = [{"_id": "generated-1"}, {"_id": "generated-2"}] + ingestor._client = fake_client + + deleted = ingestor._delete_file_documents_for_aoss( + "aiq-aoss-test", + { + "query": { + "bool": { + "should": [{"term": {"file_id": "file-1"}}], + "minimum_should_match": 1, + } + } + }, + ) + + assert deleted == 2 + assert fake_client.bulk_body == [ + {"delete": {"_index": "aiq-aoss-test", "_id": "generated-1"}}, + {"delete": {"_index": "aiq-aoss-test", "_id": "generated-2"}}, + ] + assert fake_client.bulk_refresh is False + + +def test_aoss_delete_stops_after_repeated_stale_hits(): + ingestor = OpenSearchIngestor( + { + "auth_type": "sigv4", + "aws_service": "aoss", + "bulk_batch_size": 2, + "aoss_delete_backoff_seconds": 0, + } + ) + fake_client = FakeAossClient() + fake_client.search_responses = [[{"_id": "generated-1"}], [{"_id": "generated-1"}], [{"_id": "generated-1"}]] + ingestor._client = fake_client + + deleted = ingestor._delete_file_documents_for_aoss( + "aiq-aoss-test", + { + "query": { + "bool": { + "should": [{"term": {"file_id": "file-1"}}], + "minimum_should_match": 1, + } + } + }, + ) + + assert deleted == 1 + assert fake_client.search_calls == 3 + assert fake_client.bulk_bodies == [[{"delete": {"_index": "aiq-aoss-test", "_id": "generated-1"}}]] + + +def test_index_name_helpers_are_opensearch_safe(): + assert opensearch_adapter._sanitize_index_part("Tenant A / Session +1") == "tenant-a-session-1" + assert opensearch_adapter._sanitize_index_part("+++Bad") == "bad" + assert len(opensearch_adapter._trim_index_name("a" * 300)) 
<= 255 + + +def test_session_collection_names_are_safe_dynamic_indexes(): + ingestor = OpenSearchIngestor({"index_prefix": "aiq-prod", "start_ttl_cleanup": False}) + collection_name = "s_123E4567-E89B-12D3-A456-426614174000" + + assert ingestor._index_name_for_collection(collection_name) == "aiq-prod-s_123e4567-e89b-12d3-a456-426614174000" + + +def test_index_mapping_keeps_metadata_strings_filterable(): + ingestor = OpenSearchIngestor({"embedding_dim": 4, "start_ttl_cleanup": False}) + + mapping = ingestor._index_mapping("docs") + + assert mapping["mappings"]["dynamic_templates"] == [ + { + "metadata_strings": { + "path_match": "metadata.*", + "match_mapping_type": "string", + "mapping": {"type": "keyword", "ignore_above": 1024}, + } + } + ] + + +def test_search_body_includes_knn_filter(): + retriever = OpenSearchRetriever({"embedding_dim": 4, "vector_field": "vec"}) + + body = retriever._build_search_body([0.1, 0.2, 0.3, 0.4], 3, {"file_name": "report.pdf", "topic": "roadmap"}) + + assert body["size"] == 3 + assert body["query"]["knn"]["vec"]["vector"] == [0.1, 0.2, 0.3, 0.4] + assert body["query"]["knn"]["vec"]["k"] == 3 + assert body["_source"]["excludes"] == ["vec"] + assert body["query"]["knn"]["vec"]["filter"] == { + "bool": { + "filter": [ + {"term": {"file_name": "report.pdf"}}, + {"term": {"metadata.topic": "roadmap"}}, + ] + } + } + + +def test_normalize_maps_opensearch_hit_to_chunk(): + retriever = OpenSearchRetriever({"text_field": "body"}) + chunk = retriever.normalize( + { + "_id": "doc-1", + "_index": "aiq-default", + "_score": 0.91, + "_source": { + "body": "OpenSearch content", + "file_name": "report.pdf", + "page_number": 2, + "content_type": "text", + "metadata": {"section": "intro"}, + }, + } + ) + + assert isinstance(chunk, Chunk) + assert chunk.chunk_id == "doc-1" + assert chunk.content == "OpenSearch content" + assert chunk.score == 0.91 + assert chunk.content_type == ContentType.TEXT + assert chunk.display_citation == "report.pdf, p.2" + 
assert chunk.metadata["section"] == "intro" + assert chunk.metadata["index"] == "aiq-default" + + +def test_ingestion_and_retrieval_with_fake_client(tmp_path): + fake_client = FakeOpenSearchClient() + test_file = tmp_path / "doc.txt" + test_file.write_text( + "OpenSearch stores document chunks as vectors for AIQ retrieval. " + "The adapter creates one index per collection and preserves citations.", + encoding="utf-8", + ) + + ingestor = OpenSearchIngestor( + { + "endpoint": "localhost:9200", + "embedding_dim": 4, + "chunk_size": 6, + "chunk_overlap": 1, + "index_prefix": "aiq-test", + } + ) + ingestor._client = fake_client + ingestor._embed_texts = lambda texts: [[0.1, 0.2, 0.3, 0.4] for _ in texts] + + file_info = ingestor.upload_file(str(test_file), "collection_a", metadata={"tenant": "alpha"}) + deadline = time.time() + 5 + job = ingestor.get_job_status(file_info.metadata["job_id"]) + while time.time() < deadline and not job.is_terminal: + time.sleep(0.05) + job = ingestor.get_job_status(file_info.metadata["job_id"]) + + assert job.status == JobState.COMPLETED + + status = ingestor.get_file_status(file_info.file_id, "collection_a") + assert status is not None + assert status.status == FileStatus.SUCCESS + assert status.chunk_count > 0 + + files = ingestor.list_files("collection_a") + assert len(files) == 1 + assert files[0].file_name == "doc.txt" + assert files[0].chunk_count == status.chunk_count + assert files[0].metadata["tenant"] == "alpha" + + retriever = OpenSearchRetriever( + { + "endpoint": "localhost:9200", + "embedding_dim": 4, + "index_prefix": "aiq-test", + } + ) + retriever._client = fake_client + retriever._embed_texts = lambda texts: [[0.1, 0.2, 0.3, 0.4] for _ in texts] + + result = asyncio.run(retriever.retrieve("How are chunks stored?", "collection_a", top_k=2)) + + assert result.success + assert result.backend == "opensearch" + assert result.chunks + assert result.chunks[0].file_name == "doc.txt" + assert result.chunks[0].display_citation == 
"doc.txt" + assert result.chunks[0].metadata["tenant"] == "alpha" + # source_path leaks internal filesystem paths (e.g. /tmp/tmpXXX.pdf in byte-upload + # paths) into API responses and LLM context — must never appear in retrieved metadata. + assert "source_path" not in result.chunks[0].metadata + + assert ingestor.delete_file(file_info.file_id, "collection_a") + assert ingestor.list_files("collection_a") == [] + + +def test_indexed_documents_omit_internal_source_path(tmp_path): + """source_path leaks internal filesystem paths (such as /tmp/tmpXXX.pdf temp files + used in Dask and byte-upload modes) into the OpenSearch index, and via normalize() + into every Chunk returned to API consumers and LLM context windows. Indexed docs + must never carry it in metadata.""" + fake_client = FakeOpenSearchClient() + test_file = tmp_path / "leak-check.txt" + test_file.write_text("Short payload to chunk.", encoding="utf-8") + + ingestor = OpenSearchIngestor( + { + "endpoint": "localhost:9200", + "embedding_dim": 4, + "chunk_size": 8, + "chunk_overlap": 1, + "index_prefix": "aiq-leak", + } + ) + ingestor._client = fake_client + ingestor._embed_texts = lambda texts: [[0.0, 0.0, 0.0, 0.0] for _ in texts] + + file_info = ingestor.upload_file(str(test_file), "collection_leak") + deadline = time.time() + 5 + job = ingestor.get_job_status(file_info.metadata["job_id"]) + while time.time() < deadline and not job.is_terminal: + time.sleep(0.05) + job = ingestor.get_job_status(file_info.metadata["job_id"]) + assert job.status == JobState.COMPLETED + + # Inspect every indexed document directly — the fake stores them under docs[index]. 
+ indexed_docs = [doc for index_docs in fake_client.docs.values() for doc in index_docs.values()] + assert indexed_docs, "expected at least one chunk to be indexed" + for doc in indexed_docs: + metadata = doc.get("metadata") or {} + assert "source_path" not in metadata, f"source_path leaked into indexed metadata: {metadata!r}" + + +def test_ttl_cleanup_deletes_only_expired_opensearch_session_indexes(): + fake_client = FakeOpenSearchClient() + ingestor = OpenSearchIngestor({"index_prefix": "aiq-ttl", "start_ttl_cleanup": False}) + ingestor._client = fake_client + ingestor._ttl_hours = 24 + ingestor._cleanup_interval_seconds = 3600 + + old_collection = "s_old_session" + new_collection = "s_new_session" + ingestor.create_collection(old_collection) + ingestor.create_collection(new_collection) + old_index = ingestor._index_name_for_collection(old_collection) + new_index = ingestor._index_name_for_collection(new_collection) + old_meta = fake_client.indexes[old_index]["mappings"]["_meta"] + old_meta["updated_at"] = (datetime.now(UTC) - timedelta(hours=25)).isoformat() + new_meta = fake_client.indexes[new_index]["mappings"]["_meta"] + new_meta["updated_at"] = datetime.now(UTC).isoformat() + fake_client.indexes["aiq-ttl-unrelated"] = { + "mappings": { + "_meta": { + "backend": "other", + "collection_name": "unrelated", + "updated_at": (datetime.now(UTC) - timedelta(hours=25)).isoformat(), + } + } + } + + ingestor._cleanup_expired_collections() + + assert old_index not in fake_client.indexes + assert new_index in fake_client.indexes + assert "aiq-ttl-unrelated" in fake_client.indexes + + +def test_dask_ingestion_submits_bytes_payload_and_updates_job(tmp_path): + fake_dask = FakeDaskClient() + fake_client = FakeOpenSearchClient() + test_file = tmp_path / "dask.txt" + test_file.write_text("distributed opensearch ingestion", encoding="utf-8") + ingestor = OpenSearchIngestor( + { + "ingestion_mode": "dask", + "dask_scheduler_address": "tcp://scheduler:8786", + 
"dask_file_transfer": "bytes", + "embed_model": "nvidia/test-embed", + "start_ttl_cleanup": False, + } + ) + ingestor._client = fake_client + ingestor._create_dask_client = lambda: fake_dask + + job_id = ingestor.submit_job( + [str(test_file)], + "docs", + config={"original_filenames": ["dask.txt"], "metadata": {"tenant": "aws"}}, + ) + + deadline = time.time() + 5 + job = ingestor.get_job_status(job_id) + while time.time() < deadline and not job.is_terminal: + time.sleep(0.05) + job = ingestor.get_job_status(job_id) + + assert job.status == JobState.COMPLETED + assert job.metadata["ingestion_mode"] == "dask" + assert fake_dask.submissions + submission = fake_dask.submissions[0] + assert submission["collection_name"] == "docs" + assert submission["config"]["start_ttl_cleanup"] is False + assert "summary_llm" not in submission["config"] + assert submission["payloads"][0]["file_name"] == "dask.txt" + assert submission["payloads"][0]["data"] == b"distributed opensearch ingestion" + assert "path" not in submission["payloads"][0] + status = ingestor.get_file_status(job.file_details[0].file_id, "docs") + assert status is not None + assert status.status == FileStatus.SUCCESS + assert status.chunk_count == 1 + + +def test_delete_file_preserves_inflight_tracking_when_nothing_deleted(): + """If delete_file finds no documents to delete (file still UPLOADING or + INGESTING, or already gone), in-memory tracking must be left intact so + get_file_status keeps returning the live job state.""" + fake_client = FakeOpenSearchClient() + ingestor = OpenSearchIngestor( + { + "endpoint": "localhost:9200", + "embedding_dim": 4, + "index_prefix": "aiq-test", + } + ) + ingestor._client = fake_client + + # Create an empty index — exists() returns True but delete_by_query finds nothing. + fake_client.indices.create(index="aiq-test-c", body={}) + + # Mark a file as INGESTING in-memory; not yet in the index (job still running). 
+ file_id = "inflight-1" + ingestor._files[file_id] = FileInfo( + file_id=file_id, + file_name="in-flight.pdf", + collection_name="c", + status=FileStatus.INGESTING, + file_size=100, + uploaded_at=datetime.now(tz=UTC), + metadata={"job_id": "job-1"}, + ) + + result = ingestor.delete_file(file_id, "c") + + assert result is False, "delete_file must return False when no documents were deleted" + assert file_id in ingestor._files, ( + "in-memory tracking must survive a no-op delete so an INGESTING job is not silently dropped" + ) + status = ingestor.get_file_status(file_id, "c") + assert status is not None, "get_file_status must still see the in-flight entry" + assert status.status == FileStatus.INGESTING + + +def test_dask_client_closed_when_submit_raises(tmp_path): + """If client.submit() raises (scheduler unreachable, serialisation error, + key conflict), _start_dask_ingestion must close the Dask client before + propagating so the scheduler TCP connection does not leak across retries.""" + test_file = tmp_path / "dask.txt" + test_file.write_text("doc", encoding="utf-8") + ingestor = OpenSearchIngestor( + { + "ingestion_mode": "dask", + "dask_scheduler_address": "tcp://scheduler:8786", + "dask_file_transfer": "bytes", + "start_ttl_cleanup": False, + } + ) + fake_dask = FakeDaskClient(submit_raises=RuntimeError("serialisation error")) + ingestor._create_dask_client = lambda: fake_dask + + job_id = ingestor.submit_job([str(test_file)], "docs") + job = ingestor.get_job_status(job_id) + + assert job.status == JobState.FAILED + assert "serialisation error" in job.error_message + assert fake_dask.closed, "Dask client must be closed when submit() raises" + + +def test_dask_ingestion_submission_failure_marks_job_failed(tmp_path): + test_file = tmp_path / "dask.txt" + test_file.write_text("distributed opensearch ingestion", encoding="utf-8") + ingestor = OpenSearchIngestor( + { + "ingestion_mode": "dask", + "dask_scheduler_address": "tcp://scheduler:8786", + 
"start_ttl_cleanup": False, + } + ) + ingestor._create_dask_client = lambda: (_ for _ in ()).throw(RuntimeError("scheduler unavailable")) + + job_id = ingestor.submit_job([str(test_file)], "docs") + job = ingestor.get_job_status(job_id) + + assert job.status == JobState.FAILED + assert "scheduler unavailable" in job.error_message + + +def test_dask_worker_task_constructs_backend_in_worker(monkeypatch): + from knowledge_layer.opensearch.distributed import run_opensearch_ingestion_task + + captured: dict[str, Any] = {} + + class WorkerIngestor: + def __init__(self, config: dict[str, Any]): + captured["config"] = config + self.text_field = "content" + self.vector_field = "embedding" + self.embed_model_name = config["embed_model"] + + def _ensure_index(self, collection_name: str) -> str: + captured["collection_name"] = collection_name + return "aiq-docs" + + def _documents_for_file( + self, + file_path: str, + file_id: str, + file_name: str, + file_metadata: dict[str, Any] | None = None, + ): + captured["worker_file_exists"] = Path(file_path).exists() + return ( + [ + { + "chunk_id": "chunk-1", + "file_id": file_id, + "file_name": file_name, + "content": Path(file_path).read_text(encoding="utf-8"), + "metadata": file_metadata or {}, + } + ], + "summary", + ) + + def _embed_texts(self, texts: list[str]) -> list[list[float]]: + captured["texts"] = texts + return [[0.1, 0.2, 0.3, 0.4]] + + def _bulk_index_documents(self, index_name: str, documents: list[dict[str, Any]]) -> None: + captured["index_name"] = index_name + captured["documents"] = documents + + def _update_collection_timestamp(self, collection_name: str) -> None: + captured["updated_collection"] = collection_name + + monkeypatch.setattr(opensearch_adapter, "OpenSearchIngestor", WorkerIngestor) + + result = run_opensearch_ingestion_task( + { + "auth_type": "sigv4", + "aws_service": "aoss", + "aws_region": "us-east-1", + "embed_model": "nvidia/test-embed", + "summary_llm": "not-worker-serializable", + }, + [ + { 
+ "file_id": "file-1", + "file_name": "worker.txt", + "data": b"worker opensearch ingestion", + "suffix": ".txt", + "metadata": {"suite": "dask"}, + } + ], + "docs", + ) + + assert result["status"] == "completed" + assert captured["config"]["auth_type"] == "sigv4" + assert captured["config"]["aws_service"] == "aoss" + assert captured["config"]["start_ttl_cleanup"] is False + assert captured["config"]["generate_summary"] is False + assert "summary_llm" not in captured["config"] + assert captured["worker_file_exists"] is True + assert captured["documents"][0]["embedding"] == [0.1, 0.2, 0.3, 0.4] + assert captured["documents"][0]["metadata"]["suite"] == "dask" + + +def test_setup_backend_passes_opensearch_yaml_config(monkeypatch): + pytest.importorskip("nat") + from knowledge_layer.register import KnowledgeRetrievalConfig + from knowledge_layer.register import _setup_backend + + monkeypatch.setenv("OPENSEARCH_USERNAME", "env-user") + monkeypatch.setenv("OPENSEARCH_PASSWORD", "env-pass") + + config = KnowledgeRetrievalConfig( + backend="opensearch", + collection_name="docs", + opensearch_url="search.example.com", + opensearch_auth_type="sigv4", + opensearch_aws_region="us-west-2", + opensearch_aws_service="aoss", + opensearch_index_prefix="tenant-a", + opensearch_embedding_dim=4, + opensearch_chunk_size=200, + opensearch_chunk_overlap=20, + embed_model="nvidia/test-embed", + embed_base_url="https://integrate.example/v1", + ) + + backend, backend_config = _setup_backend(config) + + assert backend == "opensearch" + assert backend_config["endpoint"] == "search.example.com" + assert backend_config["auth_type"] == "sigv4" + assert backend_config["aws_region"] == "us-west-2" + assert backend_config["aws_service"] == "aoss" + assert backend_config["index_prefix"] == "tenant-a" + assert backend_config["embedding_dim"] == 4 + assert backend_config["chunk_size"] == 200 + assert backend_config["chunk_overlap"] == 20 + assert backend_config["embed_model"] == "nvidia/test-embed" + 
assert backend_config["embed_base_url"] == "https://integrate.example/v1" + + +def test_setup_backend_uses_opensearch_environment_defaults(monkeypatch): + pytest.importorskip("nat") + from knowledge_layer.register import KnowledgeRetrievalConfig + from knowledge_layer.register import _setup_backend + + monkeypatch.setenv("OPENSEARCH_URL", "https://env.us-east-1.aoss.amazonaws.com") + monkeypatch.setenv("OPENSEARCH_AUTH_TYPE", "sigv4") + monkeypatch.setenv("AWS_REGION", "us-east-1") + monkeypatch.setenv("OPENSEARCH_AWS_SERVICE", "aoss") + monkeypatch.setenv("OPENSEARCH_INDEX_PREFIX", "aiq-env") + monkeypatch.setenv("OPENSEARCH_INGESTION_MODE", "auto") + monkeypatch.setenv("NAT_DASK_SCHEDULER_ADDRESS", "tcp://scheduler:8786") + monkeypatch.setenv("OPENSEARCH_DASK_FILE_TRANSFER", "paths") + + config = KnowledgeRetrievalConfig(backend="opensearch") + backend, backend_config = _setup_backend(config) + + assert backend == "opensearch" + assert backend_config["endpoint"] == "https://env.us-east-1.aoss.amazonaws.com" + assert backend_config["auth_type"] == "sigv4" + assert backend_config["aws_region"] == "us-east-1" + assert backend_config["aws_service"] == "aoss" + assert backend_config["index_prefix"] == "aiq-env" + assert backend_config["ingestion_mode"] == "auto" + assert backend_config["dask_scheduler_address"] == "tcp://scheduler:8786" + assert backend_config["dask_file_transfer"] == "paths" + + +def test_ingestor_embed_raises_when_hosted_api_and_missing_key(monkeypatch): + """Hosted NVIDIA API with no key should raise a clear error before HTTP.""" + monkeypatch.delenv("NVIDIA_API_KEY", raising=False) + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + ingestor = OpenSearchIngestor( + { + "endpoint": "http://localhost:9200", + "auth_type": "none", + "embed_base_url": "https://integrate.api.nvidia.com/v1", + "start_ttl_cleanup": False, + } + ) + with pytest.raises(RuntimeError, match="NVIDIA_API_KEY"): + ingestor._embed_texts(["hello world"]) + + 
+def test_ingestor_embed_allows_local_nim_without_key(monkeypatch): + """Self-hosted NIM with no key should pass through without complaint.""" + monkeypatch.delenv("NVIDIA_API_KEY", raising=False) + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + ingestor = OpenSearchIngestor( + { + "endpoint": "http://localhost:9200", + "auth_type": "none", + "embed_base_url": "http://nim-embed.ns-nim.svc.cluster.local:8000/v1", + "start_ttl_cleanup": False, + } + ) + + class _FakeOpenAI: + def __init__(self, base_url, api_key): + fake_emb = type("D", (), {"embedding": [0.0] * 4})() + fake_resp = type("R", (), {"data": [fake_emb]})() + self.embeddings = type("E", (), {"create": staticmethod(lambda **kw: fake_resp)})() + + monkeypatch.setattr("openai.OpenAI", _FakeOpenAI) + result = ingestor._embed_texts(["hello"]) + assert result == [[0.0, 0.0, 0.0, 0.0]] + + +def test_ensure_index_recovers_when_concurrent_create_races(monkeypatch): + """Two concurrent jobs both see not-exists and both call create(); the + losing call must not raise — re-check exists() and treat the index as + ready if another worker already created it.""" + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + from opensearchpy.exceptions import RequestError + + ingestor = OpenSearchIngestor( + { + "endpoint": "http://localhost:9200", + "auth_type": "none", + "start_ttl_cleanup": False, + } + ) + + exists_calls: list[str] = [] + + def fake_exists(index: str) -> bool: + # First call (pre-create) returns False; subsequent re-check after the + # race loss returns True (the winning worker has created it). 
+ exists_calls.append(index) + return len(exists_calls) >= 2 + + def fake_create(index: str, body: dict) -> None: + raise RequestError( + 400, + "resource_already_exists_exception", + { + "error": { + "type": "resource_already_exists_exception", + "reason": f"index [{index}/abc] already exists", + } + }, + ) + + fake_client = type("C", (), {})() + fake_client.indices = type("I", (), {"exists": staticmethod(fake_exists), "create": staticmethod(fake_create)})() + monkeypatch.setattr(ingestor, "_get_client", lambda: fake_client) + + # Must not raise — race recovery should swallow the exists exception. + result = ingestor._ensure_index("smoke") + assert result.startswith("aiq-smoke") + assert len(exists_calls) == 2, "expected pre-create check + post-failure recovery check" + + +def test_list_files_aggregates_and_avoids_10k_hit_truncation(monkeypatch): + """list_files must request 0 hits and aggregate by file_name so collections + with more than the 10k index.max_result_window are not silently truncated. + Chunk counts must come from bucket doc_count, not from counted hits.""" + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + ingestor = OpenSearchIngestor( + { + "endpoint": "http://localhost:9200", + "auth_type": "none", + "start_ttl_cleanup": False, + } + ) + + captured_body: dict[str, Any] = {} + + def fake_search(index: str, body: dict[str, Any]) -> dict[str, Any]: + captured_body.update(body) + return { + "hits": {"hits": []}, + "aggregations": { + "by_file": { + # No after_key — single page, iteration terminates. 
+ "buckets": [ + { + "key": {"file_name": "huge.pdf"}, + "doc_count": 50_000, + "doc": { + "hits": { + "hits": [ + { + "_source": { + "file_id": "f1", + "file_name": "huge.pdf", + "file_size": 12_345_678, + "created_at": "2026-05-01T00:00:00Z", + "updated_at": "2026-05-01T00:01:00Z", + "metadata": {"k": "v"}, + } + } + ] + } + }, + "content_types": {"buckets": [{"key": "text", "doc_count": 50_000}]}, + }, + { + "key": {"file_name": "small.md"}, + "doc_count": 3, + "doc": { + "hits": { + "hits": [ + { + "_source": { + "file_id": "f2", + "file_name": "small.md", + "file_size": 1234, + "created_at": "2026-05-02T00:00:00Z", + "updated_at": "2026-05-02T00:00:30Z", + "metadata": {}, + } + } + ] + } + }, + "content_types": {"buckets": []}, + }, + ] + } + }, + } + + fake_client = type("C", (), {})() + fake_client.indices = type("I", (), {"exists": staticmethod(lambda index: True)})() + fake_client.search = staticmethod(fake_search) + monkeypatch.setattr(ingestor, "_get_client", lambda: fake_client) + + files = ingestor.list_files("smoke") + + assert captured_body.get("size") == 0, "search body must request 0 hits and aggregate instead" + by_file_agg = (captured_body.get("aggs") or {}).get("by_file") or {} + assert "composite" in by_file_agg, "must use composite aggregation to paginate exhaustively" + + by_name = {f.file_name: f for f in files} + assert set(by_name) == {"huge.pdf", "small.md"} + # 50 000 > 10 000 max_result_window — only an aggregation can carry this count. + assert by_name["huge.pdf"].chunk_count == 50_000 + assert by_name["huge.pdf"].file_size == 12_345_678 + assert by_name["huge.pdf"].metadata["content_types"] == ["text"] + assert by_name["small.md"].chunk_count == 3 + + +def test_list_files_paginates_composite_until_after_key_exhausted(monkeypatch): + """Composite aggregation pages through every distinct file_name. 
With more files + than a single page can hold, list_files must follow after_key until exhaustion — + otherwise large collections silently lose files past the first page.""" + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + + ingestor = OpenSearchIngestor( + { + "endpoint": "http://localhost:9200", + "auth_type": "none", + "start_ttl_cleanup": False, + } + ) + + # 7 distinct files; emit in pages of 3. Must take ceil(7/3) = 3 requests. + all_files = [f"f{i:02d}.txt" for i in range(7)] + page_size = 3 + search_calls: list[dict[str, Any] | None] = [] + + def fake_search(index: str, body: dict[str, Any]) -> dict[str, Any]: + composite = body["aggs"]["by_file"]["composite"] + after = composite.get("after") + search_calls.append(after) + after_value = (after or {}).get("file_name") + remaining = [name for name in all_files if after_value is None or name > after_value] + page = remaining[:page_size] + buckets = [ + { + "key": {"file_name": name}, + "doc_count": 1, + "doc": {"hits": {"hits": [{"_source": {"file_id": name, "file_name": name}}]}}, + "content_types": {"buckets": []}, + } + for name in page + ] + agg: dict[str, Any] = {"buckets": buckets} + if len(page) == page_size and len(remaining) > page_size: + agg["after_key"] = {"file_name": page[-1]} + return {"hits": {"hits": []}, "aggregations": {"by_file": agg}} + + fake_client = type("C", (), {})() + fake_client.indices = type("I", (), {"exists": staticmethod(lambda index: True)})() + fake_client.search = staticmethod(fake_search) + monkeypatch.setattr(ingestor, "_get_client", lambda: fake_client) + + files = ingestor.list_files("paginated") + + # All 7 files surface, none dropped. + assert sorted(f.file_name for f in files) == all_files + # First call has no after; subsequent calls carry the cursor; the run that returns + # < page_size buckets ends the loop without another request. 
+ assert search_calls[0] is None + assert search_calls[1] == {"file_name": "f02.txt"} + assert search_calls[2] == {"file_name": "f05.txt"} + assert len(search_calls) == 3, f"expected 3 paginated requests, got {len(search_calls)}" + + +def test_ensure_index_reraises_when_create_fails_for_other_reasons(monkeypatch): + """If create() fails for a non-race reason (index still missing after the + failure), the original exception must propagate so the caller can fail + the job rather than silently swallowing it.""" + from knowledge_layer.opensearch.adapter import OpenSearchIngestor + from opensearchpy.exceptions import RequestError + + ingestor = OpenSearchIngestor( + { + "endpoint": "http://localhost:9200", + "auth_type": "none", + "start_ttl_cleanup": False, + } + ) + + def fake_exists(index: str) -> bool: # Always not-exists, even after failure + return False + + def fake_create(index: str, body: dict) -> None: + raise RequestError( + 400, + "invalid_index_name_exception", + {"error": {"type": "invalid_index_name_exception", "reason": "bad name"}}, + ) + + fake_client = type("C", (), {})() + fake_client.indices = type("I", (), {"exists": staticmethod(fake_exists), "create": staticmethod(fake_create)})() + monkeypatch.setattr(ingestor, "_get_client", lambda: fake_client) + + with pytest.raises(RequestError): + ingestor._ensure_index("smoke") diff --git a/tests/knowledge_layer_tests/test_opensearch_live.py b/tests/knowledge_layer_tests/test_opensearch_live.py new file mode 100644 index 00000000..7dcd733e --- /dev/null +++ b/tests/knowledge_layer_tests/test_opensearch_live.py @@ -0,0 +1,214 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Live OpenSearch integration tests. + +These tests are opt-in because they create and delete real OpenSearch indexes. +They use deterministic local embeddings so they only require OpenSearch access, +not NVIDIA_API_KEY or external embedding calls. +""" + +import asyncio +import os +import time +import uuid +from collections.abc import Callable +from typing import Any + +import pytest +from knowledge_layer.opensearch.adapter import OpenSearchIngestor +from knowledge_layer.opensearch.adapter import OpenSearchRetriever + +from aiq_agent.knowledge.schema import FileStatus +from aiq_agent.knowledge.schema import JobState + + +def _env_bool(name: str, default: bool = False) -> bool: + value = os.environ.get(name) + if value is None: + return default + return value.lower() in {"1", "true", "yes", "on"} + + +pytestmark = [ + pytest.mark.integration, + pytest.mark.skipif( + not _env_bool("AIQ_OPENSEARCH_LIVE_TESTS"), + reason="Set AIQ_OPENSEARCH_LIVE_TESTS=1 to run live OpenSearch integration tests.", + ), +] + + +def _live_config() -> dict[str, Any]: + pytest.importorskip("opensearchpy") + + auth_type = os.environ.get("OPENSEARCH_AUTH_TYPE", "none").lower() + username = os.environ.get("OPENSEARCH_USERNAME") + password = os.environ.get("OPENSEARCH_PASSWORD") + + if auth_type == "basic" and (not username or not password): + pytest.fail("OPENSEARCH_AUTH_TYPE=basic requires OPENSEARCH_USERNAME and OPENSEARCH_PASSWORD.") + if auth_type == "sigv4": + pytest.importorskip("boto3") + + prefix = os.environ.get("AIQ_OPENSEARCH_LIVE_INDEX_PREFIX", "aiq-live") + run_suffix 
= uuid.uuid4().hex[:8] + + return { + "endpoint": os.environ.get("OPENSEARCH_URL", "http://localhost:9200"), + "auth_type": auth_type, + "username": username, + "password": password, + "verify_certs": _env_bool("OPENSEARCH_VERIFY_CERTS", True), + "ca_certs": os.environ.get("OPENSEARCH_CA_CERTS"), + "aws_region": os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION", "us-east-1"), + "aws_service": os.environ.get("OPENSEARCH_AWS_SERVICE", "aoss"), + "index_prefix": f"{prefix}-{run_suffix}", + "embedding_dim": 4, + "engine": os.environ.get("OPENSEARCH_ENGINE", "faiss"), + "space_type": os.environ.get("OPENSEARCH_SPACE_TYPE", "cosinesimil"), + "chunk_size": 32, + "chunk_overlap": 0, + "timeout": int(os.environ.get("OPENSEARCH_TIMEOUT", "120")), + "bulk_batch_size": 10, + } + + +def _test_embedding(text: str) -> list[float]: + text = text.lower() + if "alpha" in text: + return [1.0, 0.0, 0.0, 0.0] + if "beta" in text: + return [0.0, 1.0, 0.0, 0.0] + return [0.5, 0.5, 0.0, 0.0] + + +def _patch_embeddings(adapter: OpenSearchIngestor | OpenSearchRetriever) -> None: + adapter._embed_texts = lambda texts: [_test_embedding(text) for text in texts] + + +def _wait_for_job(ingestor: OpenSearchIngestor, job_id: str, timeout_seconds: int = 60): + deadline = time.time() + timeout_seconds + job = ingestor.get_job_status(job_id) + while time.time() < deadline and not job.is_terminal: + time.sleep(0.25) + job = ingestor.get_job_status(job_id) + return job + + +def _retrieve_with_retry( + retriever: OpenSearchRetriever, + query: str, + collection_name: str, + top_k: int = 3, + timeout_seconds: int = 30, +): + deadline = time.time() + timeout_seconds + result = None + while time.time() < deadline: + result = asyncio.run(retriever.retrieve(query, collection_name, top_k=top_k)) + if result.success and result.chunks: + return result + time.sleep(0.5) + return result + + +@pytest.fixture +def live_backend() -> tuple[OpenSearchIngestor, OpenSearchRetriever, Callable[[str], 
None]]: + config = _live_config() + ingestor = OpenSearchIngestor(config) + retriever = OpenSearchRetriever(config) + _patch_embeddings(ingestor) + _patch_embeddings(retriever) + + created_collections: list[str] = [] + + def track_collection(collection_name: str) -> None: + created_collections.append(collection_name) + + yield ingestor, retriever, track_collection + + for collection_name in reversed(created_collections): + ingestor.delete_collection(collection_name) + + +def test_live_opensearch_collection_lifecycle(live_backend): + ingestor, _, track_collection = live_backend + collection_name = f"live-lifecycle-{uuid.uuid4().hex[:8]}" + track_collection(collection_name) + + assert asyncio.run(ingestor.health_check()) + + created = ingestor.create_collection(collection_name, description="Live OpenSearch lifecycle test") + + assert created.name == collection_name + assert created.backend == "opensearch" + assert created.metadata["index_name"].startswith(ingestor.index_prefix) + + fetched = ingestor.get_collection(collection_name) + assert fetched is not None + assert fetched.name == collection_name + + listed_names = {collection.name for collection in ingestor.list_collections()} + assert collection_name in listed_names + + assert ingestor.delete_collection(collection_name) + assert ingestor.get_collection(collection_name) is None + + +def test_live_opensearch_ingest_retrieve_and_delete(tmp_path, live_backend): + ingestor, retriever, track_collection = live_backend + collection_name = f"live-ingest-{uuid.uuid4().hex[:8]}" + track_collection(collection_name) + + alpha_file = tmp_path / "alpha.txt" + beta_file = tmp_path / "beta.txt" + alpha_file.write_text("alpha alpha alpha revenue roadmap vector document", encoding="utf-8") + beta_file.write_text("beta beta beta security operations vector document", encoding="utf-8") + + ingestor.create_collection(collection_name, description="Live OpenSearch ingestion test") + job_id = ingestor.submit_job( + [str(alpha_file), 
str(beta_file)], + collection_name, + config={ + "original_filenames": ["alpha.txt", "beta.txt"], + "metadata": {"suite": "opensearch-live"}, + }, + ) + + job = _wait_for_job(ingestor, job_id) + + assert job.status == JobState.COMPLETED, job.model_dump() + assert {detail.status for detail in job.file_details} == {FileStatus.SUCCESS} + + files = ingestor.list_files(collection_name) + names = {file.file_name for file in files} + assert names == {"alpha.txt", "beta.txt"} + assert all(file.metadata["suite"] == "opensearch-live" for file in files) + + result = _retrieve_with_retry(retriever, "alpha roadmap", collection_name) + + assert result is not None + assert result.success, result.error_message + assert result.chunks + assert result.chunks[0].file_name == "alpha.txt" + assert "alpha" in result.chunks[0].content.lower() + assert result.chunks[0].metadata["suite"] == "opensearch-live" + + file_ids = {detail.file_name: detail.file_id for detail in job.file_details} + assert ingestor.delete_file(file_ids["alpha.txt"], collection_name) + + remaining_names = {file.file_name for file in ingestor.list_files(collection_name)} + assert remaining_names == {"beta.txt"} diff --git a/tests/knowledge_layer_tests/test_opensearch_serverless_live.py b/tests/knowledge_layer_tests/test_opensearch_serverless_live.py new file mode 100644 index 00000000..2f8b4dda --- /dev/null +++ b/tests/knowledge_layer_tests/test_opensearch_serverless_live.py @@ -0,0 +1,347 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Live Amazon OpenSearch Serverless integration tests. + +These tests are opt-in because they create and delete real AOSS indexes. They +force SigV4 service `aoss` and use deterministic local embeddings, so they +validate the OpenSearch Serverless data plane without requiring NVIDIA_API_KEY. + +Run with exported env vars or same-line shell assignments, for example: + AIQ_OPENSEARCH_SERVERLESS_LIVE_TESTS=1 OPENSEARCH_URL=... AWS_REGION=... uv run python -m pytest ... +""" + +import asyncio +import os +import re +import time +import uuid +from collections.abc import Callable +from typing import Any + +import pytest +from botocore.exceptions import BotoCoreError +from botocore.exceptions import NoCredentialsError +from knowledge_layer.opensearch.adapter import OpenSearchIngestor +from knowledge_layer.opensearch.adapter import OpenSearchRetriever + +from aiq_agent.knowledge.schema import FileStatus +from aiq_agent.knowledge.schema import JobState + +pytestmark = [ + pytest.mark.aws, + pytest.mark.integration, + pytest.mark.opensearch_serverless, + pytest.mark.skipif( + os.environ.get("AIQ_OPENSEARCH_SERVERLESS_LIVE_TESTS", "").lower() not in {"1", "true", "yes", "on"}, + reason=( + "Set and export AIQ_OPENSEARCH_SERVERLESS_LIVE_TESTS=1, or pass it as a same-line env assignment, " + "to run live Amazon OpenSearch Serverless tests." 
+ ), + ), +] + + +def _env_bool(name: str, default: bool = False) -> bool: + value = os.environ.get(name) + if value is None: + return default + return value.lower() in {"1", "true", "yes", "on"} + + +def _env_int(name: str, default: int) -> int: + value = os.environ.get(name) + if value is None: + return default + return int(value) + + +def _region_from_aoss_endpoint(endpoint: str) -> str | None: + match = re.search(r"\.([a-z]{2}-[a-z]+-\d)\.aoss\.amazonaws\.com/?$", endpoint) + return match.group(1) if match else None + + +def _serverless_config() -> dict[str, Any]: + boto3 = pytest.importorskip("boto3") + pytest.importorskip("opensearchpy") + + endpoint = os.environ.get("OPENSEARCH_URL") or os.environ.get("AOSS_ENDPOINT") + if not endpoint: + pytest.fail("Amazon OpenSearch Serverless live tests require OPENSEARCH_URL or AOSS_ENDPOINT.") + + if ".aoss.amazonaws.com" not in endpoint and not _env_bool("AIQ_OPENSEARCH_SERVERLESS_ALLOW_CUSTOM_ENDPOINT"): + pytest.fail( + "Amazon OpenSearch Serverless live tests expect an .aoss.amazonaws.com endpoint. " + "Set AIQ_OPENSEARCH_SERVERLESS_ALLOW_CUSTOM_ENDPOINT=1 for a custom/private endpoint." + ) + + aws_region = ( + os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or _region_from_aoss_endpoint(endpoint) + ) + if not aws_region: + pytest.fail("Set AWS_REGION/AWS_DEFAULT_REGION, or use a standard regional .aoss.amazonaws.com endpoint.") + + session = boto3.Session(region_name=aws_region) + try: + credentials = session.get_credentials() + if credentials is None: + raise NoCredentialsError() + credentials.get_frozen_credentials() + except (BotoCoreError, RuntimeError, NoCredentialsError) as e: + profile = os.environ.get("AWS_PROFILE") + profile_hint = f" --profile {profile}" if profile else "" + pytest.fail( + "Amazon OpenSearch Serverless live tests require valid, unexpired AWS credentials. " + f"Credential refresh failed: {e}. 
" + f"Run `aws sso login{profile_hint}` for SSO credentials, or export fresh " + "AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY/AWS_SESSION_TOKEN values." + ) + + prefix = os.environ.get("AIQ_OPENSEARCH_SERVERLESS_INDEX_PREFIX", "aiq-aoss-live") + run_suffix = uuid.uuid4().hex[:8] + + return { + "endpoint": endpoint, + "auth_type": "sigv4", + "aws_region": aws_region, + "aws_service": "aoss", + "verify_certs": _env_bool("OPENSEARCH_VERIFY_CERTS", True), + "ca_certs": os.environ.get("OPENSEARCH_CA_CERTS"), + "index_prefix": f"{prefix}-{run_suffix}", + "embedding_dim": 4, + "engine": os.environ.get("OPENSEARCH_SERVERLESS_ENGINE", "faiss"), + "space_type": os.environ.get("OPENSEARCH_SERVERLESS_SPACE_TYPE", "l2"), + "chunk_size": 32, + "chunk_overlap": 0, + "timeout": int(os.environ.get("OPENSEARCH_TIMEOUT", "120")), + "bulk_batch_size": 10, + } + + +def _test_embedding(text: str) -> list[float]: + text = text.lower() + if "aurora" in text: + return [1.0, 0.0, 0.0, 0.0] + if "bedrock" in text: + return [0.0, 1.0, 0.0, 0.0] + return [0.5, 0.5, 0.0, 0.0] + + +def _patch_embeddings(adapter: OpenSearchIngestor | OpenSearchRetriever) -> None: + adapter._embed_texts = lambda texts: [_test_embedding(text) for text in texts] + + +def _wait_for_job(ingestor: OpenSearchIngestor, job_id: str, timeout_seconds: int = 90): + deadline = time.time() + timeout_seconds + job = ingestor.get_job_status(job_id) + while time.time() < deadline and not job.is_terminal: + time.sleep(0.5) + job = ingestor.get_job_status(job_id) + return job + + +def _visible_doc_count_with_retry( + retriever: OpenSearchRetriever, + collection_name: str, + expected_count: int, + timeout_seconds: int | None = None, +) -> int: + timeout_seconds = timeout_seconds or _env_int("AIQ_OPENSEARCH_SERVERLESS_VISIBILITY_TIMEOUT", 180) + index_name = retriever._index_name_for_collection(collection_name) + client = retriever._get_client() + deadline = time.time() + timeout_seconds + count = 0 + + while time.time() < deadline: + 
try: + response = client.search( + index=index_name, + body={ + "size": 0, + "query": {"match_all": {}}, + }, + request_timeout=retriever.timeout, + ) + total = response.get("hits", {}).get("total", 0) + count = int(total.get("value", 0) if isinstance(total, dict) else total) + if count >= expected_count: + return count + except Exception: + count = 0 + time.sleep(2.0) + + return count + + +def _retrieve_with_retry( + retriever: OpenSearchRetriever, + query: str, + collection_name: str, + top_k: int = 3, + filters: dict[str, Any] | None = None, + timeout_seconds: int | None = None, +): + timeout_seconds = timeout_seconds or _env_int("AIQ_OPENSEARCH_SERVERLESS_RETRIEVAL_TIMEOUT", 180) + deadline = time.time() + timeout_seconds + result = None + while time.time() < deadline: + result = asyncio.run(retriever.retrieve(query, collection_name, top_k=top_k, filters=filters)) + if result.success and result.chunks: + return result + time.sleep(1.0) + return result + + +def _list_files_with_retry( + ingestor: OpenSearchIngestor, + collection_name: str, + expected_names: set[str], + timeout_seconds: int | None = None, +): + timeout_seconds = timeout_seconds or _env_int("AIQ_OPENSEARCH_SERVERLESS_VISIBILITY_TIMEOUT", 180) + deadline = time.time() + timeout_seconds + files = [] + while time.time() < deadline: + files = ingestor.list_files(collection_name) + if {file.file_name for file in files} == expected_names: + return files + time.sleep(1.0) + return files + + +def _file_statuses_with_retry( + ingestor: OpenSearchIngestor, + collection_name: str, + file_ids: dict[str, str], + timeout_seconds: int = 45, +): + deadline = time.time() + timeout_seconds + statuses = {} + while time.time() < deadline: + statuses = { + file_name: ingestor.get_file_status(file_id, collection_name) for file_name, file_id in file_ids.items() + } + if all(status is not None and status.status == FileStatus.SUCCESS for status in statuses.values()): + return statuses + time.sleep(1.0) + return statuses + 
+ +@pytest.fixture +def serverless_backend() -> tuple[OpenSearchIngestor, OpenSearchRetriever, Callable[[str], None]]: + config = _serverless_config() + ingestor = OpenSearchIngestor(config) + retriever = OpenSearchRetriever(config) + _patch_embeddings(ingestor) + _patch_embeddings(retriever) + + created_collections: list[str] = [] + + def track_collection(collection_name: str) -> None: + created_collections.append(collection_name) + + yield ingestor, retriever, track_collection + + for collection_name in reversed(created_collections): + ingestor.delete_collection(collection_name) + + +def test_aoss_sigv4_health_and_collection_lifecycle(serverless_backend): + ingestor, _, track_collection = serverless_backend + collection_name = f"aoss-lifecycle-{uuid.uuid4().hex[:8]}" + track_collection(collection_name) + + assert ingestor.auth_type == "sigv4" + assert ingestor.aws_service == "aoss" + assert asyncio.run(ingestor.health_check()) + + created = ingestor.create_collection(collection_name, description="AOSS lifecycle test") + + assert created.name == collection_name + assert created.backend == "opensearch" + assert created.metadata["index_name"].startswith(ingestor.index_prefix) + + fetched = ingestor.get_collection(collection_name) + assert fetched is not None + assert fetched.name == collection_name + + index_meta = ingestor._get_index_meta(ingestor._index_name_for_collection(collection_name)) + assert index_meta["backend"] == "opensearch" + assert index_meta["collection_name"] == collection_name + + assert ingestor.delete_collection(collection_name) + assert ingestor.get_collection(collection_name) is None + + +def test_aoss_vector_ingest_retrieve_filter_and_delete(tmp_path, serverless_backend): + ingestor, retriever, track_collection = serverless_backend + collection_name = f"aoss-ingest-{uuid.uuid4().hex[:8]}" + track_collection(collection_name) + + aurora_file = tmp_path / "aurora.txt" + bedrock_file = tmp_path / "bedrock.txt" + aurora_file.write_text("aurora 
aurora vector search document for serverless retrieval", encoding="utf-8") + bedrock_file.write_text("bedrock bedrock vector search document for serverless retrieval", encoding="utf-8") + + ingestor.create_collection(collection_name, description="AOSS vector ingestion test") + job_id = ingestor.submit_job( + [str(aurora_file), str(bedrock_file)], + collection_name, + config={ + "original_filenames": ["aurora.txt", "bedrock.txt"], + "metadata": {"suite": "aoss-live", "provider": "aws"}, + }, + ) + + job = _wait_for_job(ingestor, job_id) + + assert job.status == JobState.COMPLETED, job.model_dump() + assert {detail.status for detail in job.file_details} == {FileStatus.SUCCESS} + + file_ids = {detail.file_name: detail.file_id for detail in job.file_details} + statuses = _file_statuses_with_retry(ingestor, collection_name, file_ids) + assert set(statuses) == {"aurora.txt", "bedrock.txt"} + assert all(status is not None and status.status == FileStatus.SUCCESS for status in statuses.values()) + assert all(status.metadata["suite"] == "aoss-live" for status in statuses.values() if status is not None) + + visible_count = _visible_doc_count_with_retry(retriever, collection_name, expected_count=2) + assert visible_count >= 2 + + result = _retrieve_with_retry(retriever, "aurora semantic search", collection_name) + + assert result is not None + assert result.success, result.error_message + assert result.chunks + assert result.chunks[0].file_name == "aurora.txt" + assert result.chunks[0].metadata["provider"] == "aws" + + filtered = _retrieve_with_retry( + retriever, + "bedrock semantic search", + collection_name, + top_k=2, + filters={"file_name": "aurora.txt"}, + ) + + assert filtered is not None + assert filtered.success + assert filtered.chunks + assert {chunk.file_name for chunk in filtered.chunks} == {"aurora.txt"} + + assert ingestor.delete_file(file_ids["aurora.txt"], collection_name) + + remaining_files = _list_files_with_retry(ingestor, collection_name, {"bedrock.txt"}) 
+ remaining_names = {file.file_name for file in remaining_files} + assert remaining_names == {"bedrock.txt"} diff --git a/uv.lock b/uv.lock index 142a1aad..8f010e78 100644 --- a/uv.lock +++ b/uv.lock @@ -1316,6 +1316,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922, upload-time = "2025-05-17T13:52:36.463Z" }, ] +[[package]] +name = "events" +version = "0.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/ed/e47dec0626edd468c84c04d97769e7ab4ea6457b7f54dcb3f72b17fcd876/Events-0.5-py3-none-any.whl", hash = "sha256:a7286af378ba3e46640ac9825156c93bdba7502174dd696090fdfcd4d80a1abd", size = 6758, upload-time = "2023-07-31T08:23:13.645Z" }, +] + [[package]] name = "exa-py" version = "1.16.1" @@ -1472,7 +1480,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "nvidia-nat-eval", specifier = ">=1.5.0,<2" }, + { name = "nvidia-nat-eval", specifier = "==1.6.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pytz", specifier = ">=2024.1" }, ] @@ -2157,14 +2165,17 @@ dependencies = [ [package.optional-dependencies] all = [ + { name = "boto3" }, { name = "chromadb" }, { name = "docx2txt" }, { name = "llama-index" }, { name = "llama-index-embeddings-nvidia" }, { name = "llama-index-vector-stores-chroma" }, { name = "openai" }, + { name = "opensearch-py" }, { name = "pdfplumber" }, { name = "pillow" }, + { name = "pypdf" }, { name = "pypdfium2" }, { name = "python-pptx" }, { name = "requests" }, @@ -2187,28 +2198,42 @@ llamaindex = [ { name = "pillow" }, { name = "pypdfium2" }, ] +opensearch = [ + { name = "boto3" }, + { name = "docx2txt" }, + { name = "openai" }, + { name = "opensearch-py" }, + { name = "pypdf" }, + { name = "python-pptx" }, +] [package.metadata] 
requires-dist = [ + { name = "boto3", marker = "extra == 'opensearch'", specifier = ">=1.28.0" }, { name = "chromadb", marker = "extra == 'llamaindex'", specifier = ">=0.4.0" }, { name = "docx2txt", marker = "extra == 'foundational-rag'", specifier = ">=0.8" }, { name = "docx2txt", marker = "extra == 'llamaindex'", specifier = ">=0.8" }, + { name = "docx2txt", marker = "extra == 'opensearch'", specifier = ">=0.8" }, { name = "httpx", specifier = ">=0.24.0" }, - { name = "knowledge-layer", extras = ["llamaindex", "foundational-rag"], marker = "extra == 'all'", editable = "sources/knowledge_layer" }, + { name = "knowledge-layer", extras = ["llamaindex", "foundational-rag", "opensearch"], marker = "extra == 'all'", editable = "sources/knowledge_layer" }, { name = "llama-index", marker = "extra == 'llamaindex'", specifier = ">=0.10.0" }, { name = "llama-index-embeddings-nvidia", marker = "extra == 'llamaindex'", specifier = ">=0.1.0" }, { name = "llama-index-vector-stores-chroma", marker = "extra == 'llamaindex'", specifier = ">=0.1.0" }, { name = "openai", marker = "extra == 'llamaindex'", specifier = ">=1.0.0" }, + { name = "openai", marker = "extra == 'opensearch'", specifier = ">=1.0.0" }, + { name = "opensearch-py", marker = "extra == 'opensearch'", specifier = ">=2.4.0" }, { name = "pdfplumber", marker = "extra == 'llamaindex'", specifier = ">=0.10.0" }, { name = "pillow", marker = "extra == 'llamaindex'", specifier = ">=9.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pypdf", marker = "extra == 'opensearch'", specifier = ">=4.0.0" }, { name = "pypdfium2", marker = "extra == 'llamaindex'", specifier = ">=5.0.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "python-pptx", marker = "extra == 'foundational-rag'", specifier = ">=0.6.21" }, + { name = "python-pptx", marker = "extra == 'opensearch'", specifier = ">=0.6.21" }, { name = "requests", marker = "extra == 'foundational-rag'", specifier = ">=2.28.0" }, { name = "urllib3", 
marker = "extra == 'foundational-rag'", specifier = ">=2.0.0" }, ] -provides-extras = ["llamaindex", "foundational-rag", "all"] +provides-extras = ["llamaindex", "foundational-rag", "opensearch", "all"] [[package]] name = "kubernetes" @@ -3739,6 +3764,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/ca/bb4b9cbd96f72600abec5280cf8ed67bcd849ed19b8bec919aec97adb61c/openinference_semantic_conventions-0.1.26-py3-none-any.whl", hash = "sha256:35b4f487d18ac7d016125c428c0d950dd290e18dafb99787880a9b2e05745f42", size = 10401, upload-time = "2026-02-01T01:09:44.781Z" }, ] +[[package]] +name = "opensearch-protobufs" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "protobuf" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/2f/e0cc165af7bb7b44cb00023b9fcaa01a28d1755a059ede28d0cd970c3cec/opensearch_protobufs-1.2.0-py3-none-any.whl", hash = "sha256:e806730894d0a0c8cdaa3cdbe07e4b7c46e1823f453777b36caf39e9cba28e2c", size = 54751, upload-time = "2026-01-22T18:51:56.805Z" }, +] + +[[package]] +name = "opensearch-py" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "events" }, + { name = "opensearch-protobufs" }, + { name = "python-dateutil" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/9e/e77844cb2d625ca32331bfdd28930113b3778399c01dd5f1c350ceb55e65/opensearch_py-3.2.0.tar.gz", hash = "sha256:f40fb3a295275422df2ad6d9459f667af94472d5a9e567072e9ecf163eb22613", size = 259927, upload-time = "2026-04-27T18:17:50.467Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/63/7abb96bf2e3619acbd27de99e60619bfacfb7c55b68c4792a258e6d92871/opensearch_py-3.2.0-py3-none-any.whl", hash = "sha256:721a0d3b13fbed9e82278aed748285cf63a1855354ab7e73e3d4992d1b93418b", size = 387286, upload-time = "2026-04-27T18:17:48.658Z" }, +] + 
[[package]] name = "opentelemetry-api" version = "1.39.1"