diff --git a/tools/sdk-ai-bots/.gitignore b/tools/sdk-ai-bots/.gitignore new file mode 100644 index 00000000000..7c535376fd1 --- /dev/null +++ b/tools/sdk-ai-bots/.gitignore @@ -0,0 +1,12 @@ +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +dist/ +build/ +logs/ +.pytest_cache/ +.env +**/graphrag_config/input/ +**/graphrag_config/output/ +**/graphrag_config/cache/ diff --git a/tools/sdk-ai-bots/README.md b/tools/sdk-ai-bots/README.md index 922a9268fc8..342067c00eb 100644 --- a/tools/sdk-ai-bots/README.md +++ b/tools/sdk-ai-bots/README.md @@ -48,6 +48,15 @@ A quality assurance system that continuously monitors and evaluates the performa A standalone TypeScript application that processes documentation from various repositories and maintains the knowledge base. It clones repositories, processes markdown files and TypeSpec Spector test files, uploads processed content to Azure Blob Storage, and updates the Azure AI Search index. This service maintains change detection for efficient processing and serves as the primary knowledge management component for the system. +### 7. Knowledge Graph Sync (`azure-sdk-qa-bot-knowledge-graph-sync/`) + +A Python application that extends the knowledge sync pipeline with a knowledge graph layer built using [Microsoft GraphRAG](https://github.com/microsoft/graphrag). It performs the same documentation sync as the TypeScript service (ported to Python), then additionally: + +- Extracts entities (decorators, patterns, APIs, services, etc.) 
and relationships from documentation +- Detects communities of related concepts via hierarchical clustering +- Uploads the graph to Azure Cosmos DB for entity-aware retrieval at query time +- Supports **incremental indexing** — only re-processes documents that changed in the current sync run + ## Knowledge Sources The bot provides intelligent responses by searching through comprehensive knowledge bases including: @@ -63,7 +72,7 @@ The bot provides intelligent responses by searching through comprehensive knowle - **Node.js**: Version 20+ - **Go**: Version 1.23+ (for backend service) -- **Python**: Version 3.10+ (for evaluation framework) +- **Python**: Version 3.11+ (for knowledge graph sync and evaluation framework) - **Azure Subscription**: For deploying cloud resources - **Teams Toolkit**: For Teams app development and deployment @@ -101,6 +110,14 @@ npm install npm start ``` +#### Knowledge Graph Sync (Python) + +```bash +cd azure-sdk-qa-bot-knowledge-graph-sync +pip install -e ".[dev]" +sync-knowledge-graph +``` + #### Shared Library ```bash @@ -125,7 +142,7 @@ To run evaluations, see: [azure-sdk-qa-bot-evaluation/README.md](./azure-sdk-qa- ### Documentation Sources -Add new documentation sources by updating the knowledge configuration. The Knowledge Sync Service uses `azure-sdk-qa-bot-knowledge-sync/config/knowledge-config.json`. See [Self-Serve Knowledge Sources Guide](docs/SELF_SERVE_ADD_KNOWLEDGE_SOURCES.md) for detailed instructions. +Add new documentation sources by updating the knowledge configuration. Both the Knowledge Sync Service and Knowledge Graph Sync use `config/knowledge-config.json` in their respective directories. See [Self-Serve Knowledge Sources Guide](docs/SELF_SERVE_ADD_KNOWLEDGE_SOURCES.md) for detailed instructions. 
### Environment Variables diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/agents/chat_agent/init.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/agents/chat_agent/init.py index 90a185ea279..aadb7b76c4f 100644 --- a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/agents/chat_agent/init.py +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/agents/chat_agent/init.py @@ -33,6 +33,7 @@ import config.app_config as app_config from config.app_config import get as cfg from tools.knowledge_tools import KnowledgeTools +from tools.graph_knowledge_tools import GraphKnowledgeTools from tools.web_tools import WebTools from tools.ado_mcp_tools import create_ado_mcp_tool from tools.github_mcp_tools import create_github_mcp_tool @@ -90,14 +91,37 @@ async def main() -> None: # Init Tools (synchronous / instant) knowledge_tools = KnowledgeTools() + graph_knowledge_tools = GraphKnowledgeTools() web_tools = WebTools() pipeline_tools = PipelineTools() web_search_tool = agent_client.get_web_search_tool( search_context_size="medium", ) + # KNOWLEDGE_TOOL_MODE selects which knowledge-retrieval tool to expose + # to the agent. Accepted values (case-insensitive): + # "vector" (default) — Azure AI Search-backed `search_knowledge_base`. + # "graph" — GraphRAG DRIFT-search `search_knowledge_graph`. 
+ knowledge_tool_mode = cfg("KNOWLEDGE_TOOL_MODE", "vector").strip().lower() + if knowledge_tool_mode == "graph": + knowledge_tool_choice = graph_knowledge_tools.search_knowledge_graph + else: + if knowledge_tool_mode != "vector": + logger.warning( + "KNOWLEDGE_TOOL_MODE=%r is not recognised " + "(expected 'vector' or 'graph'); falling back to 'vector'.", + knowledge_tool_mode, + ) + knowledge_tool_mode = "vector" + knowledge_tool_choice = knowledge_tools.search_knowledge_base + logger.info( + "Knowledge tool registration: mode=%s, tool=%s", + knowledge_tool_mode, + getattr(knowledge_tool_choice, "__name__", repr(knowledge_tool_choice)), + ) + tools = [ - knowledge_tools.search_knowledge_base, + knowledge_tool_choice, web_tools.web_fetch, pipeline_tools.azsdk_analyze_pipeline, web_search_tool, diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/config/graphrag/settings.yaml b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/config/graphrag/settings.yaml new file mode 100644 index 00000000000..c5705185f8b --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/config/graphrag/settings.yaml @@ -0,0 +1,71 @@ +# GraphRAG query-side configuration for the Azure SDK QA Bot Agent. +# +# This file mirrors the model + vector-store config of the +# `azure-sdk-qa-bot-knowledge-graph-sync` project so we can query the same +# AI Search indexes and run local/global search via the GraphRAG Python API. +# +# Only the sections needed for QUERY are present (no input / extract_graph / +# community_reports prompts, etc.). Indexing happens in the sync project. +# +# Environment variables resolved by graphrag_common.config._parse_env_variables: +# AOAI_CHAT_COMPLETIONS_ENDPOINT — e.g. https://<resource-name>.openai.azure.com +# AI_SEARCH_BASE_URL — e.g. 
https://<service-name>.search.windows.net +# AI_SEARCH_INDEX_TEXT_UNITS — text units index name +# AI_SEARCH_INDEX_ENTITIES — entity description index name +# AI_SEARCH_INDEX_COMMUNITIES — community full-content index name + +completion_models: + default_completion_model: + model_provider: azure + model: gpt-5.4 + auth_method: azure_managed_identity + api_base: ${AOAI_CHAT_COMPLETIONS_ENDPOINT} + api_version: "2024-12-01-preview" + cognitive_services_endpoint: https://cognitiveservices.azure.com/.default + +embedding_models: + default_embedding_model: + model_provider: azure + model: text-embedding-3-small + auth_method: azure_managed_identity + api_base: ${AOAI_CHAT_COMPLETIONS_ENDPOINT} + api_version: "2024-12-01-preview" + cognitive_services_endpoint: https://cognitiveservices.azure.com/.default + rate_limit: + type: sliding_window + period_in_seconds: 60 + requests_per_period: 60 + tokens_per_period: 100000 + retry: + type: exponential_backoff + max_retries: 5 + base_delay: 2.0 + max_delay: 30.0 + jitter: true + +vector_store: + type: azure_ai_search + url: ${AI_SEARCH_BASE_URL} + audience: "https://search.azure.com" + index_schema: + text_unit_text: + index_name: "azuresdkqabot-dev-search-index-text-units" + vector_size: 1536 + entity_description: + index_name: "azuresdkqabot-dev-search-index-entities" + vector_size: 1536 + community_full_content: + index_name: "azuresdkqabot-dev-search-index-communities" + vector_size: 1536 + +# DRIFT search latency tuning. 
+# Field reference: https://microsoft.github.io/graphrag/config/yaml/#drift_search +drift_search: + n_depth: 1 # default 3 — drop iterative rounds + drift_k_followups: 5 # default 20 — fewer follow-up local searches + primer_folds: 2 # default 5 — cheaper primer ranking + primer_llm_max_tokens: 4000 # default 12000 — bound primer prompt/response + data_max_tokens: 4000 # default 12000 — bound data prompt size + concurrency: 16 # default 32 — leave headroom on AOAI TPM + local_search_top_k_mapped_entities: 10 # leave at default; expose for tuning + local_search_top_k_relationships: 10 # leave at default; expose for tuning diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/requirements.txt b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/requirements.txt index 0e844f9e45a..952a71825d4 100644 --- a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/requirements.txt +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/requirements.txt @@ -18,4 +18,7 @@ mcp>=1.0.0 # Backend server fastapi uvicorn[standard] -azure-monitor-opentelemetry \ No newline at end of file +azure-monitor-opentelemetry +graphrag>=3.1.0 +pandas>=2.0 +pyarrow>=15.0 \ No newline at end of file diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/scripts/deploy_hosted_agent.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/scripts/deploy_hosted_agent.py index 51bfdb6816c..089d2f73085 100644 --- a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/scripts/deploy_hosted_agent.py +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/scripts/deploy_hosted_agent.py @@ -34,6 +34,7 @@ from azure.ai.projects import AIProjectClient from azure.ai.projects.models import ( AgentProtocol, + ContainerConfiguration, HostedAgentDefinition, ProtocolVersionRecord, ) @@ -274,14 +275,14 @@ def main() -> None: agent = project.agents.create_version( agent_name=image_name, definition=HostedAgentDefinition( - container_protocol_versions=[ + protocol_versions=[ ProtocolVersionRecord( protocol=AgentProtocol.RESPONSES, version="1.0.0" ) ], cpu="2", memory="4Gi", - 
# --------------------------------------------------------------------------- #
# GraphRAG admin endpoints
# --------------------------------------------------------------------------- #
# These let the knowledge-graph-sync project tell the bot "I've just
# published a fresh build — please reload from blob" without restarting
# the pod. See utils/knowledge_graph.py for the atomic swap mechanics.
#
# Auth: a shared secret header (``X-Admin-Token``) is checked against the
# ``GRAPHRAG_ADMIN_TOKEN`` value loaded from App Configuration.
# The sync job reads the same secret and includes it in the POST.


def _verify_admin_token(token: str | None) -> None:
    """Reject the request unless ``token`` matches ``GRAPHRAG_ADMIN_TOKEN``.

    An empty / missing configured secret means "endpoint disabled": the
    admin routes must never authorise an unauthenticated reload by accident.

    Args:
        token: Value of the ``X-Admin-Token`` request header, or ``None``
            when the header was not sent.

    Raises:
        HTTPException: 503 when no admin secret is configured; 401 when the
            supplied token is missing or does not match.
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    expected = app_config.get("GRAPHRAG_ADMIN_TOKEN", "")
    if not expected:
        raise HTTPException(
            status_code=503,
            detail="GraphRAG admin endpoint disabled (GRAPHRAG_ADMIN_TOKEN not configured)",
        )

    # Constant-time comparison so response timing does not reveal how many
    # leading characters of the secret were guessed correctly. Compared as
    # bytes because ``compare_digest`` rejects non-ASCII ``str`` arguments.
    supplied = (token or "").encode("utf-8")
    if not token or not hmac.compare_digest(supplied, str(expected).encode("utf-8")):
        raise HTTPException(status_code=401, detail="invalid admin token")


@app.post("/admin/graphrag/reload")
async def admin_graphrag_reload(
    x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
):
    """Atomically reload the GraphRAG parquets from the configured blob source.

    In-flight DRIFT queries keep their captured DataFrame snapshot and
    finish against the old data; subsequent queries see the new data.
    On failure the prior build remains active.

    Responses: 401/503 from the token check, 409 when the service has no
    source configured, 500 when ``reload()`` raises, otherwise the status
    dict returned by ``reload()``.
    """
    _verify_admin_token(x_admin_token)
    # Imported lazily, after the token check has passed.
    from utils.knowledge_graph import get_knowledge_graph_service

    service = get_knowledge_graph_service()
    if not service.enabled:
        raise HTTPException(
            status_code=409, detail="GraphRAG service is disabled (no source configured)"
        )
    try:
        status = await service.reload()
    except Exception as exc:
        logger.exception("GraphRAG reload failed")
        raise HTTPException(status_code=500, detail=f"reload failed: {exc}") from exc
    logger.info("GraphRAG reload succeeded: %s", status.get("version"))
    return status


@app.get("/admin/graphrag/status")
async def admin_graphrag_status(
    x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
):
    """Return the currently-loaded GraphRAG build metadata."""
    _verify_admin_token(x_admin_token)
    from utils.knowledge_graph import get_knowledge_graph_service

    return get_knowledge_graph_service().get_status()
b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tests/admin_graphrag_endpoints_test.py @@ -0,0 +1,132 @@ +"""Unit tests for the GraphRAG admin endpoints (reload / status). + +These tests avoid touching real blob storage or running DRIFT by +monkeypatching ``KnowledgeGraphService`` on the singleton instance with +``unittest.mock``. The goal is to lock down the HTTP contract: +- 503 when ``GRAPHRAG_ADMIN_TOKEN`` is not configured +- 401 when the supplied token is missing or wrong +- 409 when the underlying service is not enabled +- 200 + status dict on success +- 500 when ``reload`` raises +""" + +from __future__ import annotations + +import sys +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +_PROJECT_ROOT = str(Path(__file__).resolve().parent.parent) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from fastapi.testclient import TestClient # noqa: E402 + +import config.app_config as app_config # noqa: E402 +import server # noqa: E402 +from utils import knowledge_graph as kg # noqa: E402 + + +_TOKEN = "test-admin-token-12345" +_STATUS_PAYLOAD = { + "enabled": True, + "loaded": True, + "source": {"source": "blob", "container": "graphrag-output", "prefix": "snap-1"}, + "community_level": 2, + "version": {"manifest": {"build_id": "snap-1"}}, + "row_counts": { + "entities": 10, + "communities": 3, + "community_reports": 3, + "text_units": 25, + "relationships": 15, + "documents": 4, + }, +} + + +@pytest.fixture +def client(monkeypatch): + """Yield a FastAPI test client with the admin token loaded into App Config.""" + # ``config.app_config.get`` reads from the module-level ``_settings`` + # dict that's populated by ``init()`` against Azure App Configuration. + # We bypass init() here by injecting the dict directly so the admin + # token check finds it via ``app_config.get('GRAPHRAG_ADMIN_TOKEN', '')``. 
+ monkeypatch.setattr(app_config, "_settings", {"GRAPHRAG_ADMIN_TOKEN": _TOKEN}) + with TestClient(server.app) as c: + yield c + + +@pytest.fixture +def fake_service(): + """Patch the module-level ``get_knowledge_graph_service`` to return a fake.""" + service = AsyncMock() + service.enabled = True + service.get_status = lambda: _STATUS_PAYLOAD + service.reload = AsyncMock(return_value=_STATUS_PAYLOAD) + # The endpoint imports the helper lazily; patch the source module so + # both server.py's import and any other consumer see the same fake. + with patch.object(kg, "get_knowledge_graph_service", return_value=service): + yield service + + +def test_reload_requires_configured_token(monkeypatch): + """Without GRAPHRAG_ADMIN_TOKEN the endpoint is hard-disabled (503).""" + monkeypatch.setattr(app_config, "_settings", {}) + with TestClient(server.app) as client: + resp = client.post("/admin/graphrag/reload", headers={"X-Admin-Token": "x"}) + assert resp.status_code == 503 + + +def test_reload_rejects_wrong_token(client): + resp = client.post("/admin/graphrag/reload", headers={"X-Admin-Token": "wrong"}) + assert resp.status_code == 401 + + +def test_reload_rejects_missing_header(client): + resp = client.post("/admin/graphrag/reload") + assert resp.status_code == 401 + + +def test_reload_returns_409_when_disabled(client): + service = AsyncMock() + service.enabled = False + with patch.object(kg, "get_knowledge_graph_service", return_value=service): + resp = client.post( + "/admin/graphrag/reload", headers={"X-Admin-Token": _TOKEN} + ) + assert resp.status_code == 409 + + +def test_reload_success(client, fake_service): + resp = client.post( + "/admin/graphrag/reload", headers={"X-Admin-Token": _TOKEN} + ) + assert resp.status_code == 200 + assert resp.json() == _STATUS_PAYLOAD + fake_service.reload.assert_awaited_once() + + +def test_reload_propagates_failure_as_500(client): + service = AsyncMock() + service.enabled = True + service.reload = 
AsyncMock(side_effect=RuntimeError("blob unavailable")) + with patch.object(kg, "get_knowledge_graph_service", return_value=service): + resp = client.post( + "/admin/graphrag/reload", headers={"X-Admin-Token": _TOKEN} + ) + assert resp.status_code == 500 + assert "blob unavailable" in resp.json()["detail"] + + +def test_status_requires_token(client): + resp = client.get("/admin/graphrag/status", headers={"X-Admin-Token": "wrong"}) + assert resp.status_code == 401 + + +def test_status_returns_payload(client, fake_service): + resp = client.get("/admin/graphrag/status", headers={"X-Admin-Token": _TOKEN}) + assert resp.status_code == 200 + assert resp.json() == _STATUS_PAYLOAD diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tests/graph_knowledge_tools_test.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tests/graph_knowledge_tools_test.py new file mode 100644 index 00000000000..2dfd6400ca0 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tests/graph_knowledge_tools_test.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +# Ensure the project root is on sys.path so ``config``, ``tools``, etc. resolve. +_PROJECT_ROOT = str(Path(__file__).resolve().parent.parent) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from config.tenant_config import TenantID +from tools.graph_knowledge_tools import GraphKnowledgeTools + + +@pytest.mark.asyncio +async def test_search_graph_knowledge_tool() -> None: + query = "What does the TypeSpec JSON Schema emitter do?" 
+ + result = await GraphKnowledgeTools().search_knowledge_graph( + queries=[query], tenant_id=TenantID.TYPESPEC_CHANNEL_QA_BOT + ) + + assert len(result.results) > 0 diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tools/graph_knowledge_tools.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tools/graph_knowledge_tools.py new file mode 100644 index 00000000000..79384edf9b5 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-agent/tools/graph_knowledge_tools.py @@ -0,0 +1,156 @@ +"""Graph-based knowledge retrieval tool (Microsoft GraphRAG / DRIFT search). + +Exposes a single tool, :meth:`GraphKnowledgeTools.search_knowledge_graph`, +that runs GraphRAG's DRIFT search against the knowledge graph built by the +``azure-sdk-qa-bot-knowledge-graph-sync`` project. The vector similarity +component of the search hits the configured Azure AI Search indexes; the +graph structure (entities, relationships, community hierarchy, source text) +is loaded from the parquet artefacts produced by the sync pipeline. + +Returned references point to the **original source documents** cited by +the DRIFT search (resolved via ``documents.parquet`` and ``text_units``), +so the LLM can surface real document titles / paths in its answer rather +than a synthetic "graph insight" stub. +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Annotated + +from models.knowledge import Reference, SearchKnowledgeBaseResult +from tools import tool +from utils.knowledge_graph import ( + GraphSourceRef, + get_knowledge_graph_service, +) + +logger = logging.getLogger(__name__) + +# Truncate each chunk excerpt to keep prompt context bounded. +_MAX_CONTENT_CHARS_PER_RESULT = 3000 + +# DRIFT search is expensive — cap the number of parallel queries per call. 
_MAX_QUERIES = 2


class GraphKnowledgeTools:
    """Knowledge graph retrieval tools backed by GraphRAG DRIFT search."""

    @tool
    async def search_knowledge_graph(
        self,
        *,
        queries: Annotated[
            list[str],
            "One or two natural-language questions to ask the knowledge graph. "
            "GraphRAG DRIFT search reasons over communities of related entities "
            "and traverses their relationships, so phrase the query as a "
            "QUESTION or a topic — not a keyword list. "
            "Use this tool when the user's question requires connecting "
            "concepts across multiple documents or summarising a topic area "
            "(e.g., 'How does the TypeSpec ARM template relate to operationId "
            "naming?' or 'Explain the relationship between LRO, polling, and "
            "x-ms-long-running-operation'). "
            "Each query is expensive (multiple LLM calls); prefer one focused "
            "question, two only if they cover genuinely different facets.",
        ],
        tenant_id: Annotated[
            str,
            "The active tenant ID for the current conversation. Currently "
            "informational only — the underlying knowledge graph is global, "
            "but the field is kept for parity with search_knowledge_base and "
            "future per-tenant graphs.",
        ],
    ) -> SearchKnowledgeBaseResult:
        """Search the GraphRAG knowledge graph and return the source
        documents it cited.

        For each query, runs Microsoft GraphRAG's DRIFT (Dynamic Reasoning
        and Inference with Flexible Traversal) search, then extracts the
        text-unit citations from the resulting context payload and resolves
        them back to their original source-document files via
        ``documents.parquet``.

        Each resulting :class:`Reference` represents one cited source
        document — title and link reflect the original file path, and
        ``content`` is a representative chunk excerpt from that document.
        """
        # NOTE: the docstring above is left untouched on purpose — the
        # ``@tool`` decorator may expose it to the model (not verifiable
        # from this file).
        service = get_knowledge_graph_service()

        # Discard blank entries *before* applying the cap so an empty string
        # cannot consume one of the (expensive) query slots. ``queries or []``
        # tolerates a missing/None argument from the caller.
        capped = [q for q in (queries or []) if q and q.strip()][:_MAX_QUERIES]
        if not capped:
            return SearchKnowledgeBaseResult(results=[])

        logger.info(
            "Running GraphRAG DRIFT search for tenant=%s, queries=%s",
            tenant_id,
            capped,
        )

        # Run the queries concurrently; a failure in one must not discard the
        # results of the other, hence ``return_exceptions=True``.
        tasks = [service.drift_search(q) for q in capped]
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)

        # Dedup across queries by document title so the LLM doesn't see the
        # same document twice if both queries cited it.
        merged: dict[str, Reference] = {}
        for query, outcome in zip(capped, outcomes):
            if isinstance(outcome, BaseException):
                logger.warning(
                    "GraphRAG drift_search failed for query=%r: %s",
                    query,
                    outcome,
                )
                continue
            if outcome is None:
                # Service disabled or search failed internally.
                continue
            answer, sources = outcome
            if answer:
                logger.info(
                    "GraphRAG synthesised answer (query=%r) length=%d",
                    query,
                    len(answer),
                )
            for src in sources:
                ref = _graph_source_to_reference(src)
                # Keep the first occurrence (highest-ranked per query order).
                # References with neither title nor link get a unique key so
                # they are not all collapsed onto the empty string.
                key = ref.title or ref.link or f"__untitled_{len(merged)}__"
                merged.setdefault(key, ref)

        refs = list(merged.values())

        logger.info(
            "=========Final Graph Search Result========= total=%d", len(refs)
        )
        for i, ref in enumerate(refs):
            logger.info(
                "Graph Result [%d] source=%s, title=%s, link=%s, content_len=%d",
                i + 1,
                ref.source,
                ref.title,
                ref.link,
                len(ref.content or ""),
            )
        logger.info("===================================== total=%d results", len(refs))

        return SearchKnowledgeBaseResult(results=refs)


def _graph_source_to_reference(src: GraphSourceRef) -> Reference:
    """Convert a :class:`GraphSourceRef` to the bot's :class:`Reference`.

    The excerpt is truncated via :func:`_truncate_content`; ``score`` is
    always set to ``0.0``.
    """
    return Reference(
        title=src.title,
        source=src.source,
        link=src.link,
        content=_truncate_content(src.content),
        score=0.0,
    )


def _truncate_content(content: str | None) -> str:
    """Truncate content to bound the prompt context size.

    Returns ``""`` for ``None`` / empty input, the input unchanged when it
    fits within ``_MAX_CONTENT_CHARS_PER_RESULT`` characters, and otherwise
    the first ``_MAX_CONTENT_CHARS_PER_RESULT`` characters followed by a
    ``[truncated]`` marker.
    """
    if not content:
        return ""
    if len(content) <= _MAX_CONTENT_CHARS_PER_RESULT:
        return content
    return content[:_MAX_CONTENT_CHARS_PER_RESULT] + "\n... [truncated]"
+ +Data sources at query time +-------------------------- +GraphRAG splits its query-time inputs into two stores; this service reads +from both because each contains a different *kind* of data: + +* **Azure AI Search** — *all vector data.* + GraphRAG's ``drift_search`` calls ``get_embedding_store(config.vector_store, + ...)`` internally, so every entity/community embedding similarity lookup + hits the AI Search indexes configured in + ``config/graphrag/settings.yaml`` (``entities``, ``communities``, + ``text_units``). No embeddings live in this process. +* **Parquet artefacts** — *graph structure only.* + ``entities`` / ``communities`` / ``community_reports`` / ``text_units`` / + ``relationships`` parquets contain IDs, names, descriptions, edges, + community hierarchy, and source text — but **no vector columns**. + GraphRAG requires these DataFrames so it can resolve embedding hits back + to entities, traverse relationships, and pull the actual report / chunk + text for the LLM prompt. + +The parquet artefacts are loaded once (lazily, on first query) from +the Azure Blob container named by ``STORAGE_GRAPHRAG_OUTPUT_CONTAINER``; +the bot then atomically swaps in a new build whenever the sync +project calls ``POST /admin/graphrag/reload``. + +DRIFT (Dynamic Reasoning and Inference with Flexible Traversal) is +GraphRAG's hybrid search mode: a community-level "primer" generates seed +answers, then per-seed local searches expand context via graph traversal, +and a reduce step combines them into the final response. It is heavier than +local/global search but produces more comprehensive graph-aware answers — +well suited for the bot agent's complex SDK / TypeSpec questions. 
@dataclass(frozen=True)
class GraphSourceRef:
    """Immutable record describing one source document cited by DRIFT search.

    Instances are frozen (attributes cannot be reassigned after creation)
    and compare / hash by field value.

    Attributes:
        title: Human-readable document title (path with ``/`` separators).
        link: Best-effort link to the original document; the empty string
            when no URL can be derived without tenant context.
        content: Representative excerpt of the source text that grounded
            the LLM answer (typically one text_unit chunk).
        source: Short identifier of the originating data set. Defaults to
            ``"graphrag"`` because the graph does not preserve the
            per-document KnowledgeSource that produced it.
    """

    title: str
    link: str
    content: str
    source: str = "graphrag"
+ DataFrame loading is lazy on first use and cached for the lifetime of the + process; restart the bot to pick up newly-published parquet outputs. + """ + + def __init__(self) -> None: + self._community_level = int( + cfg("GRAPH_COMMUNITY_LEVEL", str(_DEFAULT_COMMUNITY_LEVEL)) + ) + self._response_type = cfg("GRAPH_RESPONSE_TYPE", _DEFAULT_RESPONSE_TYPE) + # Blob container holding the parquet outputs. The sync project + # writes both the versioned snapshots + # (``/.parquet``) and the ``latest.json`` + # manifest pointer at the container root — no sub-prefix. + self._blob_container = cfg("STORAGE_GRAPHRAG_OUTPUT_CONTAINER", "") + + self._config = None # graphrag GraphRagConfig + self._dfs: "dict[str, pd.DataFrame] | None" = None + # Metadata for the currently-loaded build (manifest contents + + # row counts). Populated on every successful load / reload so + # ``GET /admin/graphrag/status`` can report what's serving traffic. + self._loaded_version: dict[str, Any] | None = None + self._load_lock = asyncio.Lock() + + # Service is considered "available" when a blob container is + # configured. Tool registration is governed separately by + # KNOWLEDGE_TOOL_MODE in agents/chat_agent. + self._enabled = bool(self._blob_container) + if not self._enabled: + logger.info( + "KnowledgeGraphService inactive: " + "STORAGE_GRAPHRAG_OUTPUT_CONTAINER is not set." + ) + else: + logger.info( + "KnowledgeGraphService ready: community_level=%d, blob_container=%s", + self._community_level, + self._blob_container, + ) + + @property + def enabled(self) -> bool: + return self._enabled + + def get_status(self) -> dict[str, Any]: + """Return a snapshot of the currently-loaded graph build. + + Used by ``GET /admin/graphrag/status`` and surfaced in + ``reload()``'s response so callers can confirm which build is + serving traffic. 
async def reload(self) -> dict[str, Any]:
    """Reload the graph from the configured source and swap it in.

    The new config and DataFrames are built into local variables first
    and only assigned to the service once both loads have succeeded, all
    while holding ``_load_lock``. A failure therefore leaves the
    previously loaded state untouched (no partial swap). Queries that
    already captured ``self._dfs`` keep working against the old objects;
    later queries see the new ones.

    Returns:
        The ``get_status()`` snapshot taken after the swap.

    Raises:
        RuntimeError: when the service is disabled (no source
            configured) — the caller should surface this as 409.
        Exception: any failure during config load / parquet read.
    """
    if not self._enabled:
        raise RuntimeError(
            "KnowledgeGraphService is disabled: no parquet source configured."
        )

    async with self._load_lock:
        fresh_config = await asyncio.to_thread(self._load_config)
        loaded = await self._load_parquets_with_manifest()
        fresh_frames, fresh_version = loaded
        # No ``await`` between these assignments, so no other coroutine
        # can observe a half-updated service.
        self._config, self._dfs, self._loaded_version = (
            fresh_config,
            fresh_frames,
            fresh_version,
        )

    row_counts = {table: len(frame) for table, frame in fresh_frames.items()}
    logger.info(
        "KnowledgeGraphService reloaded: version=%s row_counts=%s",
        fresh_version,
        row_counts,
    )
    return self.get_status()
def _extract_sources_from_context(
    self, context_data: Any
) -> list[GraphSourceRef]:
    """Walk a DRIFT ``context_data`` payload and resolve cited
    text-unit IDs back to their original source documents.

    The DRIFT context is a nested structure (``{sub_query: {table_name:
    DataFrame}}``). We do not depend on its exact shape — we recursively
    collect any DataFrame that looks like a "sources" table (has both
    ``id`` and ``text`` columns) and treat the ``id`` column values as
    text-unit ``human_readable_id``s (matches GraphRAG's
    ``TextUnit.short_id``).

    Args:
        context_data: The context object returned next to the DRIFT
            response; passed unchanged to
            ``_collect_text_unit_short_ids``.

    Returns:
        One ``GraphSourceRef`` per distinct ``document_id`` among the
        matched text units, each carrying the text of the longest matched
        chunk of that document. An empty list when nothing is loaded, the
        ``text_units`` / ``documents`` tables are missing, or no cited id
        matches a text unit.
    """
    # NOTE(review): this reads ``self._dfs`` at call time rather than the
    # snapshot the calling search captured; if ``reload()`` swaps the
    # tables in between, ids would be resolved against the newer build —
    # confirm whether that matters.
    if self._dfs is None:
        return []

    text_units_df = self._dfs.get("text_units")
    documents_df = self._dfs.get("documents")
    if text_units_df is None or documents_df is None:
        return []

    short_ids = _collect_text_unit_short_ids(context_data)
    if not short_ids:
        return []

    # human_readable_id is stored as int; cast collected values defensively.
    # Values that do not parse as an int are skipped, not treated as errors.
    normalised_ids: set[int] = set()
    for sid in short_ids:
        try:
            normalised_ids.add(int(sid))
        except (TypeError, ValueError):
            continue
    if not normalised_ids:
        return []

    # Keep only the text units whose short id was cited in the context.
    matched_units = text_units_df[
        text_units_df["human_readable_id"].isin(list(normalised_ids))
    ]
    if matched_units.empty:
        return []

    # documents.id ↔ text_units.document_id
    doc_index = documents_df.set_index("id")[["title"]]

    # Group text units by document so each Reference carries a single
    # representative chunk excerpt from that document (largest chunk).
    sources: list[GraphSourceRef] = []
    seen_docs: set[str] = set()
    # Sort by chunk size descending so the most informative chunk wins
    # when multiple chunks from the same doc are cited.
    # Note: a list comprehension keeps pyright happy where
    # ``matched_units["text"].str.len()`` confuses the pandas type stubs
    # into thinking the column is an ndarray.
    sorted_units = matched_units.assign(
        _len=[len(str(v)) for v in matched_units["text"]]
    ).sort_values("_len", ascending=False)

    for _, row in sorted_units.iterrows():
        doc_id = row.get("document_id")
        # First row seen for a document wins; because of the sort above
        # that is its longest cited chunk. Rows without a document id are
        # dropped.
        if not doc_id or doc_id in seen_docs:
            continue
        seen_docs.add(doc_id)

        # A text unit may point at a document that is absent from the
        # documents table; fall back to an empty title in that case.
        if doc_id in doc_index.index:
            raw_title = str(doc_index.loc[doc_id, "title"] or "")
        else:
            raw_title = ""
        display_title, link = _doc_title_to_display(raw_title)

        sources.append(
            GraphSourceRef(
                # With no usable title, label the reference with the
                # first 12 characters of the document id.
                title=display_title or f"Document {doc_id[:12]}",
                link=link,
                content=str(row.get("text") or ""),
                source="graphrag",
            )
        )

    return sources
async def _load_manifest(self) -> dict[str, Any] | None:
    """Fetch and parse ``latest.json`` from the blob container root.

    Returns:
        The parsed manifest when the blob exists and decodes to a JSON
        object. ``None`` when the blob is missing, is not valid UTF-8 /
        JSON, or parses to something other than an object — callers then
        fall back to assuming an unversioned root layout.
    """
    import json

    payload = await download_blob(self._blob_container, "latest.json")
    if payload is None:
        logger.info(
            "GraphRAG manifest not found at %s/latest.json — using unversioned layout",
            self._blob_container,
        )
        return None

    try:
        decoded = payload.decode("utf-8")
        parsed = json.loads(decoded)
    except (ValueError, UnicodeDecodeError) as exc:
        logger.warning(
            "GraphRAG manifest unparseable (%s); falling back to unversioned layout",
            exc,
        )
        return None

    # Only a JSON object is usable as a manifest; lists, strings, numbers
    # and ``null`` are rejected.
    if isinstance(parsed, dict):
        return parsed

    logger.warning(
        "GraphRAG manifest is not a JSON object; falling back to unversioned layout"
    )
    return None
async def _load_parquets_from_blob(
    self, snapshot_prefix: str
) -> "dict[str, pd.DataFrame]":
    """Download the required parquet files and load them as DataFrames.

    The files are staged in a temporary directory that is removed again
    before this method returns, whether the load succeeds or fails.
    Previously the directory came from ``tempfile.mkdtemp`` and was never
    deleted, so every load / reload (and every failed download) left a
    copy of the parquet files on disk for the lifetime of the process.

    Args:
        snapshot_prefix: Blob-name prefix of the snapshot to read;
            ``""`` means the parquets sit at the container root.

    Returns:
        A mapping of table name to DataFrame, one entry per name in
        ``_REQUIRED_PARQUETS``.

    Raises:
        FileNotFoundError: when a required parquet blob is missing.
    """
    # The DataFrames are fully read inside the ``with`` block, so the
    # staged files are no longer needed once it exits.
    with tempfile.TemporaryDirectory(prefix="graphrag-output-") as staging:
        temp_dir = Path(staging)
        logger.info(
            "Downloading GraphRAG parquets from blob container '%s' (prefix='%s') "
            "to %s",
            self._blob_container,
            snapshot_prefix,
            temp_dir,
        )

        # Fetch all tables concurrently; the first failure propagates.
        await asyncio.gather(
            *(
                self._download_one_parquet(name, snapshot_prefix, temp_dir)
                for name in _REQUIRED_PARQUETS
            )
        )

        return await asyncio.to_thread(self._load_parquets_from_path, temp_dir)
id we can find. + + DRIFT returns a nested dict (``{sub_query: {table: DataFrame}}``) but + the exact shape can vary across graphrag versions. We treat any value + that is a pandas DataFrame *and* has both ``id`` and ``text`` columns as + a candidate "sources" table — the convention used by + ``build_text_unit_context``. + """ + import pandas as pd # local import keeps top-level import lazy + + found: set[str] = set() + + def visit(node: Any) -> None: + if node is None: + return + if isinstance(node, pd.DataFrame): + cols = set(node.columns) + if {"id", "text"}.issubset(cols): + for value in node["id"].astype(str).tolist(): + if value: + found.add(value) + return + if isinstance(node, dict): + for v in node.values(): + visit(v) + return + if isinstance(node, (list, tuple, set)): + for v in node: + visit(v) + return + + visit(context_data) + return found + + +def _doc_title_to_display(raw_title: str) -> tuple[str, str]: + """Convert a stored ``documents.title`` into ``(display_title, link)``. + + The sync project encodes original file paths by replacing ``/`` and + ``os.sep`` with ``#`` (see ``daily_sync.py`` / ``typespec_processor.py`` + in ``azure-sdk-qa-bot-knowledge-graph-sync``). We reverse that here so + titles look like ordinary paths in the agent's reference list. + + ``link`` is best-effort: we return the path-style title so downstream + rendering can prefix a base URL if appropriate, or leave it as a path + when no URL prefix is known. We do not have per-document + ``KnowledgeSource`` context at query time. 
+ """ + title = (raw_title or "").strip() + if not title: + return "", "" + pretty = title.replace("#", "/") + return pretty, pretty + + + +# --------------------------------------------------------------------------- # +# Singleton +# --------------------------------------------------------------------------- # + +_service: KnowledgeGraphService | None = None + + +def get_knowledge_graph_service() -> KnowledgeGraphService: + """Return the shared KnowledgeGraphService instance.""" + global _service + if _service is None: + _service = KnowledgeGraphService() + return _service diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/README.md b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/README.md new file mode 100644 index 00000000000..d0acc7c77d8 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/README.md @@ -0,0 +1,178 @@ +# Azure SDK QA Bot — Knowledge Graph Sync + +Python-based knowledge graph sync pipeline for the Azure SDK QA Bot. This project: + +1. **Syncs documentation** from multiple repositories into Azure Blob Storage +2. **Builds a knowledge graph** using Microsoft's [GraphRAG](https://github.com/microsoft/graphrag) library with Azure AI Search as the native vector store +3. **Incremental indexing** — uses GraphRAG's built-in `update` command to only re-process documents that changed + +## Architecture + +``` +┌─────────────────┐ ┌──────────────┐ +│ Git Repos │────▶│ Blob Store │ +│ (TypeSpec, │ │ (markdown) │ +│ Guidelines) │ └──────┬───────┘ +└─────────────────┘ │ + ▼ + ┌──────────────────────┐ + │ GraphRAG Pipeline │ + │ • Entity extraction │ + │ • Community detect │ + │ • Embedding gen │ + └──────────┬───────────┘ + │ + ┌──────────▼───────────┐ + │ Azure AI Search │ + │ (vector store) │ + │ • text_unit_text │ + │ • entity_description│ + │ • community_content │ + └──────────────────────┘ +``` + +**Key insight**: GraphRAG natively supports Azure AI Search as a vector store. 
Instead of maintaining a separate search indexing pipeline, GraphRAG handles all vector embedding, chunking, and index writes automatically during its `index`/`update` commands. + +## Incremental Indexing + +Uses GraphRAG's native `update` command for incremental processing: + +1. **Doc sync** identifies which blob paths changed and which were deleted +2. **Deleted files** are removed from `graphrag_config/input/` +3. **Changed files** are downloaded (additive) to `graphrag_config/input/` +4. **`graphrag update`** processes only new/modified documents and merges results into existing graph +5. **Update output** is merged back into the main output directory for subsequent runs + +If no prior index exists, the system automatically falls back to a full `graphrag index`. + +Use `--full-graphrag` to force a complete rebuild when needed (e.g., after changing extraction prompts). + +## Query Modes + +GraphRAG provides four built-in query modes (used by the bot agent at runtime): + +| Mode | Best For | +|------|----------| +| **Local Search** | Specific entity questions (fans out to neighbors + text chunks) | +| **Global Search** | Holistic questions requiring cross-document reasoning | +| **DRIFT Search** | Multi-hop reasoning with community context | +| **Basic Search** | Standard vector RAG (top-k text unit similarity) | + +## Prerequisites + +- Python 3.11+ +- Azure credentials (DefaultAzureCredential / Managed Identity) +- Access to Azure Blob Storage, Azure AI Search, and Azure OpenAI + +## Setup + +```bash +# Install with dev dependencies +pip install -e ".[dev]" + +# Or production only +pip install -e . 
+``` + +## Usage + +```bash +# Normal daily run: sync docs + incremental graph update +sync-knowledge-graph + +# Sync docs only (skip graph indexing) +sync-knowledge-graph --skip-graphrag + +# GraphRAG only (skip doc sync, use existing blobs) +sync-knowledge-graph --graphrag-only + +# Force full graph rebuild (re-indexes all sources) +sync-knowledge-graph --full-graphrag + +# Specific sources for full re-index +sync-knowledge-graph --full-graphrag --sources typespec_docs,azure_api_guidelines +``` + +## Environment Variables + +The pipeline reads its bootstrap endpoints from environment variables; +everything else is pulled from Azure App Configuration and Azure Key Vault +at startup (see `src/services/app_config.py` and `src/services/app_secret.py`). + +| Variable | Source | Description | +|----------|--------|-------------| +| `AZURE_APPCONFIG_ENDPOINT` | env | Azure App Configuration endpoint. All other config keys are loaded from here. | +| `KEYVAULT_ENDPOINT` | App Config | Azure Key Vault endpoint. Loaded from App Config, then secrets are exported to env. | +| `STORAGE_ACCOUNT_NAME` | App Config | Azure Storage account name. | +| `STORAGE_KNOWLEDGE_CONTAINER` | App Config | Blob container for processed docs. | +| `STORAGE_GRAPHRAG_OUTPUT_CONTAINER` | App Config | Destination container for parquet snapshots (e.g. `graphrag-output`). When unset, the post-indexing publish step degrades to a logged no-op. | +| `AI_SEARCH_BASE_URL` | App Config | Azure AI Search endpoint URL — referenced as `${AI_SEARCH_BASE_URL}` by `graphrag_config/settings.yaml`. | +| `AI_SEARCH_INDEX_TEXT_UNITS` | App Config | AI Search index for text unit embeddings. | +| `AI_SEARCH_INDEX_ENTITIES` | App Config | AI Search index for entity embeddings. | +| `AI_SEARCH_INDEX_COMMUNITIES` | App Config | AI Search index for community embeddings. | +| `AI_SEARCH_API_KEY` | Key Vault (`AI-SEARCH-APIKEY`) | AI Search admin key. 
| +| `AOAI_CHAT_COMPLETIONS_ENDPOINT` | App Config | Azure OpenAI endpoint (used by GraphRAG and `spector_processor`). | +| `AOAI_CHAT_COMPLETIONS_API_KEY` | Key Vault (`AOAI-CHAT-COMPLETIONS-API-KEY`) | Azure OpenAI key for non-MI callers (`spector_processor`). | +| `AOAI_CHAT_REASONING_MODEL` | App Config | Azure OpenAI deployment name used by `spector_processor`. | +| `SSH_PRIVATE_KEY` | Key Vault (`SSH-PRIVATE-KEY`) | SSH private key (for private repos cloned over SSH). | +| `AZURE_SDK_GITHUB_PAT` | env (CI) | GitHub App token for private repo access. | +| `AZURE_SDK_DOCS_PATH` | env (CI) | Local path to the `azure-sdk-docs-eng.ms` clone (used when `authType: local`). | +| `AZURE_SDK_WIKI_PATH` | env (CI) | Local path to the `internal.wiki` clone (used when `authType: local`). | +| `BOT_AGENT_RELOAD_URL` | env (CI) | Bot agent reload endpoint (e.g. `https://<bot-agent-host>/admin/graphrag/reload`). When unset, the publish step skips notification with a warning. | +| `BOT_AGENT_ADMIN_TOKEN` | env (CI) | Shared secret sent as `X-Admin-Token` to the bot reload endpoint. 
| + +## Testing + +```bash +python -m pytest tests/ -v +``` + +## Project Structure + +``` +src/ +├── main.py # CLI entry point +├── daily_sync.py # Main sync orchestrator (returns SyncResult) +├── services/ +│ ├── app_config.py # Azure App Configuration +│ ├── app_secret.py # Key Vault secrets +│ ├── configuration_loader.py # Config parser +│ ├── metadata_resolver.py # Glob-based metadata +│ ├── storage_service.py # Blob Storage CRUD + download helpers +│ ├── spector_processor.py # TypeSpec scenarios (OpenAI) +│ └── typespec_processor.py # TypeSpec AST → markdown +└── graphrag/ + └── run_indexing.py # GraphRAG pipeline orchestration + +graphrag_config/ +├── settings.yaml # GraphRAG config (AI Search vector store) +└── prompts/ # Custom extraction prompts + +config/ +├── knowledge-config.json # Repository and documentation sources +└── knowledge-config.schema.json + +tests/ +├── test_daily_sync.py # Core function tests +└── test_configuration_loader.py # Config loader tests +``` + +## Key Design Decisions + +- **GraphRAG as single indexing engine**: No custom search indexing or Cosmos upload code. GraphRAG handles entity extraction, embedding generation, and vector store writes natively via its `azure_ai_search` vector store backend. +- **Native incremental update**: Uses `graphrag update` instead of custom change-tracking logic for the graph. The doc sync still detects file-level changes to minimize unnecessary downloads. +- **Blob Storage as source of truth**: Raw processed markdown is stored in blobs. GraphRAG reads from a local `input/` directory populated from these blobs. +- **Managed Identity auth**: Uses Azure Managed Identity for both Azure OpenAI and AI Search (no API keys in config). 
+- **12 entity types**: Decorator, Pattern, Tool, Service, API, ErrorCode, Guideline, Library, Operation, Model, Configuration, Protocol + +## Pipelines + +| File | Purpose | +|------|---------| +| `ci.yml` | Build + tests on every PR and on `main` (path-scoped to this project). | +| `sync_knowledge_graph.yml` | Daily scheduled run (03:00 UTC) on an internal 1ES agent — checks out the internal docs/wiki repos, installs the project, runs `sync-knowledge-graph`, publishes the new parquet snapshot to blob storage, and POSTs the bot agent's `/admin/graphrag/reload` endpoint. Mirrors `azure-sdk-qa-bot-knowledge-sync/sync_knowledge.yml`. | + +## Relationship to Other Projects + +- **[azure-sdk-qa-bot-agent](../azure-sdk-qa-bot-agent/)** — The QA bot that queries the knowledge base at runtime using GraphRAG's query API (local/global/drift/basic search). +- **[azure-sdk-qa-bot-knowledge-sync](../azure-sdk-qa-bot-knowledge-sync/)** — The original TypeScript implementation (doc sync only, no graph). This Python project is a full port + GraphRAG replacement. 
diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/ci.yml b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/ci.yml new file mode 100644 index 00000000000..57e691a3032 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/ci.yml @@ -0,0 +1,62 @@ +trigger: + branches: + include: + - main + paths: + include: + - tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/ + +pr: + branches: + include: + - main + paths: + include: + - tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/ + +extends: + template: /eng/pipelines/templates/stages/1es-redirect.yml + parameters: + Use1ESOfficial: false + stages: + - stage: "AzureSDKQABot_KnowledgeGraphSync_CI" + jobs: + - job: "AzureSDKQABot_KnowledgeGraphSync_CI" + displayName: "Run Knowledge Graph Sync CI" + + variables: + - template: /eng/pipelines/templates/variables/globals.yml + - template: /eng/pipelines/templates/variables/image.yml + + pool: + name: "$(LINUXPOOL)" + image: "$(LINUXVMIMAGE)" + os: linux + + steps: + - task: UsePythonVersion@0 + displayName: "Use Python 3.12" + inputs: + versionSpec: '3.12' + + - script: | + python --version + pip --version + displayName: "Show Python version" + + - script: | + pip install -e ".[dev]" + displayName: "Install dependencies" + workingDirectory: $(Build.SourcesDirectory)/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync + + - script: | + python -m pytest tests/ --junitxml=pytest-report.xml -v + displayName: "Run Tests" + workingDirectory: $(Build.SourcesDirectory)/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync + + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + testResultsFiles: 'tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/pytest-report.xml' + testRunTitle: 'Knowledge Graph Sync tests' + displayName: 'Publish test results' diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/config/knowledge-config.json 
b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/config/knowledge-config.json new file mode 100644 index 00000000000..2bfd0c6cdd3 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/config/knowledge-config.json @@ -0,0 +1,651 @@ +{ + "$schema": "./knowledge-config.schema.json", + "description": "Unified configuration for knowledge sources including repositories and documentation paths", + "version": "1.0.0", + "sources": [ + { + "repository": { + "url": "https://github.com/microsoft/typespec.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "path": "/website/src/content/docs/docs", + "description": "TypeSpec documentation for API design", + "folder": "typespec_docs", + "fileNameLowerCase": true, + "metadata": { + "scope": "unbranded" + } + }, + { + "description": "TypeSpec HTTP specification test cases", + "path": "/packages/http-specs/specs", + "folder": "typespec_http_specs", + "fileNameLowerCase": true, + "isGenerated": true, + "metadata": { + "scope": "unbranded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/typespec-azure.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "path": "/website/src/content/docs/docs", + "description": "TypeSpec Azure-specific documentation", + "folder": "typespec_azure_docs", + "fileNameLowerCase": true, + "metadata": { + "scope": "branded" + }, + "overrides": [ + { + "pattern": "**/azure-resource-manager/**", + "metadata": { + "service_type": "management-plane" + } + }, + { + "pattern": "**/getstarted/azure-core/**", + "metadata": { + "service_type": "data-plane" + } + }, + { + "pattern": "**/arm/**", + "metadata": { + "service_type": "management-plane" + } + }, + { + "pattern": "**/migrate-swagger/checklists/migrate-arm-tips/**", + "metadata": { + "service_type": "management-plane" + } + }, + { + "pattern": "**/migrate-swagger/checklists/migrate-dp-tips/**", + "metadata": { + "service_type": "data-plane" + } + } + ] + }, + { 
+ "description": "TypeSpec Azure HTTP specification test cases", + "path": "/packages/azure-http-specs/specs", + "folder": "typespec_azure_http_specs", + "fileNameLowerCase": true, + "isGenerated": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "TypeSpec Azure Resource Manager library", + "path": "/packages/typespec-azure-resource-manager/lib", + "folder": "typespec-azure-resource-manager-lib", + "isGenerated": true, + "ignoredPaths": ["legacy-types"], + "metadata": { + "scope": "branded", + "service_type": "management-plane" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-rest-api-specs.wiki.git", + "branch": "master", + "authType": "public" + }, + "paths": [ + { + "description": "Wiki documentation for Azure REST API specifications", + "folder": "azure_rest_api_specs_wiki", + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-python.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "Documentation for Azure SDK for Python", + "path": "/doc", + "folder": "azure_sdk_for_python_docs", + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/microsoft/api-guidelines.git", + "path": "api-guidelines", + "branch": "vNext", + "authType": "public" + }, + "paths": [ + { + "description": "Microsoft API design guidelines for Azure", + "path": "/azure", + "folder": "azure_api_guidelines", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded", + "service_type": "data-plane" + } + } + ] + }, + { + "repository": { + "url": "git@github-microsoft:cloud-and-ai-microsoft/resource-provider-contract.git", + "path": "resource-provider-contract", + "branch": "master", + "authType": "ssh", + "sshHost": "github-microsoft" + }, + "paths": [ + { + "description": "Azure Resource Manager resource provider contract documentation", + "path": "/v1.0", + "folder": 
"azure_resource_manager_rpc", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded", + "service_type": "management-plane" + } + } + ] + }, + { + "repository": { + "url": "https://azure-sdk@dev.azure.com/azure-sdk/internal/_git/azure-sdk-docs-eng.ms", + "branch": "main", + "authType": "local", + "localPathEnv": "AZURE_SDK_DOCS_PATH" + }, + "paths": [ + { + "description": "Internal Azure SDK engineering documentation", + "path": "/docs", + "folder": "azure-sdk-docs-eng", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://azure-sdk@dev.azure.com/azure-sdk/internal/_git/internal.wiki", + "branch": "wikiMaster", + "authType": "local", + "localPathEnv": "AZURE_SDK_WIKI_PATH" + }, + "paths": [ + { + "description": "Testing Guidelines for Azure SDK - main page", + "path": "/Engineering-System/Testing-Guidelines.md", + "folder": "azure-sdk-internal-wiki", + "relativeByRepoPath": true + }, + { + "description": "Azure DevOps Pipeline Guidance - main page", + "path": "/Engineering-System/Pipelines/Azure-DevOps-Pipeline-Guidance.md", + "folder": "azure-sdk-internal-wiki", + "relativeByRepoPath": true + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "General Azure SDK guidelines and best practices", + "path": "/docs", + "folder": "azure-sdk-guidelines", + "ignoredPaths": ["redirects"], + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-net.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": ".NET Azure SDK documentation", + "path": "/doc", + "folder": "azure_sdk_for_net_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": ".NET Azure Resource Manager SDK documentation", + "path": 
"/sdk/resourcemanager/Azure.ResourceManager/docs", + "folder": "azure_sdk_for_net_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-go.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "Documentation for Azure SDK for Go", + "path": "/documentation", + "folder": "azure_sdk_for_go_docs", + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-cpp.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "General guide for Azure SDK for C++", + "path": "/README.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Documentation for Azure SDK for C++", + "path": "/doc", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Contributing guidelines for Azure SDK for C++", + "path": "/CONTRIBUTING.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Samples guide for Azure SDK for C++", + "path": "/samples/README.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "CMake and vcpkg integration guide for Azure SDK for C++", + "path": "/samples/integration/cmake-vcpkg/README.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "CMake FetchContent integration guide for Azure SDK for C++", + "path": "/samples/integration/cmake-fetch-content/README.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Beta packages with 
vcpkg guide for Azure SDK for C++", + "path": "/samples/integration/beta-packages-vcpkg/README.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Storage FAQ and Best Practices for Azure SDK for C++", + "path": "/sdk/storage/faq.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Storage Testing Guide for Azure SDK for C++", + "path": "/sdk/storage/TestingGuide.md", + "folder": "azure_sdk_for_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/typespec-cpp.git", + "branch": "main", + "authType": "token", + "tokenEnvVar": "AZURE_SDK_GITHUB_PAT" + }, + "paths": [ + { + "description": "Emitter README for TypeSpec C++ code generation", + "path": "/packages/typespec-cpp/README.md", + "folder": "typespec_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Development Guide for TypeSpec C++ code generation", + "path": "/docs/development.md", + "folder": "typespec_cpp_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-rust.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "General guide for Azure SDK for Rust", + "path": "/README.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Consumer documentation for Azure SDK for Rust", + "path": "/doc", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Developer guides for Azure SDK for Rust", + "path": "/doc/dev", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": 
true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Contributing guidelines for Azure SDK for Rust", + "path": "/CONTRIBUTING.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Core README for Azure SDK for Rust", + "path": "/sdk/core/azure_core/README.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Identity README for Azure SDK for Rust", + "path": "/sdk/identity/azure_identity/README.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "TypeSpec Client Core README for Azure SDK for Rust", + "path": "/sdk/core/typespec_client_core/README.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Performance Testing Framework README for Azure SDK for Rust", + "path": "/sdk/core/azure_core_test/src/perf/README.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Identity Troubleshooting Guide for Azure SDK for Rust", + "path": "/sdk/identity/azure_identity/TROUBLESHOOTING.md", + "folder": "azure_sdk_for_rust_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-java.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "Documentation for Azure SDK for Java", + "path": "/doc", + "folder": "azure_sdk_for_java_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + }, + { + "description": "Resource Manager documentation for Azure SDK for Java", + "path": "/sdk/resourcemanager/docs", + "folder": "azure_sdk_for_java_docs", + "relativeByRepoPath": 
true, + "metadata": { + "scope": "branded", + "service_type": "management-plane" + } + }, + { + "description": "Contributing guidelines for Azure SDK for Java", + "path": "/CONTRIBUTING.md", + "folder": "azure_sdk_for_java_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-java.wiki.git", + "branch": "master", + "authType": "public" + }, + "paths": [ + { + "description": "Wiki documentation for Azure SDK for Java", + "folder": "azure_sdk_for_java_wiki", + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/autorest.java.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "AutoRest Java customization documentation", + "path": "/customization-base/README.md", + "folder": "autorest_java_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-js.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "Documentation for Azure SDK for JavaScript", + "path": "/documentation", + "folder": "azure_sdk_for_js_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-for-js.wiki.git", + "path": "azure-sdk-for-js.wiki", + "branch": "master", + "authType": "public" + }, + "paths": [ + { + "description": "Wiki documentation for Azure SDK for JavaScript", + "folder": "azure_sdk_for_js_wiki", + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-rest-api-specs.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "Azure REST API Breaking Change and Oad Rules Mapping", + "path": "/documentation/BreakingChange-Oad-Rules-Mapping.md", + "folder": 
"azure_rest_api_specs_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/openapi-diff.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "OpenAPI Diff (OAD) rules documentation", + "path": "/docs", + "folder": "azure_openapi_diff_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + }, + { + "repository": { + "url": "https://github.com/Azure/azure-sdk-tools.git", + "branch": "main", + "authType": "public" + }, + "paths": [ + { + "description": "Documentation for Azure SDK Tools", + "path": "/tools/js-sdk-release-tools/docs/automation-pipeline.md", + "folder": "azure_sdk_tools_docs", + "relativeByRepoPath": true, + "metadata": { + "scope": "branded" + } + } + ] + } + ] +} \ No newline at end of file diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/config/knowledge-config.schema.json b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/config/knowledge-config.schema.json new file mode 100644 index 00000000000..62419037de4 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/config/knowledge-config.schema.json @@ -0,0 +1,188 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Knowledge Configuration", + "description": "Schema for unified knowledge source configuration including repositories and documentation sources", + "type": "object", + "properties": { + "$schema": { + "type": "string", + "description": "Reference to this schema file" + }, + "description": { + "type": "string", + "description": "Human-readable description of this configuration" + }, + "version": { + "type": "string", + "description": "Version of the configuration format", + "pattern": "^\\d+\\.\\d+\\.\\d+$" + }, + "sources": { + "type": "array", + "description": "Array of knowledge sources with their repositories and documentation", + "items": { + "$ref": 
"#/$defs/Source" + } + } + }, + "required": ["version", "sources"], + "additionalProperties": false, + "$defs": { + "Source": { + "type": "object", + "description": "A source containing repository and documentation configuration", + "properties": { + "repository": { + "$ref": "#/$defs/Repository" + }, + "paths": { + "type": "array", + "description": "Documentation paths within this repository", + "items": { + "$ref": "#/$defs/DocumentationPath" + } + } + }, + "required": ["repository", "paths"], + "additionalProperties": false + }, + "Repository": { + "type": "object", + "description": "Git repository configuration", + "properties": { + "url": { + "type": "string", + "description": "Git repository URL" + }, + "path": { + "type": "string", + "description": "Local path where repository will be cloned" + }, + "branch": { + "type": "string", + "description": "Git branch to checkout", + "default": "main" + }, + "authType": { + "type": "string", + "description": "Authentication type for repository access", + "enum": ["public", "ssh", "token", "local"] + }, + "sshHost": { + "type": "string", + "description": "SSH host configuration (required for SSH auth)" + }, + "tokenEnvVar": { + "type": "string", + "description": "Environment variable containing access token (required for token auth)" + }, + "localPathEnv": { + "type": "string", + "description": "Environment variable containing local repository path (required for local auth)" + } + }, + "required": ["url", "branch", "authType"], + "additionalProperties": false, + "allOf": [ + { + "if": { + "properties": { + "authType": { + "const": "ssh" + } + } + }, + "then": { + "required": ["sshHost"] + } + } + ] + }, + "DocumentationPath": { + "type": "object", + "description": "Documentation path configuration", + "properties": { + "description": { + "type": "string", + "description": "Human-readable description of the documentation path" + }, + "path": { + "type": "string", + "description": "Path to the documentation files within 
the repository" + }, + "folder": { + "type": "string", + "description": "Destination folder name for processed documentation" + }, + "fileNameLowerCase": { + "type": "boolean", + "description": "Whether to convert filenames to lowercase", + "default": false + }, + "ignoredPaths": { + "type": "array", + "description": "Paths to ignore during processing", + "items": { + "type": "string" + } + }, + "relativeByRepoPath": { + "type": "boolean", + "description": "Whether paths are relative to the repository root", + "default": false + }, + "isGenerated": { + "type": "boolean", + "description": "Whether this source is a generated markdown.", + "default": false + }, + "metadata": { + "$ref": "#/$defs/Metadata", + "description": "Default metadata for all files in this path" + }, + "overrides": { + "type": "array", + "description": "File-specific metadata overrides using glob patterns", + "items": { + "$ref": "#/$defs/Override" + } + } + }, + "required": ["description", "folder"], + "additionalProperties": false + }, + "Metadata": { + "type": "object", + "description": "Hierarchical metadata for categorizing documentation", + "properties": { + "scope": { + "type": "string", + "enum": ["branded", "unbranded"], + "description": "Target audience: branded (Azure-specific) or unbranded (general TypeSpec)" + }, + "service_type": { + "type": "string", + "enum": ["data-plane", "management-plane"], + "description": "Service plane: data-plane (DPG), management-plane (MPG/ARM)" + } + }, + "additionalProperties": false + }, + "Override": { + "type": "object", + "description": "File-specific metadata override using glob pattern matching", + "properties": { + "pattern": { + "type": "string", + "description": "Glob pattern to match files (e.g., '**/Azure Data Plane Service/**', '**/*paging*.md')" + }, + "metadata": { + "$ref": "#/$defs/Metadata", + "description": "Metadata to apply to matching files (inherits from parent, only specified fields are overridden)" + } + }, + "required": 
["pattern", "metadata"], + "additionalProperties": false + } + } +} diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/graphrag_config/settings.yaml b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/graphrag_config/settings.yaml new file mode 100644 index 00000000000..dc00b5ba967 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/graphrag_config/settings.yaml @@ -0,0 +1,56 @@ +# GraphRAG Configuration — Azure SDK QA Bot Knowledge Graph +# Only non-default settings are specified here. +# See https://microsoft.github.io/graphrag/config/yaml/ for all options. + +completion_models: + default_completion_model: + model_provider: azure + model: gpt-5.4 + auth_method: azure_managed_identity + api_base: ${AOAI_CHAT_COMPLETIONS_ENDPOINT} + api_version: "2024-12-01-preview" + cognitive_services_endpoint: https://cognitiveservices.azure.com/.default + +embedding_models: + default_embedding_model: + model_provider: azure + model: text-embedding-3-small + auth_method: azure_managed_identity + api_base: ${AOAI_CHAT_COMPLETIONS_ENDPOINT} + api_version: "2024-12-01-preview" + cognitive_services_endpoint: https://cognitiveservices.azure.com/.default + # Throttle to stay under Sweden Central S0 embedding limits (350 RPM / 350K TPM nominal, + # but bursts trigger 429s). Keep healthy headroom. + rate_limit: + type: sliding_window + period_in_seconds: 60 + requests_per_period: 60 + tokens_per_period: 100000 + retry: + type: exponential_backoff + max_retries: 10 + base_delay: 5.0 + max_delay: 120.0 + jitter: true + +input: + file_type: text + base_dir: "input" + # Match both .md and .mdx — daily_sync._process_source_directory + # already harvests both extensions, so GraphRAG must index both too. 
+ file_pattern: ".*\\.mdx?$$" + +vector_store: + type: azure_ai_search + url: ${AI_SEARCH_BASE_URL} + audience: "https://search.azure.com" + index_schema: + text_unit_text: + index_name: "azuresdkqabot-dev-search-index-text-units" + vector_size: 1536 + entity_description: + index_name: "azuresdkqabot-dev-search-index-entities" + vector_size: 1536 + community_full_content: + index_name: "azuresdkqabot-dev-search-index-communities" + vector_size: 1536 diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/pyproject.toml b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/pyproject.toml new file mode 100644 index 00000000000..feb0d04b909 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "azure-sdk-qa-bot-knowledge-graph-sync" +version = "1.0.0" +description = "Knowledge graph sync pipeline for Azure SDK QA Bot — processes documentation and builds knowledge graph using GraphRAG" +requires-python = ">=3.11,<3.14" +dependencies = [ + "azure-identity>=1.20.0", + "azure-storage-blob>=12.20.0", + "azure-search-documents>=11.6.0", + "azure-appconfiguration>=1.6.0", + "azure-keyvault-secrets>=4.9.0", + "openai>=1.0.0", + "graphrag>=3.1.0", + "python-dotenv>=1.0.0", + "pyyaml>=6.0", + "wcmatch>=8.5", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.23", +] + +[project.scripts] +sync-knowledge-graph = "src.main:main" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +asyncio_mode = "auto" diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/__init__.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/__init__.py new file mode 100644 index 00000000000..f77c8ece25f --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/__init__.py @@ -0,0 +1 @@ +"""Azure SDK 
@dataclass
class ProcessedFile:
    """A single markdown document after conversion, ready for change detection and upload."""

    # Output filename: the frontmatter ``permalink`` when present, otherwise the
    # source-relative path with path separators replaced by "#".
    filename: str
    # Converted markdown text (frontmatter removed, code fences escaped).
    content: str
    # Destination blob name, built as "<source.folder>/<filename>".
    blob_path: str
    # False marks a file that was deliberately skipped; such entries are
    # ignored by upload, cleanup and the sync result.
    is_valid: bool
    # Metadata resolved for this file, or None when the source defines none.
    metadata: Metadata | None = None


@dataclass
class ProcessResult:
    """Outcome of processing one documentation source (file or directory)."""

    # Number of files handed to the markdown processor (counted before the
    # validity check, so skipped files are included).
    total_processed: int = 0
    # Number of files whose content differs from the stored blob.
    changed_documents: int = 0
    # Number of files with neither content nor metadata changes.
    unchanged_documents: int = 0
    # Files whose content changed.
    changed_files: list[ProcessedFile] = field(default_factory=list)
    # Files whose content is unchanged but whose blob metadata differs.
    metadata_changed_files: list[ProcessedFile] = field(default_factory=list)
    # Files that match the stored blob in both content and metadata.
    unchanged_files: list[ProcessedFile] = field(default_factory=list)


@dataclass
class SyncResult:
    """Result of the daily sync — used to drive incremental GraphRAG."""

    # Blob paths of valid files whose content changed in this run.
    changed_blob_paths: list[str] = field(default_factory=list)
    # Blob paths removed by the expired-blob cleanup step.
    deleted_blob_paths: list[str] = field(default_factory=list)
def _redact_secret(text: str, secret: str | None) -> str:
    """Return *text* with every occurrence of *secret* replaced by ``***``.

    A falsy *secret* leaves *text* untouched.
    """
    if not secret:
        return text
    return text.replace(secret, "***")


def _setup_repositories(docs_dir: str) -> None:
    """Clone/checkout all configured repositories into *docs_dir*.

    Each repository is handled independently: a failure is logged and the
    loop moves on to the next repository. Log output never contains the
    access token — ``str(CalledProcessError)`` embeds the full git command
    line (including the authenticated clone URL), so failures are reported
    via the return code and a redacted stderr instead.

    Args:
        docs_dir: Directory under which each repository is placed at
            ``repo.path``.
    """
    # Configure git HTTP/1.1 (best effort — a failure here must not stop the sync).
    try:
        subprocess.run(
            ["git", "config", "--global", "http.version", "HTTP/1.1"],
            capture_output=True,
        )
    except Exception as e:
        logger.warning("Could not configure git http.version: %s", e)

    _setup_ssh_config()
    repos = ConfigurationLoader.get_repository_configs()

    for repo in repos:
        # Remembered up front so every error path below can mask it.
        token = getattr(repo, "token", None)
        try:
            logger.info("Setting up %s...", repo.name)
            repo_path = os.path.join(docs_dir, repo.path)
            clone_url = _get_authenticated_url(repo)

            if repo.auth_type == "local":
                # Copy from local path (clone_url is a filesystem path here).
                for folder in repo.sparse_checkout or []:
                    src = os.path.join(clone_url, folder)
                    dst = os.path.join(repo_path, folder)
                    if os.path.exists(src):
                        shutil.copytree(src, dst, dirs_exist_ok=True)
            else:
                os.makedirs(docs_dir, exist_ok=True)
                if repo.sparse_checkout:
                    subprocess.run(
                        ["git", "clone", "--filter=blob:none", "--sparse", clone_url, repo.path],
                        cwd=docs_dir,
                        capture_output=True,
                        check=True,
                        env=os.environ,
                    )
                    subprocess.run(
                        ["git", "config", "core.sparseCheckout", "true"],
                        cwd=repo_path,
                        capture_output=True,
                        check=True,
                    )
                    sparse_file = os.path.join(repo_path, ".git/info/sparse-checkout")
                    Path(sparse_file).write_text("\n".join(repo.sparse_checkout))
                    subprocess.run(
                        ["git", "checkout", repo.branch],
                        cwd=repo_path,
                        capture_output=True,
                        check=True,
                        env=os.environ,
                    )
                else:
                    subprocess.run(
                        ["git", "clone", clone_url, repo.path],
                        cwd=docs_dir,
                        capture_output=True,
                        check=True,
                        env=os.environ,
                    )

            logger.info("%s setup completed", repo.name)
        except subprocess.CalledProcessError as e:
            # Do NOT log str(e): it contains the argv, i.e. the tokenised URL.
            stderr = e.stderr
            if isinstance(stderr, bytes):
                stderr = stderr.decode("utf-8", errors="replace")
            logger.error(
                "Error setting up %s: git exited with status %s: %s",
                repo.name,
                e.returncode,
                _redact_secret((stderr or "").strip(), token),
            )
            continue
        except Exception as e:
            logger.error(
                "Error setting up %s: %s", repo.name, _redact_secret(str(e), token)
            )
            continue


def _get_authenticated_url(repo: RepositoryConfig) -> str:
    """Get the clone location for *repo* based on its auth type.

    Returns:
        * ``public`` / ``ssh`` / unknown auth types: ``repo.url`` unchanged.
        * ``token``: ``repo.url`` with ``x-access-token:<token>@`` inserted
          after ``https://``.
        * ``local``: ``repo.local_path`` when set, otherwise ``repo.url``.

    Raises:
        RuntimeError: ``auth_type`` is ``token`` but no token is available.
    """
    if repo.auth_type == "public":
        return repo.url
    if repo.auth_type == "token":
        if not repo.token:
            raise RuntimeError(f"Token missing for {repo.name}")
        return repo.url.replace("https://", f"https://x-access-token:{repo.token}@")
    if repo.auth_type == "ssh":
        return repo.url
    if repo.auth_type == "local":
        return repo.local_path or repo.url
    return repo.url
def _process_source_directory(
    source_dir: str,
    source: DocumentationSource,
    target_dir: str,
    existing_blobs: dict,
    blob_service: BlobService,
) -> ProcessResult:
    """Convert every markdown file of one documentation source and classify it.

    ``source_dir`` may be a single file or a directory that is walked
    recursively for ``.md`` / ``.mdx`` files. Each valid file ends up in
    exactly one bucket of the returned ``ProcessResult``: content changed,
    only metadata changed, or unchanged. Files whose content changed are
    also written into ``target_dir``.
    """
    outcome = ProcessResult()

    def process_single(full_path: str, file_source_dir: str) -> None:
        outcome.total_processed += 1
        doc = _process_markdown_file(full_path, source, file_source_dir)
        if not doc.is_valid:
            return

        # Metadata in the shape stored on the blob, used for comparison.
        expected_metadata = None
        if doc.metadata:
            expected_metadata = {"scope": doc.metadata.scope}
            if doc.metadata.service_type:
                expected_metadata["service_type"] = doc.metadata.service_type

        content_differs = blob_service.has_content_changed(
            doc.blob_path, doc.content, existing_blobs
        )
        metadata_differs = blob_service.has_metadata_changed(
            doc.blob_path, expected_metadata, existing_blobs
        )

        if content_differs:
            outcome.changed_documents += 1
            outcome.changed_files.append(doc)
            destination = Path(os.path.join(target_dir, doc.filename))
            destination.write_text(doc.content, encoding="utf-8")
            return
        if metadata_differs:
            outcome.metadata_changed_files.append(doc)
            return
        outcome.unchanged_documents += 1
        outcome.unchanged_files.append(doc)

    def is_ignored(file_name: str, relative: str) -> bool:
        # Generated sources carry flattened names ("a#b.md"), so the ignore
        # prefixes are flattened the same way before comparing.
        if not source.ignored_paths:
            return False
        if source.is_generated:
            return any(
                file_name.startswith(p.replace("/", "#").replace("\\", "#"))
                for p in source.ignored_paths
            )
        return any(relative.startswith(p) for p in source.ignored_paths)

    # A source may point at a single file instead of a directory.
    if os.path.isfile(source_dir):
        process_single(source_dir, os.path.dirname(source_dir))
        return outcome

    for current_dir, _, file_names in os.walk(source_dir):
        for file_name in file_names:
            if not file_name.endswith((".md", ".mdx")):
                continue
            absolute = os.path.join(current_dir, file_name)
            relative = os.path.relpath(absolute, source_dir)

            if is_ignored(file_name, relative):
                continue

            # Reference files and release notes are not indexed individually.
            if relative.startswith("reference") or file_name.startswith("release-"):
                continue

            process_single(absolute, source_dir)

    return outcome
def preprocess_content(content: str) -> str:
    """Fix Azure AI Search markdown parser issues.

    1. Replace # at start of lines in code blocks with // (prevents header detection)
    2. Escape ``` code block delimiters

    Args:
        content: Raw markdown text.

    Returns:
        The markdown text with both fixes applied.
    """
    # Fix 1: Replace # in code blocks (only fences that declare a language match).
    def fix_code_block(m: re.Match) -> str:
        lang = m.group(1)
        code = m.group(2)
        transformed = re.sub(r"^#(\s*)", r"//\1", code, flags=re.MULTILINE)
        return f"```{lang}\n{transformed}```"

    result = re.sub(r"```(\w+)\s*\n([\s\S]*?)```", fix_code_block, content)

    # Fix 2: Escape code block delimiters
    result = re.sub(r"```(\w*)", r"\\`\\`\\`\1", result)

    return result


def convert_markdown(content: str) -> dict[str, str]:
    """Convert markdown: extract frontmatter, normalize content.

    Frontmatter is recognised only as the leading block of the document:
    the first non-blank line must be ``---`` and a later ``---`` line must
    close it. Any other ``---`` line (a horizontal rule, a YAML document
    separator inside a fenced block, ...) is ordinary content and is kept.
    An opening ``---`` without a closing one is treated as content too, so
    a malformed header can never swallow the document body.

    Args:
        content: Raw markdown text.

    Returns:
        ``{"filename": <frontmatter permalink or "">, "content": <body>}``.
        When the frontmatter has a ``title:``, the body is prefixed with a
        ``# <title>`` heading followed by a blank line.
    """
    title = ""
    filename = ""
    found_title = False

    lines = preprocess_content(content).split("\n")
    body = lines

    first = next((i for i, text in enumerate(lines) if text.strip()), None)
    if first is not None and lines[first].strip() == "---":
        closing = next(
            (i for i in range(first + 1, len(lines)) if lines[i].strip() == "---"),
            None,
        )
        if closing is not None:
            for line in lines[first + 1 : closing]:
                if line.startswith("title:"):
                    title = line[6:].strip().strip("\"'")
                    found_title = True
                if line.startswith("permalink:"):
                    filename = line[10:].strip().strip("\"'")
            # Keep everything outside the frontmatter block untouched.
            body = lines[:first] + lines[closing + 1 :]

    content_lines: list[str] = []
    # The heading is only added when there is at least one body line.
    if body and found_title:
        content_lines.extend((f"# {title}", ""))
    content_lines.extend(body)

    return {"filename": filename, "content": "\n".join(content_lines)}
def extract_release_info(content: str) -> dict[str, str]:
    """Read ``title``, ``releaseDate`` and ``version`` from leading frontmatter.

    Returns a dict with those three keys; a key stays ``""`` when the field
    (or the frontmatter itself) is absent. Surrounding quotes are removed
    from ``title`` and ``version`` but not from ``releaseDate``.
    """
    info = {"title": "", "releaseDate": "", "version": ""}
    frontmatter = re.match(r"^---\s*\n([\s\S]*?)\n---\s*", content)
    if not frontmatter:
        return info

    # (line prefix, result key, strip surrounding quotes?)
    fields = (
        ("title:", "title", True),
        ("releaseDate:", "releaseDate", False),
        ("version:", "version", True),
    )
    for raw_line in frontmatter.group(1).split("\n"):
        entry = raw_line.strip()
        for prefix, key, unquote in fields:
            if entry.startswith(prefix):
                value = entry[len(prefix):].strip()
                info[key] = value.strip("\"'") if unquote else value
                break
    return info


def extract_sections(content: str) -> str:
    """Return the release-note body with every header pushed one level down.

    Leading frontmatter and ``:::caution`` blocks are removed first.
    """
    without_frontmatter = re.sub(r"^---\s*\n[\s\S]*?\n---\s*\n", "", content)
    without_cautions = re.sub(r":::caution[\s\S]*?:::\s*", "", without_frontmatter)
    downgraded = re.sub(
        r"^(#+)\s+(.+)$", r"#\1 \2", without_cautions, flags=re.MULTILINE
    )
    return downgraded.strip()


# --- Upload and cleanup ---


def _upload_files(blob_service: BlobService, files: list[ProcessedFile]) -> None:
    """Upload every valid file in *files*, attaching its metadata when present."""
    uploaded = 0
    for item in (f for f in files if f.is_valid):
        blob_metadata = None
        if item.metadata:
            blob_metadata = {"scope": item.metadata.scope}
            if item.metadata.service_type:
                blob_metadata["service_type"] = item.metadata.service_type
        blob_service.put_blob(item.blob_path, item.content, blob_metadata)
        uploaded += 1
    logger.info("Uploaded %d files to blob storage", uploaded)


def _cleanup_expired_blobs(blob_service: BlobService, current_files: list[ProcessedFile]) -> list[str]:
    """Delete stored blobs that no longer correspond to a current file.

    Blobs whose name starts with ``static_`` are never touched. A failed
    delete is logged and skipped.

    Returns:
        The blob paths that were actually deleted.
    """
    stored = blob_service.list_blobs()
    keep = {f.blob_path for f in current_files if f.is_valid}
    removed: list[str] = []

    for path in stored:
        if path.startswith("static_") or path in keep:
            continue
        try:
            blob_service.delete_blob(path)
            removed.append(path)
        except Exception as e:
            logger.warning("Failed to delete blob %s: %s", path, e)

    logger.info("Cleaned up %d expired blobs", len(removed))
    return removed
= extract_sections(file_content) + content += header + sections + "\n" + except Exception as e: + logger.warning("Error reading release note %s: %s", file_path, e) + + index_path = os.path.join(target_dir, "version-release-notes-index.md") + Path(index_path).write_text(content, encoding="utf-8") + logger.info("Created release notes index for %s", source.folder) diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/graphrag/__init__.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/graphrag/__init__.py new file mode 100644 index 00000000000..4dd5a7707e4 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/graphrag/__init__.py @@ -0,0 +1 @@ +"""GraphRAG indexing sub-package.""" diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/graphrag/publish_output.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/graphrag/publish_output.py new file mode 100644 index 00000000000..72fc622ac16 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/graphrag/publish_output.py @@ -0,0 +1,173 @@ +"""Publish GraphRAG parquet outputs to blob storage + notify the bot. + +After ``run_graphrag_pipeline`` completes the parquets live in +``graphrag_config/output/``. This module uploads them to a versioned +prefix in blob storage (``/.parquet`` at the +container root) and then writes ``latest.json`` last, so a +partially-published snapshot is never picked up by the bot. + +After publishing it POSTs ``BOT_AGENT_RELOAD_URL`` with a shared-secret +header (``X-Admin-Token: BOT_AGENT_ADMIN_TOKEN``) so the live bot +swaps in the new build without restarting. The notify step is +best-effort — if the bot is unreachable we log a warning and let the +nightly process exit successfully (the bot will pick up the new build +on its next cold start or on the next reload). + +Publishing always runs after a successful indexing pass; the only +required knob is the destination container. 
def _build_snapshot_id() -> str:
    """Build the identifier used as the snapshot's blob prefix.

    The result is the current UTC time formatted as
    ``YYYY-MM-DDTHH-MM-SSZ`` followed by ``-`` and six hex characters taken
    from a fresh UUID4, so identifiers sort chronologically and two builds
    started within the same second still get distinct prefixes.
    """
    utc_now = _dt.datetime.now(_dt.timezone.utc)
    suffix = uuid.uuid4().hex[:6]
    return "-".join((utc_now.strftime("%Y-%m-%dT%H-%M-%SZ"), suffix))
def _upload_snapshot(
    container: str,
    snapshot_id: str,
    output_dir: Path,
    manifest: dict[str, Any],
) -> None:
    """Upload all parquets, then the manifest (manifest LAST for atomicity).

    Each file in ``_PARQUET_FILES`` is uploaded as ``<snapshot_id>/<name>``;
    afterwards ``manifest`` is serialised to ``latest.json`` at the
    container root.

    Args:
        container: Destination blob container name.
        snapshot_id: Prefix under which the parquet files are stored.
        output_dir: Local directory holding the GraphRAG parquet outputs.
        manifest: JSON-serialisable description of the snapshot.

    Raises:
        FileNotFoundError: If any expected parquet is missing from
            ``output_dir``. This is checked before anything is uploaded.
    """
    # Validate completeness up front: an incomplete build must not leave a
    # partially-uploaded, orphaned snapshot prefix behind in the container.
    missing = [
        name for name in _PARQUET_FILES if not (output_dir / name).is_file()
    ]
    if missing:
        raise FileNotFoundError(
            f"GraphRAG output is incomplete; missing parquets: {missing}"
        )

    storage = BlobService(container_name=container)

    for name in _PARQUET_FILES:
        src = output_dir / name
        blob_name = f"{snapshot_id}/{name}"
        logger.info("Uploading %s -> %s/%s", src, container, blob_name)
        storage.put_blob(blob_name, src.read_bytes())

    # Manifest written LAST — readers polling latest.json never see a
    # half-uploaded snapshot.
    logger.info("Publishing manifest %s/latest.json", container)
    storage.put_blob("latest.json", json.dumps(manifest, indent=2))
Python API directly. + +GraphRAG natively supports: +- Azure AI Search as vector store (configured in settings.yaml) +- Incremental indexing via `build_index(is_update_run=True)` +- Full indexing via `build_index(is_update_run=False)` + +This module orchestrates: +1. Downloading source documents to the GraphRAG input directory +2. Calling GraphRAG's Python API for indexing (no subprocess/CLI) + +GraphRAG handles all vector indexing (Azure AI Search) and graph +extraction internally — no custom Cosmos upload or embedding needed. +""" + +from __future__ import annotations + +import logging +import os +import shutil +from pathlib import Path + +from src.services.configuration_loader import ConfigurationLoader +from src.services.storage_service import BlobService + +from graphrag.api.index import build_index +from graphrag.config.enums import IndexingMethod +from graphrag.config.load_config import load_config + +logger = logging.getLogger(__name__) + +# The graphrag config lives in the graphrag_config/ directory at project root +GRAPHRAG_ROOT = Path(__file__).resolve().parent.parent.parent / "graphrag_config" +INPUT_DIR = GRAPHRAG_ROOT / "input" +OUTPUT_DIR = GRAPHRAG_ROOT / "output" +UPDATE_OUTPUT_DIR = GRAPHRAG_ROOT / "update_output" + + +def _resolve_source_prefixes() -> list[str]: + """Return the source-folder prefixes to index from knowledge-config.json. + + Mirrors how `daily_sync` derives blob paths: each `DocumentationSource.folder` + is the prefix used when uploading blobs (`source.folder/...`), so the full + rebuild downloads exactly the same set the doc sync produced. + + Returns folders in config order, de-duplicated. 
async def _run_incremental_indexing(
    changed_blob_paths: list[str],
    deleted_blob_paths: list[str],
    sources: list[str] | None = None,
) -> None:
    """Incremental indexing using GraphRAG's native update API.

    Steps:
    1. Fall back to a full index when ``OUTPUT_DIR`` holds no parquet yet
       (an update run needs a previous index to merge into).
    2. Remove deleted documents from the input directory.
    3. Download changed blobs into the input directory without clearing it.
    4. Run ``build_index`` in update mode and merge ``update_output`` back
       into ``output`` for the next run.

    Args:
        changed_blob_paths: Blob paths that changed in the current sync.
        deleted_blob_paths: Blob paths that were deleted in the current sync.
        sources: Source prefixes forwarded to the full-index fallback.
    """
    # Ensure we have an existing index output (required for update)
    if not OUTPUT_DIR.exists() or not any(OUTPUT_DIR.rglob("*.parquet")):
        logger.info("No existing index found — falling back to full indexing")
        await _run_full_indexing(sources)
        return

    # Remove deleted files from input directory
    if deleted_blob_paths:
        _remove_deleted_from_input(deleted_blob_paths)

    # Download changed blobs into the input directory (additive, not destructive)
    if changed_blob_paths:
        blob_service = BlobService()
        count = _download_blobs_additive(blob_service, changed_blob_paths)
        if count == 0:
            logger.warning("Could not download any changed blobs")
            # Deletions were already applied to input/ above and will not be
            # reported again by a later sync, so only skip the update when
            # there is truly nothing to index.
            if not deleted_blob_paths:
                return

    # Run GraphRAG update (native incremental via Python API)
    await _run_graphrag_build_index(is_update_run=True)

    # Merge update_output back into output for next run
    _merge_update_output()

    logger.info("Incremental GraphRAG update done")
async def _run_graphrag_build_index(is_update_run: bool = False) -> None:
    """Invoke GraphRAG's ``build_index`` and fail loudly on workflow errors.

    The configuration is loaded from ``GRAPHRAG_ROOT`` and the pipeline is
    run with the standard indexing method.

    Args:
        is_update_run: If True, runs incremental update. If False, full index.

    Raises:
        RuntimeError: If any workflow result carries an error; the message
            includes the first three failures.
    """
    if is_update_run:
        mode = "update"
    else:
        mode = "full index"
    logger.info("Starting GraphRAG %s via Python API...", mode)

    # Settings come from settings.yaml under the graphrag config root.
    settings = load_config(GRAPHRAG_ROOT)

    outcomes = await build_index(
        config=settings,
        method=IndexingMethod.Standard,
        is_update_run=is_update_run,
        verbose=True,
    )

    failures = [
        f"{outcome.workflow}: {outcome.error}"
        for outcome in outcomes
        if outcome.error is not None
    ]
    if not failures:
        logger.info(
            "GraphRAG %s completed: %d workflows succeeded",
            mode,
            len(outcomes),
        )
        return

    logger.error("GraphRAG %s had errors:\n%s", mode, "\n".join(failures))
    raise RuntimeError(
        f"GraphRAG {mode} failed with {len(failures)} workflow error(s): "
        + "; ".join(failures[:3])
    )
def _remove_deleted_from_input(deleted_blob_paths: list[str]) -> None:
    """Remove deleted documents from the input directory.

    Each blob path is resolved relative to ``INPUT_DIR``. Regular files are
    unlinked; paths that are missing or are not regular files are skipped.
    After a file is removed, every ancestor directory that became empty is
    pruned, stopping at ``INPUT_DIR`` itself or at the first directory that
    still has content.

    Args:
        deleted_blob_paths: Blob paths reported as deleted by the sync.
    """
    removed = 0
    for blob_path in deleted_blob_paths:
        local_path = INPUT_DIR / blob_path
        # is_file() rather than exists(): unlink() on a directory would raise
        # and abort the clean-up of the remaining paths.
        if not local_path.is_file():
            continue

        local_path.unlink()
        removed += 1

        # Prune the whole chain of now-empty directories, not just the
        # immediate parent. The containment check keeps the walk inside
        # INPUT_DIR.
        parent = local_path.parent
        while (
            parent != INPUT_DIR
            and INPUT_DIR in parent.parents
            and not any(parent.iterdir())
        ):
            parent.rmdir()
            parent = parent.parent

    logger.info(
        "Removed %d/%d deleted files from input/", removed, len(deleted_blob_paths)
    )
async def run(args: argparse.Namespace) -> None:
    """Run the knowledge sync pipeline.

    Steps:
    1. Load configuration and secrets into the environment.
    2. Run the document sync (skipped with ``--graphrag-only``) and collect
       the changed / deleted blob paths it reports.
    3. Run GraphRAG indexing (skipped with ``--skip-graphrag``).
    4. Publish the parquet outputs and notify the bot. A failure here is
       logged and does not fail the run.

    Args:
        args: Parsed CLI arguments; reads ``graphrag_only``,
            ``skip_graphrag``, ``sources`` and ``full_graphrag``.
    """
    from src.services.app_config import init_configuration
    from src.services.app_secret import init_secrets

    # Step 1: Initialize configuration and secrets
    logger.info("Initializing app configuration...")
    await init_configuration()
    logger.info("Initializing app secrets...")
    await init_secrets()

    # Step 2: Run main knowledge sync (unless --graphrag-only)
    changed_blob_paths: list[str] = []
    deleted_blob_paths: list[str] = []

    if not args.graphrag_only:
        from src.daily_sync import process_daily_sync_knowledge

        logger.info("Starting Azure SDK Knowledge Sync...")
        sync_result = await process_daily_sync_knowledge()
        changed_blob_paths = sync_result.changed_blob_paths
        deleted_blob_paths = sync_result.deleted_blob_paths
        logger.info(
            "Knowledge sync completed: %d changed, %d deleted",
            len(changed_blob_paths),
            len(deleted_blob_paths),
        )

    # Step 3: Run GraphRAG indexing (unless --skip-graphrag)
    if not args.skip_graphrag:
        from src.graphrag.run_indexing import OUTPUT_DIR, run_graphrag_pipeline

        # Drop blank entries ("a,,b", trailing comma, ","): an empty string
        # is not a meaningful source prefix. If nothing usable remains, fall
        # back to None so the sources are derived from the configuration.
        sources = None
        if args.sources:
            sources = [
                part.strip() for part in args.sources.split(",") if part.strip()
            ] or None

        logger.info("Starting GraphRAG indexing...")
        await run_graphrag_pipeline(
            sources=sources,
            full=args.full_graphrag,
            changed_blob_paths=changed_blob_paths,
            deleted_blob_paths=deleted_blob_paths,
        )
        logger.info("GraphRAG indexing completed successfully")

        # Step 4: Publish parquets to blob + notify bot agent. Runs after
        # every successful indexing pass; degrades to a logged no-op
        # when STORAGE_GRAPHRAG_OUTPUT_CONTAINER is unset. Failures here
        # don't fail the sync — the bot will pick up the new build on
        # next cold start.
        from src.graphrag.publish_output import publish_and_notify

        try:
            manifest = await publish_and_notify(OUTPUT_DIR)
            if manifest:
                logger.info(
                    "Published GraphRAG snapshot %s", manifest.get("build_id")
                )
        except Exception:
            logger.warning(
                "Failed to publish GraphRAG output to blob", exc_info=True
            )
async def init_configuration() -> None:
    """Initialize configuration from .env then Azure App Configuration.

    The local ``.env`` file is loaded first. When
    ``AZURE_APPCONFIG_ENDPOINT`` is set, every setting returned by App
    Configuration is copied into ``os.environ`` unless that variable already
    has a non-empty value, so local values take priority. An environment
    variable set to the empty string counts as unset and is overwritten.
    """
    _load_env_file()

    endpoint = os.environ.get("AZURE_APPCONFIG_ENDPOINT")
    if not endpoint:
        logger.warning("AZURE_APPCONFIG_ENDPOINT not set; skipping App Configuration")
        return

    from azure.appconfiguration.aio import AzureAppConfigurationClient
    from azure.identity.aio import (
        AzureCliCredential,
        ChainedTokenCredential,
        ManagedIdentityCredential,
        WorkloadIdentityCredential,
    )

    logger.info("Loading configuration from Azure App Configuration...")

    # WorkloadIdentityCredential validates its settings in the constructor
    # and raises ValueError when the workload-identity environment variables
    # are absent (e.g. on a developer machine). Skip it in that case so the
    # chain can still fall through to Azure CLI / managed identity.
    chain = []
    try:
        chain.append(WorkloadIdentityCredential())
    except ValueError as exc:
        logger.debug("WorkloadIdentityCredential unavailable: %s", exc)
    chain.append(AzureCliCredential())
    chain.append(ManagedIdentityCredential())
    credential = ChainedTokenCredential(*chain)

    client = AzureAppConfigurationClient(endpoint, credential=credential)

    try:
        async for setting in client.list_configuration_settings():
            if not setting.key or setting.value is None:
                continue
            if os.environ.get(setting.key):
                logger.debug("Skipping %s — already set", setting.key)
                continue
            os.environ[setting.key] = setting.value
            logger.info("Set %s from App Configuration", setting.key)
    finally:
        # Close the credential even if closing the client fails.
        try:
            await client.close()
        finally:
            await credential.close()

    logger.info("Successfully loaded configuration from Azure App Configuration")
async def init_secrets() -> None:
    """Load secrets from Azure Key Vault into environment variables.

    Does nothing (beyond a warning) when ``KEYVAULT_ENDPOINT`` is unset.
    Otherwise fetches each known secret in turn and, when it has a
    non-empty value, stores it under the matching environment variable,
    overwriting any existing value. An error raised while fetching a secret
    propagates to the caller after the client and credential are closed.
    """
    endpoint = os.environ.get("KEYVAULT_ENDPOINT")
    if not endpoint:
        logger.warning("KEYVAULT_ENDPOINT not set; skipping Key Vault secrets")
        return

    from azure.identity.aio import (
        AzureCliCredential,
        ChainedTokenCredential,
        ManagedIdentityCredential,
        WorkloadIdentityCredential,
    )
    from azure.keyvault.secrets.aio import SecretClient

    logger.info("Loading secrets from Azure Key Vault...")

    # Key Vault secret name -> environment variable that receives its value.
    secret_to_env = (
        ("AI-SEARCH-APIKEY", "AI_SEARCH_API_KEY"),
        ("AOAI-CHAT-COMPLETIONS-API-KEY", "AOAI_CHAT_COMPLETIONS_API_KEY"),
        ("SSH-PRIVATE-KEY", "SSH_PRIVATE_KEY"),
    )

    # WorkloadIdentityCredential validates its settings in the constructor
    # and raises ValueError when the workload-identity environment variables
    # are absent (e.g. on a developer machine). Skip it in that case so the
    # chain can still fall through to Azure CLI / managed identity.
    chain = []
    try:
        chain.append(WorkloadIdentityCredential())
    except ValueError as exc:
        logger.debug("WorkloadIdentityCredential unavailable: %s", exc)
    chain.append(AzureCliCredential())
    chain.append(ManagedIdentityCredential())
    credential = ChainedTokenCredential(*chain)

    client = SecretClient(vault_url=endpoint, credential=credential)

    try:
        for secret_name, env_var in secret_to_env:
            secret = await client.get_secret(secret_name)
            if secret.value:
                os.environ[env_var] = secret.value
                logger.info("Set %s from Key Vault", env_var)
    finally:
        # Close the credential even if closing the client fails.
        try:
            await client.close()
        finally:
            await credential.close()

    logger.info("Successfully loaded secrets from Azure Key Vault")
@dataclass
class Metadata:
    """Classification tags attached to a documentation path or override."""

    # Parsed from the "scope" key of a "metadata" object.
    scope: Literal["branded", "unbranded"] | None = None
    # Parsed from the "service_type" key of a "metadata" object.
    service_type: Literal["data-plane", "management-plane"] | None = None


@dataclass
class Override:
    """Per-file metadata override, parsed from a path's "overrides" list."""

    # Pattern the override applies to (the "pattern" key).
    pattern: str
    # Metadata values carried by this override (the "metadata" key).
    metadata: Metadata


@dataclass
class DocumentationPath:
    """One entry of a source's "paths" list."""

    # "name"; falls back to "folder", then "" when parsed.
    name: str
    # "description"; "" when absent.
    description: str
    # "path"; when None the repository root is used as the source path.
    path: str | None = None
    # "folder"; the flattened DocumentationSource uses `folder or name`.
    folder: str | None = None
    # "fileNameLowerCase"
    file_name_lower_case: bool = False
    # "ignoredPaths"
    ignored_paths: list[str] = field(default_factory=list)
    # "relativeByRepoPath"; when True the repository root is used as the
    # source path even if `path` is set.
    relative_by_repo_path: bool = False
    # "isGenerated"
    is_generated: bool = False
    # "metadata"; None when the key is absent.
    metadata: Metadata | None = None
    # "overrides"
    overrides: list[Override] = field(default_factory=list)


@dataclass
class Repository:
    """The "repository" object of a source entry."""

    url: str
    branch: str
    # "authType"; defaults to "public" when parsed.
    auth_type: Literal["public", "ssh", "token", "local"]
    # Optional "path"; when unset the repo path is derived from the URL.
    path: str | None = None
    # "sshHost"
    ssh_host: str | None = None
    # "tokenEnvVar": NAME of the environment variable holding the token.
    token_env_var: str | None = None
    # "localPathEnv": NAME of the environment variable holding a local path.
    local_path_env: str | None = None


@dataclass
class Source:
    """A repository together with the documentation paths taken from it."""

    repository: Repository
    paths: list[DocumentationPath]


@dataclass
class KnowledgeConfig:
    """Root of the parsed knowledge-config.json document."""

    # "version"; "1.0" when absent.
    version: str
    sources: list[Source]
    description: str | None = None
@classmethod
def get_documentation_sources(cls) -> list[DocumentationSource]:
    """Flatten the config into one ``DocumentationSource`` per documentation path.

    For each source, the repository directory is the configured
    ``repository.path`` or, failing that, the name derived from its URL.
    A path entry resolves to ``docs/<repo>`` when it is marked
    ``relative_by_repo_path`` or has no ``path``; otherwise to
    ``docs/<repo>/<path>``. The ``folder`` of the result falls back to the
    entry's ``name``.
    """
    flattened: list[DocumentationSource] = []

    for entry in cls.load_config().sources:
        repo = entry.repository
        repo_dir = repo.path or cls._get_repo_path_from_url(repo.url)

        for doc in entry.paths:
            use_repo_root = doc.relative_by_repo_path or doc.path is None
            if use_repo_root:
                resolved = f"docs/{repo_dir}"
            else:
                resolved = f"docs/{repo_dir}/{doc.path}"

            flattened.append(
                DocumentationSource(
                    path=resolved,
                    folder=doc.folder or doc.name,
                    file_name_lower_case=doc.file_name_lower_case,
                    ignored_paths=doc.ignored_paths,
                    is_generated=doc.is_generated,
                    metadata=doc.metadata,
                    overrides=doc.overrides,
                )
            )

    return flattened
config.sources: + repo = source.repository + repo_path = repo.path or cls._get_repo_path_from_url(repo.url) + sparse_checkout = cls._calculate_sparse_checkout(source.paths) + + repositories.append( + RepositoryConfig( + name=cls._get_repo_name_from_url(repo.url), + url=repo.url, + path=repo_path, + branch=repo.branch, + sparse_checkout=sparse_checkout or None, + auth_type=repo.auth_type, + ssh_host=repo.ssh_host, + token=( + os.environ.get(repo.token_env_var, "") + if repo.token_env_var + else None + ), + local_path=( + os.environ.get(repo.local_path_env, "") + if repo.local_path_env + else None + ), + ) + ) + + return repositories + + @classmethod + def reload_config(cls) -> KnowledgeConfig: + cls._config = None + return cls.load_config() + + @classmethod + def set_config_path(cls, path: Path) -> None: + cls._config_path = path + cls._config = None + + # --- Private helpers --- + + @classmethod + def _parse_config(cls, raw: dict) -> KnowledgeConfig: + sources = [] + for src in raw.get("sources", []): + repo_raw = src["repository"] + repository = Repository( + url=repo_raw["url"], + branch=repo_raw["branch"], + auth_type=repo_raw.get("authType", "public"), + path=repo_raw.get("path"), + ssh_host=repo_raw.get("sshHost"), + token_env_var=repo_raw.get("tokenEnvVar"), + local_path_env=repo_raw.get("localPathEnv"), + ) + + paths = [] + for p in src.get("paths", []): + metadata = None + if "metadata" in p: + metadata = Metadata( + scope=p["metadata"].get("scope"), + service_type=p["metadata"].get("service_type"), + ) + + overrides = [] + for o in p.get("overrides", []): + overrides.append( + Override( + pattern=o["pattern"], + metadata=Metadata( + scope=o["metadata"].get("scope"), + service_type=o["metadata"].get("service_type"), + ), + ) + ) + + paths.append( + DocumentationPath( + name=p.get("name", p.get("folder", "")), + description=p.get("description", ""), + path=p.get("path"), + folder=p.get("folder"), + file_name_lower_case=p.get("fileNameLowerCase", False), + 
ignored_paths=p.get("ignoredPaths", []), + relative_by_repo_path=p.get("relativeByRepoPath", False), + is_generated=p.get("isGenerated", False), + metadata=metadata, + overrides=overrides, + ) + ) + + sources.append(Source(repository=repository, paths=paths)) + + return KnowledgeConfig( + version=raw.get("version", "1.0"), + sources=sources, + description=raw.get("description"), + ) + + @staticmethod + def _get_repo_name_from_url(url: str) -> str: + patterns = [ + r"/([^/]+)\.git$", + r"/([^/]+)\.wiki\.git$", + r"/([^/]+)$", + r"_git/([^/]+)$", + ] + for pattern in patterns: + m = re.search(pattern, url) + if m: + return m.group(1) + segments = url.rstrip("/").split("/") + return segments[-1] if segments else "unknown-repo" + + @staticmethod + def _get_repo_path_from_url(url: str) -> str: + return ConfigurationLoader._get_repo_name_from_url(url) + + @staticmethod + def _calculate_sparse_checkout(paths: list[DocumentationPath]) -> list[str]: + return [p.path for p in paths if p.path] diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/metadata_resolver.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/metadata_resolver.py new file mode 100644 index 00000000000..9f904c5498d --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/metadata_resolver.py @@ -0,0 +1,82 @@ +"""Metadata resolver — resolves scope and service_type for documents. + +Uses glob pattern matching to apply file-level overrides on top of +path-level default metadata. 
class MetadataResolver:
    """Combines path-level default metadata with per-file overrides."""

    @staticmethod
    def resolve_metadata(
        relative_path: str,
        path_metadata: Metadata | None,
        overrides: list[Override] | None = None,
    ) -> Metadata | None:
        """Compute the effective metadata for one file.

        Args:
            relative_path: Relative path from source root
            path_metadata: Default metadata from DocumentationPath
            overrides: Per-file override rules

        Returns:
            A new Metadata holding the defaults with every matching
            override applied in order (the last matching rule wins per
            field; unset override fields leave the current value alone),
            or None when there is no path-level metadata.
        """
        if path_metadata is None:
            return None

        # Seed with the path-level defaults.
        scope = path_metadata.scope
        service_type = path_metadata.service_type

        for rule in overrides or ():
            if not MetadataResolver._match_pattern(relative_path, rule.pattern):
                continue
            scope = rule.metadata.scope or scope
            service_type = rule.metadata.service_type or service_type

        return Metadata(scope=scope, service_type=service_type)

    @staticmethod
    def validate_metadata(metadata: Metadata) -> bool:
        """Return True when ``metadata`` holds an allowed scope / service type.

        The scope must be ``branded`` or ``unbranded``; a service type, when
        present, must be ``data-plane`` or ``management-plane``. A service
        type on unbranded content is still valid but is logged as ignored.
        """
        allowed_scopes = ("branded", "unbranded")
        allowed_service_types = ("data-plane", "management-plane")

        if metadata.scope not in allowed_scopes:
            return False

        has_service_type = bool(metadata.service_type)
        if has_service_type and metadata.service_type not in allowed_service_types:
            return False

        if has_service_type and metadata.scope == "unbranded":
            logger.warning("service_type is set for unbranded content and will be ignored")
        return True

    @staticmethod
    def _match_pattern(file_path: str, pattern: str) -> bool:
        """Match a file path against a glob pattern.

        Backslashes in both the path and the pattern are converted to
        forward slashes before matching.
        """
        match_flags = glob.GLOBSTAR | glob.DOTGLOB | glob.IGNORECASE
        return glob.globmatch(
            file_path.replace("\\", "/"),
            pattern.replace("\\", "/"),
            flags=match_flags,
        )
@classmethod
async def _convert_cases_to_markdown(cls, root: str, target_root: str) -> None:
    """Convert every ``main.tsp`` spec found under *root* into markdown.

    Args:
        root: Directory that is walked for ``.tsp`` files.
        target_root: Directory that receives the generated markdown,
            mirroring the layout of *root*.

    At most ``MAX_CONCURRENT`` specs are processed at the same time. A
    missing *root* is logged and treated as "nothing to do".
    """
    if not os.path.isdir(root):
        logger.error("Spector specs directory not found: %s", root)
        return

    specs, paths = cls._get_specs(root)
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)

    async def process(spec: str, spec_path: str) -> None:
        async with semaphore:
            await cls._process_spec_file(spec, spec_path, root, target_root)

    # Only the entry-point file of each spec directory is converted.
    main_specs = [
        (spec, spec_path)
        for spec, spec_path in zip(specs, paths)
        if os.path.basename(spec_path).lower() == "main.tsp"
    ]

    outcomes = await asyncio.gather(
        *(process(spec, spec_path) for spec, spec_path in main_specs),
        return_exceptions=True,
    )

    # return_exceptions=True hands failures back as values; report them
    # rather than discarding the result list.
    for (_, spec_path), outcome in zip(main_specs, outcomes):
        if isinstance(outcome, BaseException):
            logger.error("Unhandled error while processing %s: %s", spec_path, outcome)
@classmethod
async def _create_markdown_doc(
    cls,
    scenarios: list[str],
    main_spec: str,
    client_tsp: str | None = None,
) -> str:
    """Render the markdown document for one spec.

    Args:
        scenarios: Raw scenario blocks extracted from ``main.tsp``.
        main_spec: Full text of ``main.tsp``.
        client_tsp: Full text of ``client.tsp`` when the spec has one.

    Returns:
        Markdown with one section per scenario followed by the full sample.

    The title and per-scenario heading/description come from
    ``_analyze_scenarios``. Entries that are missing or malformed fall back
    to ``Scenario N`` with an empty description instead of aborting the
    whole document.
    """
    combined = main_spec
    if client_tsp:
        combined = (
            f"// === MAIN SPEC (main.tsp) ===\n{main_spec}\n\n"
            f"// === CLIENT CUSTOMIZATION (client.tsp) ===\n{client_tsp}"
        )

    analysis = await cls._analyze_scenarios(scenarios, combined)
    # The model is asked for a list, but its output is not schema-checked.
    analysed = analysis.scenarios if isinstance(analysis.scenarios, list) else []

    parts: list[str] = [f"# Usages for {analysis.title}\n\n"]
    for i, scenario in enumerate(scenarios):
        entry = analysed[i] if i < len(analysed) else None
        if not isinstance(entry, dict):
            entry = {}
        heading = entry.get("heading") or f"Scenario {i + 1}"
        description = entry.get("description") or ""
        if description == heading:
            # Avoid repeating the heading as body text.
            description = ""
        cleaned = cls._remove_spector_content(scenario)
        parts.append(
            f"## Scenario: {heading}\n"
            f"{description}\n"
            f"``` typespec\n{cleaned}\n```\n\n"
        )

    parts.append("## Full Sample: \n// main.tsp\n``` typespec\n")
    parts.append(cls._remove_spector_content(main_spec) + "\n```\n")

    if client_tsp:
        parts.append("// client.tsp\n``` typespec\n")
        parts.append(cls._remove_spector_content(client_tsp) + "\n```\n")

    return "".join(parts)
+ +FULL SPEC CONTENT: +{spec} + +SCENARIOS: +{scenarios_text} + +Please provide a JSON response with the following structure: +{{ + "title": "A concise title from @scenarioService or @doc (one line only)", + "scenarios": [ + {{ + "heading": "Title for scenario from @scenarioDoc or @doc (one line)", + "description": "Description from @scenarioDoc or @doc (exclude 'expected' test results)" + }} + ] +}} + +Requirements: +- Extract title from @scenarioService or @doc closest to @scenarioService only +- Headings should be one line suitable for markdown headers +- Descriptions should exclude 'expected' test results +- If description is same as heading, make description empty string +- Provide exactly {len(scenarios)} scenario objects +- Return only valid JSON""" + + response = await cls._get_chat_completion(prompt) + # Parse JSON response + clean = response.strip() + if clean.startswith("```json"): + clean = clean.removeprefix("```json").removesuffix("```").strip() + elif clean.startswith("```"): + clean = clean.removeprefix("```").removesuffix("```").strip() + + import json + + data = json.loads(clean) + return AnalysisResult( + title=data.get("title", "Unknown"), + scenarios=data.get("scenarios", []), + ) + + @classmethod + async def _get_chat_completion(cls, question: str) -> str: + """Get chat completion with retry logic.""" + if not cls._client: + raise RuntimeError("OpenAI client not initialized") + + deployment = os.environ.get("AOAI_CHAT_REASONING_MODEL", "") + + for attempt in range(MAX_RETRIES + 1): + try: + response = await cls._client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a TypeSpec expert. 
Extract structured information and return only valid JSON.", + }, + {"role": "user", "content": question}, + ], + model=deployment, + ) + if response.choices and response.choices[0].message.content: + return response.choices[0].message.content + break + except Exception as e: + err_str = str(e) + if ("429" in err_str or "Too Many Requests" in err_str) and attempt < MAX_RETRIES: + delay = min(BASE_DELAY * (2**attempt), MAX_DELAY) + logger.warning("Rate limit (attempt %d), retrying in %.1fs...", attempt + 1, delay) + await asyncio.sleep(delay) + continue + raise + + raise RuntimeError(f"Failed to get response after {MAX_RETRIES + 1} attempts") + + # --- Text processing utilities --- + + @classmethod + def _get_specs(cls, root: str) -> tuple[list[str], list[str]]: + """Walk directory tree and collect .tsp file contents and paths.""" + specs: list[str] = [] + paths: list[str] = [] + + for dirpath, _, filenames in os.walk(root): + for fname in filenames: + if not fname.endswith(".tsp"): + continue + full_path = os.path.join(dirpath, fname) + if any(ignored in full_path for ignored in IGNORED_SPECS): + continue + try: + with open(full_path, encoding="utf-8") as f: + specs.append(f.read()) + paths.append(full_path) + except OSError as e: + logger.warning("Failed to read %s: %s", full_path, e) + + return specs, paths + + @classmethod + def _get_scenarios(cls, search_str: str, spec: str) -> list[str]: + """Extract scenario blocks from spec content.""" + indexes = cls._find_indexes(search_str, spec) + scenarios = [] + for i, start in enumerate(indexes): + end = indexes[i + 1] if i + 1 < len(indexes) else len(spec) + scenarios.append(spec[start:end]) + return scenarios + + @classmethod + def _find_indexes(cls, search_str: str, spec: str) -> list[int]: + """Find starting indexes of each scenario block.""" + indexes: list[int] = [] + start = 0 + while True: + pos = spec.find(search_str, start) + if pos == -1: + break + # Walk back to find preceding blank line + block_start = 
pos + for ind in range(pos - 1, 0, -1): + if spec[ind] == "\n" and ind > 0 and spec[ind - 1] == "\n": + block_start = ind + 1 + break + indexes.append(block_start) + start = pos + len(search_str) + return indexes + + @classmethod + def _remove_spector_content(cls, content: str) -> str: + """Remove spector annotations from TypeSpec content.""" + result = content + # @scenarioDoc patterns + result = re.sub(r'@scenarioDoc\("[\s\S]*?"\)\n', "", result) + result = re.sub(r'@scenarioDoc\("""[\s\S]*?"""\)\n', "", result) + # @scenarioService patterns + result = re.sub(r'@scenarioService\("[\s\S]*?"\)\n', "", result) + result = re.sub(r"@scenarioService\(\n[\s\S]*?\n\)\n", "", result) + # Other + result = result.replace("@scenario", "") + result = re.sub(r'import "@typespec/spector";\n', "", result) + result = re.sub(r"using Spector;\n", "", result) + # Remove #suppress and missing-scenario lines + lines = result.split("\n") + lines = [l for l in lines if "#suppress " not in l and "missing-scenario" not in l] + return "\n".join(lines) diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/storage_service.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/storage_service.py new file mode 100644 index 00000000000..02b26e26843 --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/storage_service.py @@ -0,0 +1,248 @@ +"""Azure Blob Storage service for knowledge documents. + +Handles blob upload, listing, deletion (soft-delete via metadata), +content change detection via MD5 hashing, and metadata comparison. 
+""" + +from __future__ import annotations + +import hashlib +import logging +import os +from base64 import b64encode +from pathlib import Path +from typing import Any + +from azure.identity import DefaultAzureCredential +from azure.storage.blob import BlobServiceClient, ContainerClient, ContentSettings + +logger = logging.getLogger(__name__) + + +class BlobService: + """Azure Blob Storage operations against a single container. + + By default the container name comes from + ``STORAGE_KNOWLEDGE_CONTAINER`` so existing callers keep working + unchanged. Pass ``container_name`` explicitly when you need to talk + to a different container (e.g. the GraphRAG output container). + """ + + def __init__(self, container_name: str | None = None) -> None: + account_name = os.environ.get("STORAGE_ACCOUNT_NAME") + resolved_container = container_name or os.environ.get("STORAGE_KNOWLEDGE_CONTAINER") + if not account_name: + raise RuntimeError("STORAGE_ACCOUNT_NAME environment variable is required") + if not resolved_container: + raise RuntimeError( + "Container name not provided and STORAGE_KNOWLEDGE_CONTAINER is not set" + ) + + credential = DefaultAzureCredential() + account_url = f"https://{account_name}.blob.core.windows.net" + self._service_client = BlobServiceClient(account_url, credential=credential) + self._container_client: ContainerClient = self._service_client.get_container_client( + resolved_container + ) + self._container_name = resolved_container + + @property + def container_name(self) -> str: + return self._container_name + + def put_blob( + self, + blob_path: str, + content: str | bytes, + metadata: dict[str, str] | None = None, + ) -> None: + """Upload content to blob storage. + + The blob's Content-MD5 header is set explicitly so that + :meth:`has_content_changed` can later compare hashes via the + ``content_settings.content_md5`` field returned by + ``list_blobs``. 
def list_blobs(self, prefix: str | None = None) -> dict[str, Any]:
    """Return every blob in the container keyed by blob name.

    Args:
        prefix: When given, only blobs whose name starts with it are listed.

    Returns:
        Mapping of blob name to the properties object yielded by the
        container client (metadata is requested in the listing).
    """
    options: dict[str, Any] = {"include": ["metadata"]}
    if prefix:
        options["name_starts_with"] = prefix

    found: dict[str, Any] = {
        item.name: item for item in self._container_client.list_blobs(**options)
    }
    logger.info("Listed %d blobs", len(found))
    return found
def download_all_blobs_to_dir(
    self, target_dir: Path, source_prefixes: list[str] | None = None
) -> int:
    """Mirror the container (or selected top-level prefixes) into a directory.

    Args:
        target_dir: Destination directory; it is deleted and recreated
            before anything is written.
        source_prefixes: When non-empty, only blobs whose name starts with
            ``"<prefix>/"`` for one of these values are downloaded.

    Returns:
        Number of blobs written successfully. A blob that fails to
        download is logged and skipped.
    """
    import shutil

    if target_dir.exists():
        shutil.rmtree(target_dir)
    target_dir.mkdir(parents=True)

    wanted = tuple(f"{p}/" for p in source_prefixes) if source_prefixes else ()

    downloaded = 0
    for entry in self._container_client.list_blobs():
        blob_name: str = entry.name
        if wanted and not blob_name.startswith(wanted):
            continue
        try:
            payload = self._container_client.get_blob_client(blob_name).download_blob().readall()
            destination = target_dir / blob_name
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_bytes(payload)
        except Exception as e:
            logger.warning("Failed to download blob %s: %s", blob_name, e)
        else:
            downloaded += 1

    logger.info("Downloaded %d blobs to %s", downloaded, target_dir)
    return downloaded
existing.metadata.get("IsDeleted") == "true": + return True + + # The azure-storage-blob SDK returns BlobProperties objects from + # list_blobs() with the MD5 at .content_settings.content_md5 as a + # raw bytearray — NOT under a .properties dict like the JS SDK. + content_settings = getattr(existing, "content_settings", None) + existing_md5 = getattr(content_settings, "content_md5", None) if content_settings else None + if not existing_md5: + return True + + # Azure returns content_md5 as bytearray; convert to base64 for comparison + if isinstance(existing_md5, (bytes, bytearray)): + existing_md5_b64 = b64encode(existing_md5).decode() + else: + existing_md5_b64 = str(existing_md5) + + return existing_md5_b64 != current_md5 + + def has_metadata_changed( + self, + blob_path: str, + current_metadata: dict[str, str] | None, + existing_blobs: dict[str, Any], + ) -> bool: + """Check if blob metadata has changed.""" + existing = existing_blobs.get(blob_path) + + if existing is None: + return current_metadata is not None + + existing_meta = existing.metadata or {} + + if not current_metadata: + return bool(existing_meta.get("scope") or existing_meta.get("service_type")) + + if current_metadata.get("scope") != existing_meta.get("scope"): + return True + if current_metadata.get("service_type") != existing_meta.get("service_type"): + return True + + return False + + # --- Private helpers --- + + @staticmethod + def _calculate_md5(content: str | bytes) -> str: + """Calculate MD5 hash as base64 (matching Azure's contentMD5 format).""" + data = content.encode("utf-8") if isinstance(content, str) else content + return b64encode(hashlib.md5(data).digest()).decode() + + @staticmethod + def _get_content_type(filename: str) -> str: + ext = filename.rsplit(".", 1)[-1].lower() if "." 
in filename else "" + return { + "md": "text/markdown", + "mdx": "text/markdown", + "txt": "text/plain", + "json": "application/json", + "html": "text/html", + }.get(ext, "application/octet-stream") diff --git a/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/typespec_processor.py b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/typespec_processor.py new file mode 100644 index 00000000000..a40ef27be1e --- /dev/null +++ b/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync/src/services/typespec_processor.py @@ -0,0 +1,486 @@ +"""TypeSpec library processor — converts .tsp files to structured markdown. + +Parses TypeSpec definitions (models, operations, interfaces, enums, etc.) +and generates markdown documentation with code blocks for each definition. +""" + +from __future__ import annotations + +import logging +import os +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +logger = logging.getLogger(__name__) + +DefinitionType = Literal[ + "model", "operation", "interface", "enum", "union", + "alias", "namespace", "scalar", "decorator", +] + + +@dataclass +class TypeSpecDefinition: + type: DefinitionType + name: str + full_name: str + code: str + decorators: list[str] = field(default_factory=list) + description: str = "" + comments: list[str] = field(default_factory=list) + level: int = 1 + children: list[TypeSpecDefinition] | None = None + + +class TypeSpecProcessor: + """Converts TypeSpec library .tsp files into markdown documentation.""" + + def __init__(self, work_dir: str, relative_lib_dir: str) -> None: + self._work_dir = work_dir + self._relative_lib_dir = relative_lib_dir + self._src_dir = os.path.join(work_dir, relative_lib_dir) + self._dest_dir = os.path.join(work_dir, relative_lib_dir, "generated") + + def process_typespec_libraries(self) -> None: + """Find all .tsp files and convert them to markdown.""" + if not os.path.isdir(self._src_dir): + raise 
RuntimeError(f"TypeSpec library directory not found: {self._src_dir}") + + os.makedirs(self._dest_dir, exist_ok=True) + + # Collect all .tsp files + tsp_files: list[str] = [] + for dirpath, _, filenames in os.walk(self._src_dir): + for fname in filenames: + if fname.endswith(".tsp"): + tsp_files.append(os.path.join(dirpath, fname)) + + logger.info("Found %d .tsp files in %s", len(tsp_files), self._src_dir) + + for tsp_file in tsp_files: + relative = os.path.relpath(tsp_file, self._src_dir) + safe_name = relative.replace(os.sep, "#").replace("/", "#") + md_file = os.path.join(self._dest_dir, safe_name.replace(".tsp", ".md")) + self._convert_to_markdown(tsp_file, md_file) + + def _convert_to_markdown(self, tsp_file: str, md_file: str) -> None: + """Convert a single TypeSpec file to markdown.""" + content = Path(tsp_file).read_text(encoding="utf-8") + definitions = self._parse_definitions(content) + relative = os.path.relpath(tsp_file, self._work_dir) + markdown = self._generate_markdown(definitions) + Path(md_file).write_text(markdown, encoding="utf-8") + + # --- Parser --- + + def _parse_definitions(self, content: str) -> list[TypeSpecDefinition]: + lines = content.split("\n") + return self._parse_definitions_from_lines(lines) + + def _parse_definitions_from_lines( + self, lines: list[str], level: int = 1 + ) -> list[TypeSpecDefinition]: + definitions: list[TypeSpecDefinition] = [] + current_def_start = -1 + current_body_start = -1 + current_type: DefinitionType | None = None + current_name = "" + current_level = level + brace_count = 0 + has_global_namespace = False + + for i, line in enumerate(lines): + trimmed = line.strip() + match = self._match_definition_start(trimmed) + + if match and brace_count == 0: + brace_count = trimmed.count("{") - trimmed.count("}") + + if match["type"] == "namespace" and trimmed.endswith(";"): + has_global_namespace = True + current_level += 1 + + if current_def_start == -1: + # First definition — find its preamble start + 
def _build_definition(
    self,
    def_type: DefinitionType,
    def_name: str,
    lines: list[str],
    def_start: int,
    body_start: int,
    next_body_start: int,
    level: int,
) -> TypeSpecDefinition:
    """Build a TypeSpecDefinition from line ranges.

    Args:
        def_type: Kind of definition that was matched on ``body_start``.
        def_name: Name captured from the declaration line.
        lines: All lines currently being parsed.
        def_start: Index of the first preamble line (comments/decorators).
        body_start: Index of the declaration line itself.
        next_body_start: Index of the next declaration line, or
            ``len(lines)`` for the last definition.
        level: Markdown heading level to record on the result.

    Returns:
        The definition, with ``code`` covering ``def_start`` through the
        detected end line inclusive and ``children`` set to ``None`` when
        no children were parsed.
    """
    # Find the actual end of this definition (before next definition's comments)
    def_end = self._find_definition_end(lines, def_start, next_body_start)

    # Extract comments and decorators from preamble
    comments: list[str] = []
    decorators: list[str] = []
    in_block_comment = False
    in_decorator = False
    paren_count = 0

    # Only the preamble (everything before the declaration line) is scanned.
    for n in range(def_start, body_start):
        trimmed = lines[n].strip()
        # Opening of a /** ... */ comment; it may also close on the same line.
        if trimmed.startswith("/**") and not in_block_comment:
            comments.append(trimmed)
            if not trimmed.endswith("*/"):
                in_block_comment = True
            continue
        # Continuation lines of an open block comment.
        if in_block_comment:
            comments.append(trimmed)
            if trimmed.endswith("*/"):
                in_block_comment = False
            continue
        if trimmed.startswith("//"):
            comments.append(trimmed)
            continue
        # First line of a decorator. It is complete right away when it has
        # no "(" at all, or when it ends with ")" and its parens balance.
        if trimmed.startswith("@") and not in_decorator:
            paren_count = trimmed.count("(") - trimmed.count(")")
            decorators.append(trimmed)
            in_decorator = True
            if trimmed.count("(") == 0 or (trimmed.endswith(")") and paren_count == 0):
                in_decorator = False
            continue
        # Continuation of a multi-line decorator: append to the last entry
        # until the parentheses balance on a line ending with ")".
        if in_decorator:
            decorators[-1] += "\n" + trimmed
            paren_count += trimmed.count("(") - trimmed.count(")")
            if trimmed.endswith(")") and paren_count == 0:
                in_decorator = False

    # Parse children for namespaces and interfaces
    children: list[TypeSpecDefinition] = []
    if def_type == "namespace":
        # Namespace bodies are re-parsed as definitions one level deeper,
        # excluding the declaration line and the end line.
        children = self._parse_definitions_from_lines(
            lines[body_start + 1: def_end], level + 1
        )
    elif def_type == "interface":
        children = self._parse_interface_operations(
            lines, body_start, def_end, level + 1
        )

    description = self._extract_description(decorators, comments)

    return TypeSpecDefinition(
        type=def_type,
        name=def_name,
        full_name=def_name,
        code="\n".join(lines[def_start: def_end + 1]),
        decorators=decorators,
        description=description,
        comments=comments,
        level=level,
        # An empty child list is stored as None.
        children=children or None,
    )
current_comments.append(trimmed) + if trimmed.endswith("*/"): + in_block_comment = False + continue + if trimmed.startswith("//"): + current_comments.append(trimmed) + continue + + # Decorators + if trimmed.startswith("@") and not in_operation: + in_decorator = True + current_decorators.append(trimmed) + paren_count = trimmed.count("(") - trimmed.count(")") + if paren_count <= 0: + in_decorator = False + continue + if in_decorator: + current_decorators[-1] += "\n" + trimmed + paren_count += trimmed.count("(") - trimmed.count(")") + if paren_count <= 0: + in_decorator = False + continue + + # Operation detection + op_match = ( + re.match(r"^op\s+(\w+)", trimmed) + or re.match(r"^(\w+)\s+is\s+", trimmed) + or re.match(r"^(\w+)\s*<", trimmed) + or re.match(r"^(\w+)\s*\(", trimmed) + ) + + if op_match and not in_operation: + in_operation = True + op_lines = [line] + brace_count = trimmed.count("{") - trimmed.count("}") + angle_count = trimmed.count("<") - trimmed.count(">") + paren_count = trimmed.count("(") - trimmed.count(")") + + if trimmed.endswith(";") and brace_count <= 0 and angle_count <= 0 and paren_count <= 0: + op = self._build_operation( + op_match.group(1), op_lines, current_decorators, current_comments, level + ) + operations.append(op) + current_comments, current_decorators, op_lines = [], [], [] + in_operation = False + brace_count = angle_count = paren_count = 0 + continue + + if in_operation: + op_lines.append(line) + brace_count += trimmed.count("{") - trimmed.count("}") + angle_count += trimmed.count("<") - trimmed.count(">") + paren_count += trimmed.count("(") - trimmed.count(")") + + if trimmed.endswith(";") and brace_count <= 0 and angle_count <= 0 and paren_count <= 0: + name = self._extract_op_name(op_lines[0]) + op = self._build_operation( + name, op_lines, current_decorators, current_comments, level + ) + operations.append(op) + current_comments, current_decorators, op_lines = [], [], [] + in_operation = False + brace_count = angle_count = 
paren_count = 0 + + return operations + + def _build_operation( + self, + name: str, + op_lines: list[str], + decorators: list[str], + comments: list[str], + level: int, + ) -> TypeSpecDefinition: + code = "\n".join([*comments, *decorators, *op_lines]) + description = self._extract_description(decorators, comments) + return TypeSpecDefinition( + type="operation", + name=name, + full_name=name, + code=code, + decorators=decorators, + description=description, + comments=comments, + level=level, + ) + + @staticmethod + def _extract_op_name(line: str) -> str: + trimmed = line.strip() + for pattern in [r"^op\s+(\w+)", r"^(\w+)\s+is\s+", r"^(\w+)\s*<", r"^(\w+)\s*\("]: + m = re.match(pattern, trimmed) + if m: + return m.group(1) + return "unknown" + + # --- Markdown generation --- + + def _generate_markdown(self, definitions: list[TypeSpecDefinition]) -> str: + lines: list[str] = [] + for defn in definitions: + self._emit_definition(defn, lines) + if defn.children: + for child in defn.children: + self._emit_definition(child, lines) + return "\n".join(lines) + + @staticmethod + def _emit_definition(defn: TypeSpecDefinition, output: list[str]) -> None: + header = "#" * defn.level + output.append(f"{header} {defn.full_name}") + output.append("") + output.append(f"**Type:** {defn.type.capitalize()}") + output.append("") + if defn.description: + output.append(defn.description) + output.append("") + output.append("```typespec") + output.append(defn.code) + output.append("```") + output.append("") + output.append("") + + # --- Helpers --- + + @staticmethod + def _match_definition_start(line: str) -> dict[str, str] | None: + patterns: list[tuple[str, DefinitionType]] = [ + (r"^model\s+(\w+)", "model"), + (r"^op\s+(\w+)", "operation"), + (r"^interface\s+(\w+)", "interface"), + (r"^enum\s+(\w+)", "enum"), + (r"^union\s+(\w+)", "union"), + (r"^alias\s+(\w+)", "alias"), + (r"^namespace\s+([\w.]+)", "namespace"), + (r"^scalar\s+(\w+)", "scalar"), + (r"^(?:extern\s+)?dec\s+(\w+)", 
"decorator"), + ] + for pattern, def_type in patterns: + m = re.match(pattern, line) + if m: + return {"type": def_type, "name": m.group(1)} + return None + + @staticmethod + def _find_preamble_start(lines: list[str], current: int) -> int: + """Walk backward from current to find where comments/decorators start.""" + in_comment = False + for i in range(current - 1, -1, -1): + trimmed = lines[i].strip() + if trimmed.endswith("*/"): + in_comment = True + if trimmed.startswith("/*"): + in_comment = False + is_comment = in_comment or trimmed.startswith("//") + if not is_comment and (trimmed.endswith(";") or trimmed.endswith("}")): + return i + 1 + return 0 + + @staticmethod + def _find_preamble_start_between(lines: list[str], start: int, current: int) -> int: + """Find start of next definition's preamble looking backward from current.""" + in_comment = False + for i in range(current - 1, start, -1): + trimmed = lines[i].strip() + if trimmed.endswith("*/"): + in_comment = True + if trimmed.startswith("/*"): + in_comment = False + is_comment = in_comment or trimmed.startswith("//") + if not is_comment and (trimmed.endswith(";") or trimmed.endswith("}")): + return i + 1 + return start + + @staticmethod + def _find_definition_end(lines: list[str], start: int, next_start: int) -> int: + """Find end of current definition (before next definition's comments).""" + in_comment = False + for i in range(next_start - 1, start, -1): + trimmed = lines[i].strip() + if trimmed.endswith("*/"): + in_comment = True + if trimmed.startswith("/*"): + in_comment = False + is_comment = in_comment or trimmed.startswith("//") + if not is_comment and (trimmed.endswith(";") or trimmed.endswith("}")): + return i + return min(next_start - 1, len(lines) - 1) + + @staticmethod + def _extract_description(decorators: list[str], comments: list[str]) -> str: + """Extract description from @doc decorator or JSDoc comments.""" + # Try @doc decorator + for dec in decorators: + m = 
@staticmethod
def _parse_jsdoc(comments: list[str]) -> str:
    """Parse JSDoc comments, excluding @param and other tags.

    Each line is stripped, its comment markers (``/**``, ``*/``, a leading
    ``*`` or ``//``) are removed, and the remaining text is collected.
    A line starting with ``@`` opens a tag section; following lines are
    skipped until a blank line or a line starting with an ASCII capital
    letter, which is treated as description text again.

    Args:
        comments: Raw comment lines, with or without indentation.

    Returns:
        The description lines joined with single spaces, or ``""`` when
        there is no description text.
    """
    desc_lines: list[str] = []
    in_tag = False

    for raw in comments:
        # Strip first: the marker patterns are anchored at the line start/end,
        # so indentation or trailing spaces would otherwise keep " * text"
        # and "*/  " from matching.
        text = raw.strip()
        text = re.sub(r"^/\*\*?\s*", "", text)
        text = re.sub(r"\s*\*/$", "", text)
        text = re.sub(r"^\*\s?", "", text)
        text = re.sub(r"^//\s?", "", text)
        text = text.strip()

        # Every tag (@param, @returns, ...) starts with "@", so one check
        # covers them all.
        if text.startswith("@"):
            in_tag = True
            continue
        if in_tag:
            if text and not re.match(r"^[A-Z]", text):
                # Continuation of the tag's own text.
                continue
            in_tag = False
        if text:
            desc_lines.append(text)

    return " ".join(desc_lines).strip()
schedules:
- cron: "0 3 * * *"  # Daily at 3 AM UTC (1h after the doc sync at 02:00)
  displayName: Daily knowledge graph sync
  branches:
    include:
    - main
  # Scheduled runs are skipped by default when the branch has no new commits
  # since the last successful scheduled run. The documentation being synced
  # comes from other repositories (see `resources.repositories`), so the
  # graph must be refreshed even when this repo's `main` is unchanged.
  always: true
+ psscriptanalyzer: + compiled: true + break: true + policy: M365 + sourceRepositoriesToScan: + exclude: + - repository: azure-sdk-docs-eng.ms + - repository: internal-wiki + stages: + - stage: SyncKnowledgeGraph + displayName: 'Sync Knowledge Graph' + jobs: + - job: BuildAndSync + displayName: 'Build and Execute Knowledge Graph Sync' + # GraphRAG indexing can take significantly longer than a plain + # doc sync — bump the per-job timeout well above the default 60m. + timeoutInMinutes: 240 + + variables: + - group: Release Secrets for GitHub + - template: /eng/pipelines/templates/variables/image.yml@self + - template: /eng/pipelines/templates/variables/globals.yml@self + - name: workingDirectory + value: '$(Agent.BuildDirectory)/s/azure-sdk-tools/tools/sdk-ai-bots/azure-sdk-qa-bot-knowledge-graph-sync' + + pool: + name: $(LINUXPOOL) + image: $(LINUXVMIMAGE) + os: linux + + steps: + - checkout: self + displayName: 'Checkout current repository' + + - checkout: azure-sdk-docs-eng.ms + displayName: 'Checkout azure-sdk-docs-eng.ms repository' + + - checkout: internal-wiki + displayName: 'Checkout internal wiki repository' + + - template: /eng/common/pipelines/templates/steps/login-to-github.yml@self + parameters: + ScriptDirectory: $(Agent.BuildDirectory)/s/azure-sdk-tools/eng/common/scripts + + - task: AzureCLI@2 + displayName: 'Resolve federated credentials' + inputs: + azureSubscription: $(SERVICE_CONNECTION) + addSpnToEnvironment: true + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + echo "##vso[task.setvariable variable=AZURE_CLIENT_ID]$servicePrincipalId" + echo "##vso[task.setvariable variable=AZURE_TENANT_ID]$tenantId" + echo "##vso[task.setvariable variable=AZURE_SUBSCRIPTION_ID]$(az account show --query id -o tsv)" + echo "$idToken" > "$(Agent.BuildDirectory)/id_token" + echo "##vso[task.setvariable variable=AZURE_FEDERATED_TOKEN_FILE]$(Agent.BuildDirectory)/id_token" + + - task: UsePythonVersion@0 + displayName: 'Use Python 3.12' + inputs: 
+ versionSpec: '3.12' + + - script: | + python --version + pip --version + displayName: 'Show Python version' + + # 1ES agents don't have outbound access to public PyPI; route + # pip through the internal mirror feed. PipAuthenticate sets + # PIP_INDEX_URL for the rest of the job. + - task: PipAuthenticate@1 + displayName: 'Pip Authenticate to internal feed' + inputs: + artifactFeeds: 'public/azure-sdk-for-python' + onlyAddExtraIndex: false + + - script: | + pip install -e . + displayName: 'Install Python dependencies' + workingDirectory: $(workingDirectory) + + - script: | + ARGS="" + if [ "$FULL_GRAPHRAG" = "True" ] || [ "$FULL_GRAPHRAG" = "true" ]; then + ARGS="--full-graphrag" + fi + echo "Running: sync-knowledge-graph $ARGS" + sync-knowledge-graph $ARGS + displayName: 'Execute knowledge graph sync' + workingDirectory: $(workingDirectory) + env: + # Pipeline parameter surfaced as env var for the bash wrapper. + FULL_GRAPHRAG: ${{ parameters.fullGraphrag }} + + # Azure App Configuration — everything else is sourced from here. + AZURE_APPCONFIG_ENDPOINT: $(AZURE_APPCONFIG_ENDPOINT) + + # Local clones of the internal repos (checked out above). + # The sync project's knowledge-config.json references these + # via `localPathEnv`. + AZURE_SDK_DOCS_PATH: "$(Agent.BuildDirectory)/s/azure-sdk-docs-eng.ms" + AZURE_SDK_WIKI_PATH: "$(Agent.BuildDirectory)/s/internal.wiki" + + # GitHub App token for private repo access. + AZURE_SDK_GITHUB_PAT: $(GH_TOKEN) + + # Bot reload notification (best-effort; if either is empty + # the post-publish notify step is skipped with a warning + # and the bot picks up the new snapshot on its next reload). + BOT_AGENT_RELOAD_URL: $(BOT_AGENT_RELOAD_URL) + BOT_AGENT_ADMIN_TOKEN: $(BOT_AGENT_ADMIN_TOKEN) + + # Set working directory for temporary files. 
class TestConfigurationLoader:
    """Exercise ConfigurationLoader against throwaway JSON config files.

    The loader keeps its state on the class (``_config``, ``_config_path``),
    so every test must leave that state as it found it.
    """

    def setup_method(self):
        """Reset cached config between tests."""
        ConfigurationLoader._config = None

    def teardown_method(self):
        """Clear the cache again so a temp-file config cannot leak into later tests."""
        ConfigurationLoader._config = None

    def test_loads_config_and_exposes_sources(self, temp_config, monkeypatch):
        config = {
            "version": "1.0.0",
            "sources": [
                {
                    "repository": {
                        "url": "https://github.com/org/repo.git",
                        "branch": "main",
                        "authType": "public",
                    },
                    "paths": [
                        {"name": "docs_a", "description": "Docs A", "path": "docs", "folder": "folder_a"},
                        {"name": "docs_b", "description": "Docs B", "folder": "folder_b", "relativeByRepoPath": True},
                    ],
                }
            ],
        }
        path = temp_config(config)
        # monkeypatch restores the previous value at teardown, so the loader
        # does not keep pointing at a deleted tmp_path file.
        # raising=False covers the case where the attribute is not predeclared.
        monkeypatch.setattr(ConfigurationLoader, "_config_path", Path(path), raising=False)
        docs = ConfigurationLoader.get_documentation_sources()

        assert len(docs) == 2
        assert docs[0].folder == "folder_a"
        assert "docs" in docs[0].path
        assert docs[1].folder == "folder_b"

    def test_repository_configs(self, temp_config, monkeypatch):
        config = {
            "version": "1.0.0",
            "sources": [
                {
                    "repository": {
                        "url": "https://github.com/org/another.git",
                        "branch": "dev",
                        "authType": "public",
                    },
                    "paths": [
                        {"name": "part1", "description": "Part1", "path": "docs/part1", "folder": "f1"},
                        {"name": "part2", "description": "Part2", "path": "docs/part2", "folder": "f2"},
                    ],
                }
            ],
        }
        path = temp_config(config)
        monkeypatch.setattr(ConfigurationLoader, "_config_path", Path(path), raising=False)
        repos = ConfigurationLoader.get_repository_configs()

        assert len(repos) == 1
        repo = repos[0]
        assert repo.url == "https://github.com/org/another.git"
        assert repo.branch == "dev"
        # Sparse checkout paths should include both paths
        assert len(repo.sparse_checkout) == 2
        assert "docs/part1" in repo.sparse_checkout
        assert "docs/part2" in repo.sparse_checkout
class TestExtractDateFromFilename:
    """Date extraction from release-note filenames."""

    def test_valid_release_date(self):
        # A `release-YYYY-MM-DD.md` name yields the embedded date string.
        assert extract_date_from_filename("release-2024-12-25.md") == "2024-12-25"

    def test_invalid_filename_returns_epoch(self):
        # A name without a date is expected to map to the epoch date string.
        assert extract_date_from_filename("not-a-release-file.md") == "1970-01-01"
class TestExtractReleaseInfo:
    """Frontmatter metadata extraction for release notes."""

    def test_extracts_all_fields(self):
        # Quoted and unquoted frontmatter values should come back as plain strings.
        document = '---\ntitle: "TypeSpec"\nreleaseDate: 2024-03-15\nversion: "0.52.0"\n---\nContent here.'
        parsed = extract_release_info(document)
        expected = {
            "title": "TypeSpec",
            "releaseDate": "2024-03-15",
            "version": "0.52.0",
        }
        for field, value in expected.items():
            assert parsed[field] == value

    def test_no_frontmatter(self):
        # Without a frontmatter block every field is an empty string.
        parsed = extract_release_info("No frontmatter here")
        for field in ("title", "releaseDate", "version"):
            assert parsed[field] == ""
b/tools/sdk-ai-bots/offline-evaluation.yml index f3b46164e5e..c30e72f49c8 100644 --- a/tools/sdk-ai-bots/offline-evaluation.yml +++ b/tools/sdk-ai-bots/offline-evaluation.yml @@ -1,8 +1,3 @@ -parameters: -- name: PythonVersion - type: string - default: '3.10' - trigger: branches: include: @@ -46,9 +41,10 @@ extends: apispec: testPrefix: apispec steps: - - template: /eng/pipelines/templates/steps/use-python-version.yml - parameters: - versionSpec: '${{ parameters.PythonVersion }}' + - task: UsePythonVersion@0 + displayName: "Use Python 3.12" + inputs: + versionSpec: '3.12' - script: | python --version displayName: 'check python' diff --git a/tools/sdk-ai-bots/online-evaluation.yml b/tools/sdk-ai-bots/online-evaluation.yml index 083f0944e14..d8e12e4e8e2 100644 --- a/tools/sdk-ai-bots/online-evaluation.yml +++ b/tools/sdk-ai-bots/online-evaluation.yml @@ -1,8 +1,3 @@ -parameters: -- name: PythonVersion - type: string - default: '3.10' - schedules: - cron: "0 1 * * 1" # Runs every Monday at 01:00 UTC displayName: "Weekly Monday Morning Trigger" @@ -30,9 +25,10 @@ extends: - job: 'evaluation' displayName: 'online evaluation' steps: - - template: /eng/pipelines/templates/steps/use-python-version.yml - parameters: - versionSpec: '${{ parameters.PythonVersion }}' + - task: UsePythonVersion@0 + displayName: "Use Python 3.12" + inputs: + versionSpec: '3.12' - script: | python --version displayName: 'check python'