diff --git a/docs/_ext/json_output/README.md b/docs/_ext/json_output/README.md new file mode 100644 index 00000000..a0d966f4 --- /dev/null +++ b/docs/_ext/json_output/README.md @@ -0,0 +1,295 @@ +# JSON Output Extension + +Sphinx extension to generate JSON output for every page alongside HTML output. + +Similar to Hugo's output formats, this creates parallel JSON files for each document +containing metadata, content, and other structured data that can be consumed by +search engines, APIs, or other applications. + +The main use case is generating comprehensive search indexes for tools like Solr, +Lunr.js, or custom search implementations. + +## Search Index Integration + +The main index.json file contains all documents with full content, perfect for: + +- **Lunr.js**: Load index.json and build search index from documents +- **Solr**: POST the JSON data to Solr's update endpoint +- **Elasticsearch**: Bulk index the documents array +- **Custom search**: Parse JSON and implement your own search logic + +## Enhanced JSON Structure + +The JSON structure includes search-optimized fields and global metadata from `conf.py`: + +```json +{ + "id": "getting-started/installation-guide", + "title": "Installation Guide", + "url": "/getting-started/installation-guide.html", + "last_modified": "2026-01-15T10:30:00Z", + + "book": { + "title": "NVIDIA NeMo Guardrails Library Developer Guide", + "version": "0.11.0" + }, + "product": { + "name": "NeMo Guardrails", + "family": ["NeMo"], + "version": "0.11.0" + }, + "site": { + "name": "NVIDIA Technical Documentation" + }, + + "content": "Full markdown content here...", + "content_length": 5420, + "word_count": 850, + "format": "text", + "summary": "Quick summary for previews...", + "doc_type": "tutorial", + "section_path": ["Getting Started", "Installation Guide"], + "headings": [ + {"text": "Prerequisites", "level": 2, "id": "prerequisites"} + ], + "headings_text": "Prerequisites Installation Steps Troubleshooting", + "keywords": 
["install", "setup", "prerequisites", "pip", "python", "guardrails"], + "code_blocks": [ + {"content": "pip install nemoguardrails", "language": "bash"} + ], + "links": [ + { + "text": "Configuration Guide", + "url": "/configure-rails/index.html", + "type": "cross_reference", + "ref_type": "doc", + "target_doc": "configure-rails/index" + }, + { + "text": "GitHub Repository", + "url": "https://github.com/NVIDIA/NeMo-Guardrails", + "type": "external" + } + ], + "tags": ["setup", "guide"], + "categories": ["tutorials"] +} +``` + +## Configuration Examples + +### Minimal Configuration (Recommended) + +Uses optimized defaults for best performance: + +```python +# conf.py +json_output_settings = { + 'enabled': True, # All other settings use performance-optimized defaults +} +``` + +### Comprehensive Search Index (Default Behavior) + +```python +json_output_settings = { + 'enabled': True, + 'verbose': True, # Default: detailed logging + 'parallel': True, # Default: parallel processing + 'main_index_mode': 'full', # Default: full content + 'max_main_index_docs': 0, # Default: no limit + 'minify_json': True, # Default: smaller files + 'filter_search_clutter': True, # Default: clean content +} +``` + +### Large Sites Configuration + +```python +json_output_settings = { + 'enabled': True, + 'max_main_index_docs': 500, # Limit to 500 documents + 'content_max_length': 20000, # Limit content length + 'skip_large_files': 50000, # Skip files over 50KB +} +``` + +### Fastest Builds (Minimal Features) + +```python +json_output_settings = { + 'enabled': True, + 'main_index_mode': 'metadata_only', # Only titles, descriptions, tags + 'lazy_extraction': True, # Skip keywords, links, code_blocks, images + 'skip_complex_parsing': True, # Skip complex parsing features +} +``` + +## Available Settings + +### Core Settings + +- **enabled** (bool): Enable/disable JSON output generation. Default: `True` +- **verbose** (bool): Enable verbose logging. 
Default: `True` +- **parallel** (bool): Enable parallel processing. Default: `True` +- **exclude_patterns** (list): Patterns to exclude from JSON generation. Default: `['_build', '_templates', '_static']` +- **include_children** (bool): Include child documents in directory indexes. Default: `True` +- **include_child_content** (bool): Include full content in child documents. Default: `True` +- **main_index_mode** (str): How to handle main index page. Options: `'disabled'`, `'metadata_only'`, `'full'`. Default: `'full'` +- **max_main_index_docs** (int): Maximum documents to include in main index (0 = no limit). Default: `0` + +### Search Optimization Features + +- **extract_code_blocks** (bool): Include code blocks in search data. Default: `True` +- **extract_links** (bool): Include internal/external links. Default: `True` +- **extract_images** (bool): Include image references. Default: `True` +- **extract_keywords** (bool): Auto-extract technical keywords (frontmatter `keywords` field takes priority). Default: `True` +- **include_doc_type** (bool): Auto-detect document types (tutorial, guide, reference, etc.). Default: `True` +- **include_section_path** (bool): Include hierarchical section paths. Default: `True` + +### Link Extraction Options + +- **link_normalization** (bool): Normalize internal URLs to absolute paths with `.html` extension. Default: `True` +- **link_include_ref_type** (bool): Include `ref_type` metadata (ref, doc, any, etc.) for cross-references. Default: `True` +- **link_include_target_doc** (bool): Include `target_doc` for cross-references (enables document relationship mapping). Default: `True` +- **link_resolve_titles** (bool): Resolve filename-like link text (e.g., "index") to document titles (e.g., "Getting Started"). Default: `True` + +### Performance Controls + +- **content_max_length** (int): Max content length per document (0 = no limit). Default: `50000` +- **summary_max_length** (int): Max summary length. 
Default: `500` +- **keywords_max_count** (int): Max keywords per document. Default: `50` + +### Output Format Options + +- **minify_json** (bool): Minify JSON output (removes indentation for smaller files). Default: `True` +- **separate_content** (bool): Store content in separate .content.json files for better performance. Default: `False` + +### Speed Optimizations + +- **parallel_workers** (str or int): Number of parallel workers: `'auto'` or an integer from 1 to 32. Default: `'auto'` +- **batch_size** (int): Process documents in batches (1 to 1000). Default: `50` +- **cache_aggressive** (bool): Enable aggressive caching. Default: `True` +- **lazy_extraction** (bool): Skip all feature extraction (headings, summary, keywords, links, code_blocks, images) for faster builds. Default: `False` +- **skip_large_files** (int): Skip files larger than N bytes (0 = disabled). Default: `100000` +- **incremental_build** (bool): Only process changed files. Default: `True` +- **memory_limit_mb** (int): Memory limit per worker in MB (64 to 8192). Default: `512` +- **fast_text_extraction** (bool): Use faster text extraction. Default: `True` +- **skip_complex_parsing** (bool): Skip complex parsing features (code_blocks, links, images). Default: `False` + +### Content Filtering + +- **filter_search_clutter** (bool): Remove SVG, toctree, and other non-searchable content. Default: `True` + +### Global Metadata + +- **global_metadata** (dict): User-defined global fields injected into all JSON files. Default: `{}` +- **infer_global_metadata** (bool): Auto-infer book/product/site from Sphinx config. Default: `True` + +## Global Metadata from conf.py + +The extension can inject site-wide metadata from `conf.py` into every JSON file, providing consistent book/product/site context without requiring frontmatter on each page. 
+ +### Auto-Inference (Default) + +By default, the extension auto-infers global metadata from standard Sphinx configuration: + +| JSON Field | Source | Example | +|------------|--------|---------| +| `book.title` | `project` | "NVIDIA NeMo Guardrails Library Developer Guide" | +| `book.version` | `release` | "0.11.0" | +| `product.name` | Extracted from `project` (strips "NVIDIA" prefix and doc suffixes) | "NeMo Guardrails" | +| `product.version` | `release` | "0.11.0" | +| `product.family` | `html_context["product_family"]` (if set) | ["NeMo"] | +| `site.name` | `html_context["site_name"]` (if set) | "NVIDIA Technical Documentation" | + +### Explicit Configuration + +For full control, provide explicit `global_metadata`: + +```python +# conf.py +project = "NVIDIA NeMo Guardrails Library Developer Guide" +release = "0.11.0" + +json_output_settings = { + "enabled": True, + "global_metadata": { + "book": { + "title": project, + "version": release, + }, + "product": { + "name": "NeMo Guardrails", + "family": ["NeMo"], + "version": release, + }, + "site": { + "name": "NVIDIA Technical Documentation", + }, + }, +} +``` + +### Using html_context for Inference + +You can also set values via `html_context` for auto-inference: + +```python +# conf.py +project = "NVIDIA NeMo Guardrails Library Developer Guide" +release = "0.11.0" + +html_context = { + "product_name": "NeMo Guardrails", + "product_family": ["NeMo"], + "site_name": "NVIDIA Technical Documentation", +} + +json_output_settings = { + "enabled": True, + "infer_global_metadata": True, # Default +} +``` + +### Disabling Global Metadata + +To disable global metadata entirely: + +```python +json_output_settings = { + "enabled": True, + "infer_global_metadata": False, + "global_metadata": {}, +} +``` + +## Content Gating Integration + +This extension automatically respects content gating rules set by the content_gating extension at multiple levels: + +### Document-Level Gating + +Documents with 'only' conditions in 
frontmatter that fail evaluation (e.g., 'only: not ga' when building with -t ga) will be excluded from JSON generation entirely, ensuring sensitive content doesn't leak into search indexes. + +### Content-Level Gating + +Content sections wrapped in `{conditional}` directives are also properly filtered. When conditions don't match, the content is excluded from the document tree and won't appear in the generated JSON. + +### Integration Details + +- **Automatic Detection**: Detects if content_gating extension is loaded +- **Exclude Pattern Sync**: Respects documents added to exclude_patterns by content gating +- **Build Tag Awareness**: Logs current build tags for debugging +- **Debug Logging**: Provides detailed logs when content gating rules are applied + +The integration works seamlessly - just enable both extensions and your JSON output will automatically respect all content gating rules without additional configuration. + +## Performance Tips + +1. **Enable parallel processing** for faster builds on multi-core systems +2. **Use incremental builds** to only process changed files +3. **Set content length limits** for large documentation sites +4. **Enable content filtering** to reduce JSON file sizes +5. **Use batch processing** to control memory usage +6. **Skip large files** to avoid processing massive documents diff --git a/docs/_ext/json_output/__init__.py b/docs/_ext/json_output/__init__.py new file mode 100644 index 00000000..447af75b --- /dev/null +++ b/docs/_ext/json_output/__init__.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
"""
Sphinx extension to generate JSON output for every page alongside HTML output.

This extension creates parallel JSON files for each document containing metadata,
content, and other structured data that can be consumed by search engines, APIs,
or other applications.

See README.md for detailed configuration options and usage examples.
"""

from typing import Any

from sphinx.application import Sphinx

from .config import get_default_settings, validate_config
from .processing import on_build_finished


def setup(app: Sphinx) -> dict[str, Any]:
    """Register the extension's config value and event handlers with Sphinx.

    Args:
        app: The Sphinx application the extension is being loaded into.

    Returns:
        Extension metadata: the extension version and flags declaring it safe
        for parallel reading and writing.
    """
    # Register the single settings dictionary, rebuilt for "html" changes.
    app.add_config_value("json_output_settings", get_default_settings(), "html")

    # Hook validation in once the config is initialised, and JSON generation
    # in once the build has finished.
    event_handlers = (
        ("config-inited", validate_config),
        ("build-finished", on_build_finished),
    )
    for event_name, handler in event_handlers:
        app.connect(event_name, handler)

    extension_metadata: dict[str, Any] = {
        "version": "1.0.0",
        "parallel_read_safe": True,
        "parallel_write_safe": True,
    }
    return extension_metadata
"""Configuration management for JSON output extension."""

from typing import Any

from sphinx.application import Sphinx
from sphinx.config import Config
from sphinx.util import logging

logger = logging.getLogger(__name__)

# Constants
MAX_PARALLEL_WORKERS = 32


def get_default_settings() -> dict[str, Any]:
    """Get default configuration settings for json_output extension.

    A new dictionary (with a new ``exclude_patterns`` list and a new
    ``global_metadata`` dict) is built on every call, so the result can be
    mutated without affecting later calls.
    """
    return {
        "enabled": True,
        "exclude_patterns": ["_build", "_templates", "_static"],
        "verbose": True,  # Enable by default for better user feedback
        "parallel": True,  # Enable parallel processing by default for speed
        "include_children": True,
        "include_child_content": True,
        "main_index_mode": "full",  # 'disabled', 'metadata_only', 'full'
        "max_main_index_docs": 0,  # No limit by default for comprehensive search
        # Search optimization features
        "extract_code_blocks": True,  # Include code blocks in search data
        "extract_links": True,  # Include internal/external links
        "extract_images": True,  # Include image references
        "extract_keywords": True,  # Auto-extract technical keywords
        "include_doc_type": True,  # Auto-detect document types
        "include_section_path": True,  # Include hierarchical section paths
        # Link extraction options
        "link_normalization": True,  # Normalize internal URLs to absolute paths
        "link_include_ref_type": True,  # Include ref_type metadata (ref, doc, etc.)
        "link_include_target_doc": True,  # Include target_doc for cross-references
        "link_resolve_titles": True,  # Resolve filename-like link text to document titles
        # Performance controls
        "content_max_length": 50000,  # Max content length per document (0 = no limit)
        "summary_max_length": 500,  # Max summary length
        "keywords_max_count": 50,  # Max keywords per document
        # Output format options
        "minify_json": True,  # Minify JSON by default for better performance
        "separate_content": False,  # Store content in separate .content.json files
        # Speed optimizations
        "parallel_workers": "auto",  # Number of parallel workers
        "batch_size": 50,  # Process documents in batches
        "cache_aggressive": True,  # Enable aggressive caching
        "lazy_extraction": False,  # Skip feature extraction (keywords, links, etc.) for faster builds
        "skip_large_files": 100000,  # Skip files larger than N bytes
        "incremental_build": True,  # Only process changed files
        "memory_limit_mb": 512,  # Memory limit per worker
        "fast_text_extraction": True,  # Use faster text extraction
        "skip_complex_parsing": False,  # Skip complex parsing features
        # Content filtering
        "filter_search_clutter": True,  # Remove SVG, toctree, and other non-searchable content
        # Global metadata from conf.py
        "global_metadata": {},  # User-defined global fields (book, product, site)
        "infer_global_metadata": True,  # Auto-infer from Sphinx config (project, release)
    }


def apply_config_defaults(settings: dict[str, Any]) -> dict[str, Any]:
    """Fill every key missing from ``settings`` with its default value.

    The dictionary is updated in place and also returned. Keys that are
    already present are left untouched, whatever their value.
    """
    for key, default_value in get_default_settings().items():
        settings.setdefault(key, default_value)

    return settings


def validate_config(_app: Sphinx, config: Config) -> None:
    """Validate configuration values.

    Normalises ``config.json_output_settings`` to a dictionary, fills in
    defaults, then runs each validator. Invalid values are not fatal: each
    validator logs a warning and substitutes the default.

    Args:
        _app: The Sphinx application (unused).
        config: The Sphinx config object holding ``json_output_settings``.
    """
    settings = apply_config_defaults(_ensure_settings_dict(config))
    config.json_output_settings = settings

    _validate_core_settings(settings)
    _validate_content_limits(settings)
    _validate_boolean_settings(settings)
    _validate_integer_settings(settings)
    _validate_parallel_workers(settings)
    _validate_global_metadata(settings)


def _is_plain_int(value: Any) -> bool:
    """Return True for real integers, rejecting ``True``/``False``.

    ``bool`` is a subclass of ``int``, so a bare ``isinstance(value, int)``
    would accept ``True`` as the number 1.
    """
    return isinstance(value, int) and not isinstance(value, bool)


def _ensure_settings_dict(config: Config) -> dict[str, Any]:
    """Ensure settings is a valid dictionary.

    If ``config.json_output_settings`` is not a dict, a warning is logged and
    it is replaced on ``config`` by an empty dict, which is returned.
    """
    settings = getattr(config, "json_output_settings", {})
    if not isinstance(settings, dict):
        logger.warning("json_output_settings must be a dictionary. Using defaults.")
        settings = {}
        config.json_output_settings = settings
    return settings


def _validate_core_settings(settings: dict[str, Any]) -> None:
    """Validate ``main_index_mode`` and ``exclude_patterns``, resetting invalid values."""
    # Validate main index mode
    valid_modes = ["disabled", "metadata_only", "full"]
    mode = settings.get("main_index_mode", "full")
    if mode not in valid_modes:
        logger.warning(f"Invalid main_index_mode '{mode}'. Using 'full'. Valid options: {valid_modes}")
        settings["main_index_mode"] = "full"

    # Validate exclude patterns
    patterns = settings.get("exclude_patterns", [])
    if not isinstance(patterns, list):
        logger.warning("exclude_patterns must be a list. Using default.")
        # Take the fallback from the defaults so the two cannot drift apart.
        settings["exclude_patterns"] = get_default_settings()["exclude_patterns"]


def _validate_content_limits(settings: dict[str, Any]) -> None:
    """Validate content-related limit settings.

    Each limit must be a non-negative integer (booleans are rejected);
    otherwise a warning is logged and the default is restored.
    """
    # setting name -> (default value, description used in the warning)
    limit_settings = {
        "max_main_index_docs": (0, "0 (no limit)"),
        "content_max_length": (50000, "50000 (0 = no limit)"),
        "summary_max_length": (500, "500"),
        "keywords_max_count": (50, "50"),
    }

    for setting, (default_val, description) in limit_settings.items():
        value = settings.get(setting, default_val)
        if not _is_plain_int(value) or value < 0:
            logger.warning(f"Invalid {setting} '{value}'. Using {description}.")
            settings[setting] = default_val


def _validate_boolean_settings(settings: dict[str, Any]) -> None:
    """Validate boolean configuration settings.

    Any listed setting that is present but not a ``bool`` is reset to its
    default, with a warning.
    """
    bool_settings = [
        "enabled",
        "verbose",
        "parallel",
        "include_children",
        "include_child_content",
        "extract_code_blocks",
        "extract_links",
        "extract_images",
        "extract_keywords",
        "include_doc_type",
        "include_section_path",
        "link_normalization",
        "link_include_ref_type",
        "link_include_target_doc",
        "link_resolve_titles",
        "minify_json",
        "separate_content",
        "cache_aggressive",
        "lazy_extraction",
        "incremental_build",
        "fast_text_extraction",
        "skip_complex_parsing",
        "filter_search_clutter",
        "infer_global_metadata",
    ]

    defaults = get_default_settings()
    for setting in bool_settings:
        if setting in settings and not isinstance(settings[setting], bool):
            logger.warning(f"Setting '{setting}' must be boolean. Using default.")
            settings[setting] = defaults[setting]


def _validate_integer_settings(settings: dict[str, Any]) -> None:
    """Validate integer configuration settings with ranges.

    A value outside its range, or not an integer (booleans are rejected),
    is reset to its default, with a warning.
    """
    int_settings = {
        "batch_size": (1, 1000),  # min, max
        "skip_large_files": (0, None),  # 0 = disabled
        "memory_limit_mb": (64, 8192),  # reasonable memory limits
    }

    defaults = get_default_settings()
    for setting, (min_val, max_val) in int_settings.items():
        if setting not in settings:
            continue

        value = settings[setting]
        # A max of None means "no upper bound"; compare against None
        # explicitly rather than relying on truthiness.
        in_range = _is_plain_int(value) and value >= min_val and (max_val is None or value <= max_val)
        if not in_range:
            logger.warning(
                f"Setting '{setting}' must be integer between {min_val} and {max_val or 'unlimited'}. Using default."
            )
            settings[setting] = defaults[setting]


def _validate_parallel_workers(settings: dict[str, Any]) -> None:
    """Validate parallel_workers setting (can be 'auto' or integer).

    Accepted values are the string ``'auto'`` or an integer from 1 to
    ``MAX_PARALLEL_WORKERS``. Anything else (including booleans) is reset
    to the default, with a warning.
    """
    if "parallel_workers" not in settings:
        return

    value = settings["parallel_workers"]
    if value == "auto":
        return

    if not _is_plain_int(value) or value < 1 or value > MAX_PARALLEL_WORKERS:
        logger.warning(
            f"Setting 'parallel_workers' must be 'auto' or integer between 1 and {MAX_PARALLEL_WORKERS}. Using default."
        )
        settings["parallel_workers"] = get_default_settings()["parallel_workers"]


def _validate_global_metadata(settings: dict[str, Any]) -> None:
    """Validate global_metadata setting structure.

    ``global_metadata`` must be a dict; if not, it is replaced by ``{}``.
    The known sections ``book``, ``product`` and ``site`` must themselves be
    dicts; an invalid section is removed from the user's dictionary.
    """
    global_metadata = settings.get("global_metadata", {})

    if not isinstance(global_metadata, dict):
        logger.warning("global_metadata must be a dictionary. Using empty default.")
        settings["global_metadata"] = {}
        return

    # Validate known top-level keys have dict values
    valid_sections = ["book", "product", "site"]
    for section in valid_sections:
        if section in global_metadata and not isinstance(global_metadata[section], dict):
            logger.warning(f"global_metadata.{section} must be a dictionary. Removing invalid value.")
            del global_metadata[section]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Content extraction functions for JSON output.""" + +from .extractor import extract_document_content +from .metadata import extract_document_metadata + +__all__ = [ + "extract_document_content", + "extract_document_metadata", +] diff --git a/docs/_ext/json_output/content/extractor.py b/docs/_ext/json_output/content/extractor.py new file mode 100644 index 00000000..9cf97565 --- /dev/null +++ b/docs/_ext/json_output/content/extractor.py @@ -0,0 +1,245 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Main content extraction orchestration."""

from typing import Any

from docutils import nodes
from sphinx.environment import BuildEnvironment
from sphinx.util import logging

from .structured import extract_code_blocks, extract_headings, extract_images, extract_links
from .text import (
    clean_text_for_llm,
    extract_clean_text_content,
    extract_keywords,
    extract_raw_markdown,
    extract_summary,
    extract_text_content,
)

logger = logging.getLogger(__name__)


def extract_document_content(env: BuildEnvironment, docname: str, content_cache: dict) -> dict[str, Any]:
    """Extract content from document optimized for LLM/search use cases.

    Args:
        env: Sphinx build environment used to load the doctree and settings.
        docname: Name of the document to extract.
        content_cache: Per-build cache keyed by docname; read first and
            updated with the result.

    Returns:
        The content dictionary for ``docname``. If extraction fails, the
        error is logged and an empty content dictionary is cached and
        returned instead of raising.
    """
    if docname in content_cache:
        return content_cache[docname]

    try:
        logger.debug(f"Starting content extraction for {docname}")
        doctree = env.get_doctree(docname)

        # Get extraction settings
        extraction_settings = _get_extraction_settings(env)

        # Extract main content
        content = _extract_main_content(doctree, env, docname, extraction_settings)

        # Extract additional features based on settings (pass env for link resolution)
        _extract_additional_features(content, doctree, docname, extraction_settings, env)

        # Cache and return result
        content_cache[docname] = content
        logger.debug(f"Successfully extracted content for {docname}")

    except Exception:
        # Best effort: one broken document should not abort the whole export.
        logger.exception(f"Critical error extracting content from {docname}")
        content = _get_empty_content_dict()
        content_cache[docname] = content

    return content_cache[docname]


def _get_extraction_settings(env: BuildEnvironment) -> dict[str, bool]:
    """Extract all extraction-related settings from environment config.

    Returns a dictionary with the keys ``fast_extraction``,
    ``lazy_extraction``, ``skip_complex`` and ``filter_clutter``.
    """
    config = getattr(env.app, "config", None)
    json_settings = getattr(config, "json_output_settings", {}) if config else {}

    return {
        # NOTE(review): this fallback is False while get_default_settings() uses True for
        # fast_text_extraction; it only matters when the key is missing - confirm intent.
        "fast_extraction": json_settings.get("fast_text_extraction", False),
        "lazy_extraction": json_settings.get("lazy_extraction", False),
        "skip_complex": json_settings.get("skip_complex_parsing", False),
        "filter_clutter": json_settings.get("filter_search_clutter", True),
    }


def _extract_main_content(
    doctree: nodes.document, env: BuildEnvironment, docname: str, settings: dict[str, bool]
) -> dict[str, Any]:
    """Extract main text content with appropriate strategy.

    Uses plain text extraction when ``fast_extraction`` is set, otherwise the
    fallback chain. Content filtering is applied afterwards when enabled.
    Any error is logged as a warning and yields empty text content.
    """
    content = {}

    try:
        if settings["fast_extraction"]:
            content["content"] = extract_text_content(doctree)
            content["format"] = "text"
            logger.debug(f"Fast text extraction for {docname}: {len(content['content'])} chars")
        else:
            content = _extract_with_fallbacks(doctree, env, docname)

        # Apply content filtering if enabled
        if settings["filter_clutter"] and content.get("content"):
            _apply_content_filtering(content, docname)

    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting main content from {docname}: {e}")
        content = {"content": "", "format": "text"}

    return content


def _extract_with_fallbacks(doctree: nodes.document, env: BuildEnvironment, docname: str) -> dict[str, Any]:
    """Extract content with multiple fallback strategies.

    Order: clean text, then raw markdown, then basic text. The first
    non-empty result wins; ``format`` records which kind was produced.
    """
    # Try clean text first (pass env for link title resolution)
    clean_text = extract_clean_text_content(doctree, env)
    if clean_text:
        logger.debug(f"Extracted clean text content for {docname}: {len(clean_text)} chars")
        return {"content": clean_text, "format": "text"}

    # Fallback to raw markdown
    raw_markdown = extract_raw_markdown(env, docname)
    if raw_markdown:
        logger.debug(f"Fallback to raw markdown for {docname}: {len(raw_markdown)} chars")
        return {"content": raw_markdown, "format": "markdown"}

    # Final fallback to basic text
    logger.debug(f"Fallback to basic text extraction for {docname}")
    return {"content": extract_text_content(doctree), "format": "text"}


def _apply_content_filtering(content: dict[str, Any], docname: str) -> None:
    """Apply content filtering to remove clutter.

    Replaces ``content["content"]`` in place and logs the change in length
    when filtering removed something.
    """
    original_length = len(content["content"])
    content["content"] = clean_text_for_llm(content["content"])
    filtered_length = len(content["content"])

    if original_length != filtered_length:
        logger.debug(f"Content filtering for {docname}: {original_length} -> {filtered_length} chars")


def _extract_additional_features(
    content: dict[str, Any],
    doctree: nodes.document,
    docname: str,
    settings: dict[str, bool],
    env: BuildEnvironment | None = None,
) -> None:
    """Extract additional features based on extraction settings.

    With ``lazy_extraction`` every additional feature is set to an empty
    value. Otherwise headings and summary are always extracted, the complex
    features (code blocks, links, images) are extracted unless
    ``skip_complex`` is set, and keywords are extracted last.
    """
    if settings["lazy_extraction"]:
        _set_empty_additional_features(content)
        return

    # Extract basic features
    _extract_basic_features(content, doctree, docname)

    # Extract complex features if not skipped
    if settings["skip_complex"]:
        _set_empty_complex_features(content)
    else:
        _extract_complex_features(content, doctree, docname, env)

    # Lazy extraction returned early above, so keywords are always extracted
    # here. They come last because they are built from content and headings.
    _extract_keywords_feature(content, docname)


def _extract_basic_features(content: dict[str, Any], doctree: nodes.document, docname: str) -> None:
    """Extract basic features: headings and summary.

    A failing extractor is logged as a warning and its feature falls back to
    an empty value; the other feature is still extracted.
    """
    # (feature name, extractor, value used when the extractor fails)
    features = [
        ("headings", extract_headings, []),
        ("summary", extract_summary, ""),
    ]

    for feature_name, extract_func, default_value in features:
        try:
            result = extract_func(doctree)
            content[feature_name] = result
            if feature_name == "headings":
                logger.debug(f"Extracted {len(result)} headings from {docname}")
        except Exception as e:  # noqa: BLE001, PERF203
            logger.warning(f"Error extracting {feature_name} from {docname}: {e}")
            content[feature_name] = default_value


def _extract_complex_features(
    content: dict[str, Any],
    doctree: nodes.document,
    docname: str,
    env: BuildEnvironment | None = None,
) -> None:
    """Extract complex features: code blocks, links, and images.

    Each feature is extracted independently; a failure is logged as a
    warning and that feature is set to an empty list.
    """
    # Code blocks and images don't need env
    simple_features = [
        ("code_blocks", extract_code_blocks),
        ("images", extract_images),
    ]

    for feature_name, extract_func in simple_features:
        try:
            result = extract_func(doctree)
            content[feature_name] = result
            logger.debug(f"Extracted {len(result)} {feature_name} from {docname}")
        except Exception as e:  # noqa: BLE001, PERF203
            logger.warning(f"Error extracting {feature_name} from {docname}: {e}")
            content[feature_name] = []

    # Links need env for title resolution
    try:
        content["links"] = extract_links(doctree, env, docname)
        logger.debug(f"Extracted {len(content['links'])} links from {docname}")
    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting links from {docname}: {e}")
        content["links"] = []


def _extract_keywords_feature(content: dict[str, Any], docname: str) -> None:
    """Extract keywords from content and headings.

    Reads ``content["content"]`` and ``content["headings"]`` (empty if
    missing) and stores the result in ``content["keywords"]``; on error the
    keywords are set to an empty list.
    """
    try:
        content["keywords"] = extract_keywords(content.get("content", ""), content.get("headings", []))
        logger.debug(f"Extracted {len(content['keywords'])} keywords from {docname}")
    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting keywords from {docname}: {e}")
        content["keywords"] = []


def _set_empty_additional_features(content: dict[str, Any]) -> None:
    """Set empty values for all additional features (lazy extraction)."""
    features = ["headings", "summary", "code_blocks", "links", "images", "keywords"]
    for feature in features:
        # summary is a string; every other feature is a list
        content[feature] = [] if feature != "summary" else ""


def _set_empty_complex_features(content: dict[str, Any]) -> None:
    """Set empty values for complex features only."""
    for feature in ["code_blocks", "links", "images"]:
        content[feature] = []


def _get_empty_content_dict() -> dict[str, Any]:
    """Get empty content dictionary for error cases."""
    return {
        "content": "",
        "format": "text",
        "headings": [],
        "summary": "",
        "code_blocks": [],
        "links": [],
        "images": [],
        "keywords": [],
    }
"""Metadata and frontmatter extraction functions."""

from typing import Any

from sphinx.environment import BuildEnvironment
from sphinx.util import logging

# Import YAML at module level with error handling
try:
    import yaml

    YAML_AVAILABLE = True
except ImportError:
    YAML_AVAILABLE = False
    yaml = None

logger = logging.getLogger(__name__)


def extract_document_metadata(
    env: BuildEnvironment, docname: str, metadata_cache: dict, frontmatter_cache: dict
) -> dict[str, Any]:
    """Extract metadata from document with caching.

    Combines ``env.metadata[docname]`` (when present) with the YAML
    frontmatter of ``.md`` sources; frontmatter values win on key collisions.

    Args:
        env: Sphinx build environment.
        docname: Name of the document.
        metadata_cache: Cache keyed by docname; read first and updated.
        frontmatter_cache: Cache passed through to ``extract_frontmatter``.

    Returns:
        The metadata dictionary. On an unexpected error a warning is logged
        and an empty dictionary is cached and returned.
    """
    if docname in metadata_cache:
        return metadata_cache[docname]

    metadata = {}

    try:
        if hasattr(env, "metadata") and docname in env.metadata:
            metadata.update(env.metadata[docname])

        source_path = env.doc2path(docname)
        if source_path and str(source_path).endswith(".md"):
            frontmatter = extract_frontmatter(str(source_path), frontmatter_cache)
            if frontmatter:
                metadata.update(frontmatter)

        metadata_cache[docname] = metadata
        logger.debug(f"Successfully extracted metadata for {docname}: {len(metadata)} items")

    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting metadata from {docname}: {e}")
        metadata_cache[docname] = {}

    return metadata_cache[docname]


def extract_frontmatter(file_path: str, frontmatter_cache: dict) -> dict[str, Any] | None:
    """Extract YAML frontmatter from markdown files.

    The frontmatter is the text between a leading ``---`` and the next line
    consisting of ``---``. Windows (CRLF) line endings, a UTF-8 byte order
    mark, and a closing marker on the last line without a trailing newline
    are all accepted.

    Args:
        file_path: Path of the markdown file to read.
        frontmatter_cache: Cache keyed by file path; read first and updated
            with the result (including ``None``).

    Returns:
        The parsed frontmatter mapping, or ``None`` when PyYAML is missing,
        the file has no frontmatter, the frontmatter is empty or not a
        mapping, or the file cannot be read or parsed.
    """
    if file_path in frontmatter_cache:
        return frontmatter_cache[file_path]

    result = None

    # Check prerequisites
    if not YAML_AVAILABLE:
        logger.debug("PyYAML not available, skipping frontmatter extraction")
    else:
        try:
            # utf-8-sig strips a leading BOM, which would otherwise hide the opening '---'.
            with open(file_path, encoding="utf-8-sig") as f:
                content = f.read()

            # Normalize CRLF so the marker search works for Windows-edited files.
            content = content.replace("\r\n", "\n")

            # Check for valid frontmatter format
            if content.startswith("---"):
                # Search with a trailing newline appended so a closing '---' at end of file is found.
                end_marker = (content + "\n").find("\n---\n", 3)
                if end_marker != -1:
                    loaded = yaml.safe_load(content[3:end_marker])
                    if isinstance(loaded, dict):
                        result = loaded
                    elif loaded is not None:
                        # A scalar or list here would make the caller's dict.update() fail
                        # and discard all metadata for the document.
                        logger.warning(
                            f"Frontmatter in {file_path} is not a mapping "
                            f"(got {type(loaded).__name__}); ignoring it"
                        )

        except yaml.YAMLError as e:
            logger.warning(f"YAML parsing error in frontmatter for {file_path}: {e}")
            result = None
        except Exception as e:  # noqa: BLE001
            logger.debug(f"Could not extract frontmatter from {file_path}: {e}")
            result = None

    frontmatter_cache[file_path] = result
    return result
def extract_code_blocks(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract code blocks from the document tree.

    Args:
        doctree: Document tree to scan for ``literal_block`` nodes.

    Returns:
        One ``{"content": ..., "language": ...}`` dictionary per non-empty
        literal block, in traversal order. ``language`` falls back to
        ``"text"`` when nothing on the node identifies it.
    """
    # Bare class names that are accepted as a language identifier.
    known_languages = {
        "python",
        "bash",
        "javascript",
        "json",
        "yaml",
        "sql",
        "html",
        "css",
        "cpp",
        "c",
        "java",
        "rust",
        "go",
    }
    code_blocks = []

    for node in doctree.traverse(nodes.literal_block):
        code_content = node.astext().strip()
        if not code_content:
            continue

        attrs = getattr(node, "attributes", {})
        language = "text"  # default

        # 1. Language hints carried in the node's CSS classes.
        for cls in attrs.get("classes", []):
            if cls.startswith("language-"):
                language = cls[len("language-") :]
                break
            if cls in known_languages:
                language = cls
                break

        # 2. The explicit ``language`` attribute. Sphinx's code-block
        #    directive stores the lexer name here, so without this check
        #    most highlighted blocks would be reported as "text".
        explicit_language = attrs.get("language")
        if explicit_language:
            language = explicit_language

        # 3. A language recorded in highlight_args overrides the rest.
        highlight_args = attrs.get("highlight_args") or {}
        if "language" in highlight_args:
            language = highlight_args["language"]

        code_blocks.append({"content": code_content, "language": language})

    return code_blocks
def _extract_reference_node(
    node: nodes.reference,
    env: "BuildEnvironment | None",
    current_docname: str,
) -> dict[str, Any] | None:
    """Extract link metadata from a reference node.

    Args:
        node: Reference node to describe.
        env: Optional build environment, used only to replace filename-like
            link text with a document title.
        current_docname: Name of the document containing the node, used to
            resolve relative internal URLs.

    Returns:
        A dictionary with ``text`` and ``type`` plus, when available, ``url``,
        ``target_doc``, ``ref_type`` and ``original_text``; ``None`` when the
        node has no text or neither a URL nor a target document.
    """
    link_text = node.astext().strip()
    if not link_text:
        return None

    attrs = getattr(node, "attributes", {})
    link: dict[str, Any] = {"text": link_text, "type": "internal"}

    # Extract URL from various attributes
    if "refuri" in attrs:
        refuri = attrs["refuri"]
        link["url"] = refuri
        # Anything with a URI scheme (http:, mailto:, tel:, ...) or a
        # protocol-relative "//host" prefix points outside the docs. The
        # scheme must be at least two characters so a one-letter prefix such
        # as a drive letter is not taken for a scheme.
        if refuri.startswith("//") or re.match(r"[A-Za-z][A-Za-z0-9+.\-]+:", refuri):
            link["type"] = "external"
        elif refuri.startswith("#"):
            link["type"] = "anchor"
        else:
            link["type"] = "internal"
            # Only true internal paths are rewritten to .html and resolved
            # against the current document.
            link["url"] = _normalize_internal_url(refuri, current_docname)
    elif "refid" in attrs:
        link["url"] = f"#{attrs['refid']}"
        link["type"] = "anchor"
    elif "reftarget" in attrs:
        link["url"] = attrs["reftarget"]
        link["type"] = "internal"

    # Extract cross-reference metadata (from :ref:, :doc:, {ref}, {doc}, etc.)
    if "refdoc" in attrs:
        link["target_doc"] = attrs["refdoc"]
        if link["type"] == "internal":
            link["type"] = "cross_reference"

    if "reftype" in attrs:
        link["ref_type"] = attrs["reftype"]

    # Try to improve link text if it looks like a filename
    if env and _looks_like_filename(link_text):
        better_text = _resolve_link_text(link_text, attrs, env)
        if better_text and better_text != link_text:
            link["text"] = better_text
            link["original_text"] = link_text  # Keep original for debugging

    # Only return if we have a URL or target_doc
    if link.get("url") or link.get("target_doc"):
        return link
    return None
+ """ + if not url: + return url + + # Already absolute or external + if url.startswith(("/", "http://", "https://", "#")): + # Just normalize extension for absolute internal paths + if url.startswith("/"): + return _normalize_extension(url) + return url + + # Relative URL - resolve against current document + if current_docname: + # Get directory of current document + if "/" in current_docname: + base_dir = current_docname.rsplit("/", 1)[0] + url = f"{base_dir}/{url}" + + return _normalize_extension(url) + + +def _normalize_extension(url: str) -> str: + """Normalize file extensions to .html.""" + # Split off anchor if present + anchor = "" + if "#" in url: + url, anchor = url.rsplit("#", 1) + anchor = f"#{anchor}" + + # Replace source extensions with .html + for ext in (".md", ".rst", ".txt"): + if url.endswith(ext): + url = url[: -len(ext)] + ".html" + break + + # Add .html if no extension + if url and not url.endswith(".html") and "." not in url.rsplit("/", 1)[-1]: + url = url + ".html" + + return url + anchor + + +def _looks_like_filename(text: str) -> bool: + """Check if text looks like a filename/docname rather than readable text.""" + if not text: + return False + + # Single word with no spaces, possibly with path separators + if " " not in text and ("/" in text or text == text.lower()): + # But not if it's a reasonable title-like word + if len(text) > 2 and text[0].isupper() and text[1:].islower(): + return False + return True + + # Contains path separators + if "/" in text or "\\" in text: + return True + + # Ends with file extension + if re.search(r"\.(md|rst|html|txt)$", text, re.IGNORECASE): + return True + + return False + + +def _resolve_link_text( + text: str, + attrs: dict[str, Any], + env: "BuildEnvironment", +) -> str: + """Try to resolve a filename-like link text to a proper title.""" + # Try to get the target document name + target_doc = attrs.get("refdoc") or attrs.get("reftarget", "") + + # Clean up the target + target_doc = 
def _extract_standalone_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract image nodes that are not part of a figure.

    Images nested inside a ``figure`` are skipped here because
    ``_extract_figure_images`` reports them (with their caption); including
    them in both passes would list every figure image twice.

    Args:
        doctree: Document tree to scan.

    Returns:
        Image info dictionaries for images that have a ``uri`` and no figure
        ancestor.
    """
    images = []

    for node in doctree.traverse(nodes.image):
        # Walk up the tree: stop at the first figure ancestor or at the root.
        ancestor = node.parent
        while ancestor is not None and not isinstance(ancestor, nodes.figure):
            ancestor = ancestor.parent
        if ancestor is not None:
            continue  # belongs to a figure, handled by the figure pass

        if hasattr(node, "attributes"):
            image_info = _build_image_info(node.attributes)
            if image_info:
                images.append(image_info)

    return images
def _extract_figure_caption(figure_node: nodes.figure) -> str:
    """Return the stripped text of the figure's first caption, or ``""`` if it has none."""
    first_caption = next(iter(figure_node.traverse(nodes.caption)), None)
    if first_caption is None:
        return ""
    return first_caption.astext().strip()
def extract_clean_text_content(doctree: nodes.document, env: BuildEnvironment | None = None) -> str:
    """Extract clean text content, filtering out navigation elements.

    Args:
        doctree: The document tree to extract text from
        env: Optional Sphinx environment for resolving link titles

    Returns:
        Cleaned, whitespace-normalised text suitable for search/LLM
        consumption. Each reference contributes its (possibly improved) link
        text exactly once, and text inside substitution definitions or
        ``toctree``/``index``/``meta`` nodes is left out.
    """
    skipped_tags = ("toctree", "index", "meta")
    text_parts = []

    def _is_suppressed(text_node: nodes.Node) -> bool:
        """Tell whether a text node sits below a reference or a skipped node."""
        ancestor = text_node.parent
        while ancestor is not None:
            # Reference text is emitted once from the reference node itself,
            # however deeply the text is nested (e.g. reference > inline > Text).
            if isinstance(ancestor, (nodes.reference, nodes.substitution_definition)):
                return True
            if getattr(ancestor, "tagname", None) in skipped_tags:
                return True
            ancestor = ancestor.parent
        return False

    for node in doctree.traverse():
        # Handle reference nodes specially - extract and potentially improve link text
        if isinstance(node, nodes.reference):
            link_text = _get_improved_link_text(node, env)
            if link_text:
                text_parts.append(link_text)
            continue

        # Only text nodes carry content; every other node type is skipped and
        # its text children are judged individually by the ancestor check.
        if isinstance(node, nodes.Text):
            if _is_suppressed(node):
                continue

            text = node.astext().strip()
            if text and not text.startswith("\u00b6"):  # Skip permalink (pilcrow) symbols
                text_parts.append(text)

    # Join, then collapse every run of whitespace into a single space
    full_text = re.sub(r"\s+", " ", " ".join(text_parts))

    return full_text.strip()
first (target document for cross-references) + target_doc = attrs.get("refdoc", "") + + # Try reftarget as fallback + if not target_doc: + target_doc = attrs.get("reftarget", "") + # Clean up the target + target_doc = target_doc.replace(".html", "").replace(".md", "").replace(".rst", "") + + # Look up title in env.titles + if target_doc and env and hasattr(env, "titles") and target_doc in env.titles: + title_node = env.titles[target_doc] + if title_node: + resolved_title = title_node.astext().strip() + if resolved_title: + return resolved_title + + # Fallback: humanize the filename + return _humanize_link_text(text) + + +def _text_looks_like_filename(text: str) -> bool: + """Check if text looks like a filename rather than readable text.""" + if not text: + return False + + # Contains path separators + if "/" in text or "\\" in text: + return True + + # Ends with file extension + if re.search(r"\.(md|rst|html|txt)$", text, re.IGNORECASE): + return True + + # Single lowercase word (like "index", "readme", "configuration") + if " " not in text and text == text.lower() and len(text) > 2: + # But allow proper nouns that happen to be lowercase in context + return True + + return False + + +def _humanize_link_text(text: str) -> str: + """Convert filename-like text to human-readable form.""" + # Get just the filename part + if "/" in text: + text = text.rsplit("/", 1)[-1] + + # Remove extension + for ext in (".md", ".rst", ".html", ".txt"): + if text.endswith(ext): + text = text[: -len(ext)] + break + + # Replace separators with spaces + text = text.replace("-", " ").replace("_", " ") + + # Title case + return text.title() + + +def clean_text_for_llm(text: str) -> str: + """Clean text content to make it more suitable for LLM processing and search indexing.""" + if not text: + return "" + + # Remove SVG content (common in documentation) + text = re.sub(r"", "", text, flags=re.DOTALL | re.IGNORECASE) + + # Remove HTML comments + text = re.sub(r"", "", text, flags=re.DOTALL) 
def extract_directive_content(directive_block: str) -> str:
    """Extract meaningful content from MyST directive blocks.

    Fence lines (``:::{name}``, ```` ```{name} ````, and their closing
    fences) are removed, as are the ``:option: value`` lines that directly
    follow an opening fence. Everything else is kept.

    Args:
        directive_block: Raw text containing one or more directive blocks.

    Returns:
        The remaining content joined with newlines and stripped, or ``""``
        for empty input.
    """
    if not directive_block:
        return ""

    content_lines = []
    seen_directive = False  # a colon fence or directive header was seen
    in_options = False  # still inside the option lines right after a header

    for line in directive_block.split("\n"):
        stripped = line.strip()

        # Fence made only of colons: never content.
        if re.fullmatch(r":{3,}", stripped):
            seen_directive = True
            in_options = False
            continue

        # Bare closing code fence.
        if stripped == "```":
            in_options = False
            continue

        # Directive header: its option lines (if any) follow immediately.
        if stripped.startswith(":::") or stripped.startswith("```{"):
            seen_directive = True
            in_options = True
            continue

        if in_options:
            if re.match(r":[\w-]+:", stripped):
                continue  # directive option such as ":class: tip"
            # First non-option line ends the option section.
            in_options = False

        # Before any directive was seen, colon-prefixed lines are dropped.
        if not seen_directive and stripped.startswith(":"):
            continue

        content_lines.append(line)

    return "\n".join(content_lines).strip()
+ tech_terms = re.findall(r"\b[A-Z][a-zA-Z0-9_]*[a-z][a-zA-Z0-9_]*\b", content) + keywords.update(term.lower() for term in tech_terms) + + # Extract quoted terms (often important concepts) + quoted_terms = re.findall(r'["`]([^"`]{3,20})["`]', content) + for term in quoted_terms: + if re.match(r"^[a-zA-Z][a-zA-Z0-9_\-\s]*$", term): + keywords.add(term.lower().strip()) + + # Extract common patterns for documentation keywords + # Configuration keys, file extensions, command names + config_keys = re.findall(r"\b[a-z_]+[a-z0-9_]*\s*[:=]", content) + keywords.update(key.rstrip(":=").strip() for key in config_keys) + + # File extensions + extensions = re.findall(r"\.[a-z]{2,4}\b", content.lower()) + keywords.update(ext.lstrip(".") for ext in extensions) + + # Remove common stop words and very short terms + stop_words = { + "the", + "and", + "for", + "are", + "but", + "not", + "you", + "all", + "can", + "had", + "her", + "was", + "one", + "our", + "out", + "day", + "get", + "has", + "him", + "his", + "how", + "its", + "may", + "new", + "now", + "old", + "see", + "two", + "who", + "boy", + "did", + "she", + "use", + "way", + "what", + "when", + "will", + } + keywords = {kw for kw in keywords if len(kw) >= MIN_KEYWORD_LENGTH and kw not in stop_words} + + # Return sorted list, limited to reasonable number + return sorted(keywords)[:MAX_KEYWORDS_RETURNED] diff --git a/docs/_ext/json_output/core/__init__.py b/docs/_ext/json_output/core/__init__.py new file mode 100644 index 00000000..b1512c11 --- /dev/null +++ b/docs/_ext/json_output/core/__init__.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Core JSON output generation components.""" + +from .builder import JSONOutputBuilder +from .document_discovery import DocumentDiscovery +from .global_metadata import get_global_metadata +from .hierarchy_builder import HierarchyBuilder +from .json_formatter import JSONFormatter +from .json_writer import JSONWriter + +__all__ = [ + "DocumentDiscovery", + "HierarchyBuilder", + "JSONFormatter", + "JSONOutputBuilder", + "JSONWriter", + "get_global_metadata", +] diff --git a/docs/_ext/json_output/core/builder.py b/docs/_ext/json_output/core/builder.py new file mode 100644 index 00000000..2652b949 --- /dev/null +++ b/docs/_ext/json_output/core/builder.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class JSONOutputBuilder:
    """Entry point for JSON output generation.

    Holds the Sphinx app, environment and config, owns the cache, and
    forwards work to the discovery, formatting, writing and hierarchy
    components created in ``__init__``.
    """

    def __init__(self, app: Sphinx):
        self.app = app
        self.env = app.env
        self.config = app.config

        # Cache manager shared by the extraction helpers below
        self.cache = JSONOutputCache()

        # Components are created in this order; the hierarchy builder is
        # handed the discovery and formatter instances created before it.
        self.document_discovery = DocumentDiscovery(app, self)
        self.json_formatter = JSONFormatter(app, self)
        self.json_writer = JSONWriter(app)
        self.hierarchy_builder = HierarchyBuilder(
            app,
            self,
            self.document_discovery,
            self.json_formatter,
        )

    def should_generate_json(self, docname: str) -> bool:
        """Return whether JSON should be generated for ``docname``."""
        decision = should_generate_json(self.config, docname)
        return decision

    def needs_update(self, docname: str) -> bool:
        """Ask the cache whether ``docname`` has to be regenerated."""
        incremental = get_setting(self.config, "incremental_build", False)
        return self.cache.needs_update(docname, self.env.doc2path(docname), incremental)

    def mark_updated(self, docname: str) -> None:
        """Record ``docname`` as processed in the cache."""
        self.cache.mark_updated(docname, self.env.doc2path(docname))

    def extract_document_metadata(self, docname: str) -> dict[str, Any]:
        """Extract document metadata through the cache's ``with_cache_lock`` helper."""
        cache = self.cache
        return cache.with_cache_lock(
            _extract_document_metadata,
            self.env,
            docname,
            cache.get_metadata_cache(),
            cache.get_frontmatter_cache(),
        )

    def extract_document_content(self, docname: str) -> dict[str, Any]:
        """Extract document content through the cache's ``with_cache_lock`` helper."""
        cache = self.cache
        return cache.with_cache_lock(
            _extract_document_content,
            self.env,
            docname,
            cache.get_content_cache(),
        )

    def build_json_data(self, docname: str) -> dict[str, Any]:
        """Build the JSON data for ``docname``, including children for directory indexes."""
        # Base data comes from the formatter; the hierarchy builder then
        # adds children to that same dictionary in place.
        payload = self.json_formatter.build_json_data(docname)
        self.hierarchy_builder.add_children_to_data(payload, docname)
        return payload

    def write_json_file(self, docname: str, data: dict[str, Any]) -> None:
        """Write ``data`` for ``docname`` via the JSON writer."""
        self.json_writer.write_json_file(docname, data)

    # Delegate methods to maintain API compatibility
    def get_child_documents(self, parent_docname: str) -> list[str]:
        """Return the child documents of a parent directory index."""
        discovery = self.document_discovery
        return discovery.get_child_documents(parent_docname)

    def is_hidden_document(self, docname: str) -> bool:
        """Return whether ``docname`` is considered hidden."""
        discovery = self.document_discovery
        return discovery.is_hidden_document(docname)

    def get_all_documents_recursive(self) -> list[str]:
        """Return all non-hidden documents."""
        discovery = self.document_discovery
        return discovery.get_all_documents_recursive()

    def build_child_json_data(self, docname: str, include_content: bool | None = None) -> dict[str, Any]:
        """Build the JSON data used for a child document entry."""
        formatter = self.json_formatter
        return formatter.build_child_json_data(docname, include_content)
class DocumentDiscovery:
    """Handles document discovery, filtering, and hierarchical relationships."""

    def __init__(self, app: "Sphinx", json_builder: "JSONOutputBuilder"):
        """Store the Sphinx app, its environment/config, and the owning builder.

        Args:
            app: The running Sphinx application.
            json_builder: Main builder, used for metadata access and for the
                ``should_generate_json`` decision.
        """
        self.app = app
        self.env = app.env
        self.config = app.config
        self.json_builder = json_builder  # Reference to main builder for metadata access

    def get_child_documents(self, parent_docname: str) -> list[str]:
        """Get all child documents for a parent directory.

        Args:
            parent_docname: Docname of a directory index (``"index"`` or
                ``"<dir>/index"``).

        Returns:
            Sorted docnames of every visible document located under the
            parent's directory (all nesting levels), excluding the parent
            itself. Empty when ``parent_docname`` is not a directory index.
        """
        if parent_docname == "index":
            prefix = ""  # Root level - every document is a descendant
        elif parent_docname.endswith("/index"):
            # Keep the trailing "/" so "guide/index" does not match "guidebook/x"
            prefix = parent_docname[: -len("index")]
        else:
            # Not a directory index, no children
            return []

        # Cheap string checks run first; is_hidden_document() extracts metadata
        # and is therefore only evaluated for documents that are real candidates.
        children = [
            docname
            for docname in self.env.all_docs
            if docname != parent_docname
            and docname.startswith(prefix)
            and not self.is_hidden_document(docname)
        ]
        return sorted(children)

    def is_hidden_document(self, docname: str) -> bool:
        """Check if a document should be considered hidden.

        A document is hidden when its docname starts with one of the
        configured ``exclude_patterns``, when its metadata has a truthy
        ``hidden`` or ``draft`` entry, or when the builder reports that no
        JSON should be generated for it.
        """
        # Skip documents that match exclude patterns
        if any(docname.startswith(pattern) for pattern in get_setting(self.config, "exclude_patterns", [])):
            return True

        # Skip documents with 'hidden' or 'draft' in metadata
        metadata = self.json_builder.extract_document_metadata(docname)
        if metadata.get("hidden") or metadata.get("draft"):
            return True

        # Skip documents that wouldn't generate JSON
        return not self.json_builder.should_generate_json(docname)

    def get_all_documents_recursive(self) -> list[str]:
        """Get all non-hidden documents recursively (sorted by docname)."""
        return sorted(docname for docname in self.env.all_docs if not self.is_hidden_document(docname))

    def get_section_path(self, docname: str) -> list[str]:
        """Get hierarchical section path for navigation.

        Each path component of ``docname`` becomes a title-cased label with
        ``-`` and ``_`` turned into spaces; ``index`` and ``README``
        components are dropped.
        """
        return [
            part.replace("-", " ").replace("_", " ").title()
            for part in docname.split("/")
            if part not in ("index", "README")
        ]

    def detect_document_type(self, docname: str, title: str, content: str) -> str:
        """Detect document type for better search categorization.

        Rules are evaluated in priority order and the first match wins:
        docname/title keywords first, then the ``/index`` overview rule, then
        keywords found in the first 1000 characters of the lowercased content.

        Returns:
            One of ``tutorial``, ``guide``, ``reference``, ``example``,
            ``troubleshooting``, ``installation``, ``overview`` or the
            fallback ``documentation``.
        """
        docname_lower = docname.lower()
        title_lower = title.lower()
        content_lower = content.lower()[:1000]  # First 1000 chars

        def name_has(*needles: str) -> bool:
            return any(needle in docname_lower for needle in needles)

        def content_has(*needles: str) -> bool:
            return any(needle in content_lower for needle in needles)

        # Document type checks in priority order (lazily evaluated)
        type_checks = [
            ("tutorial", lambda: name_has("tutorial") or "tutorial" in title_lower),
            ("guide", lambda: name_has("guide") or "guide" in title_lower),
            ("reference", lambda: name_has("reference", "api")),
            # "example" also covers "examples"
            ("example", lambda: name_has("example")),
            ("troubleshooting", lambda: name_has("troubleshoot", "faq")),
            ("installation", lambda: name_has("install", "setup")),
            ("overview", lambda: docname.endswith("/index")),
            ("tutorial", lambda: content_has("$ ", "pip install", "docker run", "git clone")),
            ("reference", lambda: content_has("class ", "def ", "function", "method", "parameter")),
        ]

        for doc_type, matches in type_checks:
            if matches():
                return doc_type

        return "documentation"
def get_global_metadata(config: "Config") -> dict[str, Any]:
    """Build global metadata from Sphinx config settings.

    Combines explicit ``global_metadata`` settings with auto-inferred values
    from standard Sphinx configuration (project, release, html_context).
    Explicit values always win over inferred ones.

    Args:
        config: Sphinx configuration object

    Returns:
        Dictionary with global metadata (book, product, site sections);
        sections that end up empty are omitted.
    """
    settings = getattr(config, "json_output_settings", {})

    # Start with a private copy of explicit global_metadata so inference
    # never mutates the user's conf.py structure.
    global_meta = _deep_copy_dict(settings.get("global_metadata", {}))

    # Auto-infer if enabled
    if settings.get("infer_global_metadata", True):
        _infer_book_metadata(global_meta, config)
        _infer_product_metadata(global_meta, config)
        _infer_site_metadata(global_meta, config)

    # Remove empty sections
    return {k: v for k, v in global_meta.items() if v}


def _deep_copy_dict(d: dict) -> dict:
    """Create a deep copy of a nested dictionary.

    Dicts and lists are copied recursively (including dicts nested inside
    lists); every other value is reused as-is.
    """

    def _copy(value: Any) -> Any:
        if isinstance(value, dict):
            return {k: _copy(v) for k, v in value.items()}
        if isinstance(value, list):
            return [_copy(item) for item in value]
        return value

    return _copy(d)


def _infer_book_metadata(global_meta: dict, config: "Config") -> None:
    """Infer book metadata (title, version) from Sphinx config, in place."""
    book = global_meta.setdefault("book", {})

    # book.title from project
    if "title" not in book and hasattr(config, "project"):
        book["title"] = config.project

    # book.version from release
    if "version" not in book and hasattr(config, "release"):
        book["version"] = config.release


def _infer_product_metadata(global_meta: dict, config: "Config") -> None:
    """Infer product metadata (name, family, version) from Sphinx config, in place."""
    product = global_meta.setdefault("product", {})

    # Try to get from html_context first (explicit config)
    html_context = getattr(config, "html_context", {})

    # product.name
    if "name" not in product:
        if html_context.get("product_name"):
            product["name"] = html_context["product_name"]
        elif hasattr(config, "project"):
            product["name"] = _extract_product_name(config.project)

    # product.family (always stored as a list)
    if "family" not in product and html_context.get("product_family"):
        family = html_context["product_family"]
        product["family"] = family if isinstance(family, list) else [family]

    # product.version (can differ from book.version)
    if "version" not in product and hasattr(config, "release"):
        product["version"] = config.release


def _infer_site_metadata(global_meta: dict, config: "Config") -> None:
    """Infer site metadata from Sphinx config, in place."""
    html_context = getattr(config, "html_context", {})

    # Only add site section if we have data
    site_name = html_context.get("site_name")
    if site_name:
        global_meta.setdefault("site", {}).setdefault("name", site_name)


def _extract_product_name(project: str) -> str:
    """Extract product name from project string.

    Examples:
        'NVIDIA DORI' -> 'DORI'
        'NVIDIA NeMo Curator User Guide' -> 'NeMo Curator'
        'NeMo Framework Documentation' -> 'NeMo Framework'
        'NVIDIA Foo API Reference' -> 'Foo'

    Args:
        project: The Sphinx project name

    Returns:
        Extracted product name
    """
    # Remove NVIDIA prefix
    name = re.sub(r"^NVIDIA\s+", "", project, flags=re.IGNORECASE)

    # Remove common documentation suffixes. Multi-word suffixes must come
    # before the shorter suffix they end with: stripping "Reference" first
    # would turn "Foo API Reference" into "Foo API" and leave the
    # "API Reference" pattern with nothing to match.
    suffixes = [
        r"\s+User Guide$",
        r"\s+User Manual$",
        r"\s+Developer Guide$",
        r"\s+Documentation$",
        r"\s+API Reference$",
        r"\s+Reference Guide$",
        r"\s+Reference$",
        r"\s+Docs$",
    ]
    for suffix in suffixes:
        name = re.sub(suffix, "", name, flags=re.IGNORECASE)

    return name.strip()
class HierarchyBuilder:
    """Handles complex hierarchy building for indexes."""

    def __init__(
        self,
        app: "Sphinx",
        json_builder: "JSONOutputBuilder",
        document_discovery: "DocumentDiscovery",
        json_formatter: "JSONFormatter",
    ):
        """Store the collaborators used to discover and format child documents."""
        self.app = app
        self.config = app.config
        self.json_builder = json_builder
        self.document_discovery = document_discovery
        self.json_formatter = json_formatter

    def add_children_to_data(self, data: dict[str, Any], docname: str) -> None:
        """Add children documents to data structure for directory indexes.

        Does nothing unless ``include_children`` is enabled and ``docname`` is
        ``"index"`` or ends with ``"/index"``. ``data`` is modified in place.
        """
        include_children = get_setting(self.config, "include_children", True)
        if not include_children or not (docname == "index" or docname.endswith("/index")):
            return

        if docname == "index":
            self._handle_main_index(data, docname)
        else:
            self._handle_directory_index(data, docname)

    def _handle_main_index(self, data: dict[str, Any], docname: str) -> None:
        """Handle main index behavior: optimized for search index generation."""
        main_index_mode = get_setting(self.config, "main_index_mode", "full")
        max_main_index_docs = get_setting(self.config, "max_main_index_docs", 1000)

        if main_index_mode == "disabled":
            logger.info("Main index children disabled by configuration")
            data["children"] = []
            data["total_documents"] = 0
        elif main_index_mode == "metadata_only":
            self._build_metadata_only_index(data, docname, max_main_index_docs)
        else:  # 'full' mode - comprehensive search index
            self._build_full_search_index(data, docname, max_main_index_docs)

    def _build_metadata_only_index(self, data: dict[str, Any], docname: str, max_docs: int) -> None:
        """Build metadata-only search index for main index page."""
        self._build_search_index(
            data,
            docname,
            max_docs,
            index_label="metadata-only",
            failure_label="child metadata",
            child_kwargs={"include_content": False},
        )

    def _build_full_search_index(self, data: dict[str, Any], docname: str, max_docs: int) -> None:
        """Build comprehensive search index for main index page."""
        self._build_search_index(
            data,
            docname,
            max_docs,
            index_label="comprehensive",
            failure_label="child data",
            child_kwargs={},
        )

    def _build_search_index(  # noqa: PLR0913
        self,
        data: dict[str, Any],
        docname: str,
        max_docs: int,
        *,
        index_label: str,
        failure_label: str,
        child_kwargs: dict[str, Any],
    ) -> None:
        """Fill ``data`` with a flat search index of all discovered documents.

        Args:
            data: JSON structure of the main index page, modified in place
                (``_documents_array`` and ``total_documents`` are set).
            docname: Docname of the index page itself; it is left out of the array.
            max_docs: Maximum number of discovered documents to consider
                (0 = no limit).
            index_label: Index flavour used in log messages.
            failure_label: Noun used in the per-document failure warning.
            child_kwargs: Extra keyword arguments for ``build_child_json_data``.
        """
        logger.info(f"Building {index_label} search index for main index page...")

        # Discovery scans every document (and extracts its metadata), so run it
        # once and reuse the result for limiting and for the total count.
        discovered = self.document_discovery.get_all_documents_recursive()
        total_documents = len(discovered)

        # Apply document limit if set (0 = no limit)
        selected = discovered
        if max_docs > 0:
            selected = discovered[:max_docs]
            if total_documents > max_docs:
                logger.info(f"Limited to {max_docs} documents (set max_main_index_docs to 0 for no limit)")

        # Build flat array of documents for search index
        documents = []
        for child_docname in selected:
            if child_docname == docname:  # Don't include self
                continue
            try:
                documents.append(self.json_formatter.build_child_json_data(child_docname, **child_kwargs))
            except Exception as e:  # noqa: BLE001, PERF203
                # Best effort: one broken page must not abort the whole index
                logger.warning(f"Failed to build {failure_label} for {child_docname}: {e}")

        # Store as flat array - will be output as array at root level
        data["_documents_array"] = documents
        data["total_documents"] = total_documents

        logger.info(f"Generated {index_label} search index with {len(documents)} documents")

    def _handle_directory_index(self, data: dict[str, Any], docname: str) -> None:
        """Handle directory index: attach the documents returned by ``get_child_documents``."""
        children = self.document_discovery.get_child_documents(docname)
        data["children"] = []

        for child_docname in children:
            try:
                child_data = self.json_formatter.build_child_json_data(child_docname)
                data["children"].append(child_data)
            except Exception as e:  # noqa: BLE001, PERF203
                logger.warning(f"Failed to build child data for {child_docname}: {e}")

        logger.debug(f"Included {len(data['children'])} child documents for {docname}")
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""JSON data formatting and structure building.""" + +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +from docutils import nodes +from sphinx.application import Sphinx +from sphinx.util import logging + +from ..utils import get_document_url, get_setting +from .document_discovery import DocumentDiscovery +from .global_metadata import get_global_metadata + +if TYPE_CHECKING: + from .builder import JSONOutputBuilder + +logger = logging.getLogger(__name__) + + +class JSONFormatter: + """Handles JSON data structure building and formatting.""" + + def __init__(self, app: Sphinx, json_builder: "JSONOutputBuilder"): + self.app = app + self.env = app.env + self.config = app.config + self.json_builder = json_builder + self._global_metadata: dict[str, Any] | None = None + + @property + def global_metadata(self) -> dict[str, Any]: + """Get cached global metadata from conf.py.""" + if self._global_metadata is None: + self._global_metadata = get_global_metadata(self.config) + return self._global_metadata + + def add_metadata_fields(self, data: dict[str, Any], metadata: dict[str, Any]) -> None: + """Add all metadata fields to JSON data structure. + + Supports both new nested schema and legacy flat fields for backwards compatibility. 
+ New schema: topics, tags, industry, content.type, content.learning_level, content.audience, facets.modality + Legacy schema: categories, personas, difficulty, content_type, modality + """ + # Basic metadata fields + if metadata.get("description"): + data["description"] = metadata["description"] + + # Tags (same in both schemas) + if metadata.get("tags"): + data["tags"] = metadata["tags"] if isinstance(metadata["tags"], list) else [metadata["tags"]] + + # Topics (new schema) or categories (legacy) + topics = metadata.get("topics") or metadata.get("categories") + if topics: + data["topics"] = topics if isinstance(topics, list) else [topics] + + # Industry verticals + if metadata.get("industry"): + industry = metadata["industry"] + data["industry"] = industry if isinstance(industry, list) else [industry] + + if metadata.get("author"): + data["author"] = metadata["author"] + + # Content classification - support nested and flat structures + content = metadata.get("content", {}) + + # Content type: content.type (new) or content_type (legacy) + content_type = content.get("type") if isinstance(content, dict) else None + content_type = content_type or metadata.get("content_type") + if content_type: + data["content_type"] = content_type + + # Learning level: content.learning_level (new) or content.difficulty/difficulty (legacy) + learning_level = content.get("learning_level") if isinstance(content, dict) else None + learning_level = learning_level or content.get("difficulty") if isinstance(content, dict) else None + learning_level = learning_level or metadata.get("learning_level") or metadata.get("difficulty") + if learning_level: + data["learning_level"] = learning_level + + # Audience: content.audience (new) or personas (legacy) + audience = content.get("audience") if isinstance(content, dict) else None + audience = audience or metadata.get("personas") + if audience: + data["audience"] = audience if isinstance(audience, list) else [audience] + + # Keywords from 
frontmatter (takes priority over auto-extraction) + if metadata.get("keywords"): + keywords = metadata["keywords"] + data["keywords"] = keywords if isinstance(keywords, list) else [keywords] + + # Product-specific facets - dynamically extract all facet keys + facets = metadata.get("facets", {}) + if isinstance(facets, dict) and facets: + # Include all facets as a nested object + data["facets"] = facets + # Also flatten facets to top level for backwards compatibility and easier filtering + for facet_key, facet_value in facets.items(): + data[facet_key] = facet_value + + # Legacy flat modality support (if not already set via facets) + if "modality" not in data and metadata.get("modality"): + data["modality"] = metadata["modality"] + + # Content gating + if metadata.get("only"): + data["only"] = metadata["only"] + + def build_child_json_data(self, docname: str, include_content: bool | None = None) -> dict[str, Any]: + """Build optimized JSON data for child documents (LLM/search focused).""" + if include_content is None: + include_content = get_setting(self.config, "include_child_content", True) + + # Get document title + title = self.env.titles.get(docname, nodes.title()).astext() if docname in self.env.titles else "" + + # Extract metadata for tags/categories + metadata = self.json_builder.extract_document_metadata(docname) + content_data = self.json_builder.extract_document_content(docname) if include_content else {} + + # Build optimized data structure for search engines + data = { + "id": docname, # Use 'id' for search engines + "title": title, + "url": get_document_url(self.app, docname), + } + + # Add global metadata from conf.py (book, product, site) + self._add_global_metadata(data) + + # Add metadata fields from frontmatter + self.add_metadata_fields(data, metadata) + + # Add search-specific fields + if include_content: + self._add_content_fields(data, content_data, docname, title) + + return data + + def build_json_data(self, docname: str) -> dict[str, Any]: 
+ """Build optimized JSON data structure for LLM/search use cases.""" + # Get document title + title = self.env.titles.get(docname, nodes.title()).astext() if docname in self.env.titles else "" + + # Extract metadata and content + metadata = self.json_builder.extract_document_metadata(docname) + content_data = self.json_builder.extract_document_content(docname) + + # Build data structure + data = { + "id": docname, + "title": title, + "url": get_document_url(self.app, docname), + "last_modified": datetime.now(timezone.utc).isoformat(), + } + + # Add global metadata from conf.py (book, product, site) + self._add_global_metadata(data) + + # Add metadata fields from frontmatter + self.add_metadata_fields(data, metadata) + + # Add content + if content_data.get("content"): + data["content"] = content_data["content"] + data["format"] = content_data.get("format", "text") + + if content_data.get("summary"): + data["summary"] = content_data["summary"] + + if content_data.get("headings"): + data["headings"] = [{"text": h["text"], "level": h["level"]} for h in content_data["headings"]] + + return data + + def _add_global_metadata(self, data: dict[str, Any]) -> None: + """Inject global site/book/product metadata from conf.py.""" + for key, value in self.global_metadata.items(): + if value: # Only add non-empty values + data[key] = value + + def _add_content_fields(self, data: dict[str, Any], content_data: dict[str, Any], docname: str, title: str) -> None: + """Add content-related fields to JSON data.""" + self._add_primary_content(data, content_data) + self._add_summary_content(data, content_data) + self._add_headings_content(data, content_data) + self._add_optional_features(data, content_data) + self._add_document_metadata(data, content_data, docname, title) + + def _add_primary_content(self, data: dict[str, Any], content_data: dict[str, Any]) -> None: + """Add primary content with length limits.""" + if not content_data.get("content"): + return + + content_max_length = 
get_setting(self.config, "content_max_length", 50000) + content = content_data["content"] + + if content_max_length > 0 and len(content) > content_max_length: + content = content[:content_max_length] + "..." + + data["content"] = content + data["format"] = content_data.get("format", "text") + data["content_length"] = len(content_data["content"]) # Original length + data["word_count"] = len(content_data["content"].split()) if content_data["content"] else 0 + + def _add_summary_content(self, data: dict[str, Any], content_data: dict[str, Any]) -> None: + """Add summary with length limits.""" + if not content_data.get("summary"): + return + + summary_max_length = get_setting(self.config, "summary_max_length", 500) + summary = content_data["summary"] + + if summary_max_length > 0 and len(summary) > summary_max_length: + summary = summary[:summary_max_length] + "..." + + data["summary"] = summary + + def _add_headings_content(self, data: dict[str, Any], content_data: dict[str, Any]) -> None: + """Add headings for structure/navigation.""" + if not content_data.get("headings"): + return + + # Simplify headings for LLM use + data["headings"] = [ + {"text": h["text"], "level": h["level"], "id": h.get("id", "")} for h in content_data["headings"] + ] + # Add searchable heading text + data["headings_text"] = " ".join([h["text"] for h in content_data["headings"]]) + + def _add_optional_features(self, data: dict[str, Any], content_data: dict[str, Any]) -> None: + """Add optional search enhancement features.""" + # Keywords: frontmatter takes priority, then auto-extraction + if "keywords" not in data: # Not already set from frontmatter + if get_setting(self.config, "extract_keywords", True) and "keywords" in content_data: + keywords_max_count = get_setting(self.config, "keywords_max_count", 50) + keywords = ( + content_data["keywords"][:keywords_max_count] + if keywords_max_count > 0 + else content_data["keywords"] + ) + data["keywords"] = keywords + + if get_setting(self.config, 
"extract_code_blocks", True) and "code_blocks" in content_data: + data["code_blocks"] = content_data["code_blocks"] + + if get_setting(self.config, "extract_links", True) and "links" in content_data: + data["links"] = content_data["links"] + + if get_setting(self.config, "extract_images", True) and "images" in content_data: + data["images"] = content_data["images"] + + def _add_document_metadata( + self, data: dict[str, Any], content_data: dict[str, Any], docname: str, title: str + ) -> None: + """Add document type and section metadata.""" + if get_setting(self.config, "include_doc_type", True): + discovery = DocumentDiscovery(self.app, self.json_builder) + data["doc_type"] = discovery.detect_document_type(docname, title, content_data.get("content", "")) + + if get_setting(self.config, "include_section_path", True): + discovery = DocumentDiscovery(self.app, self.json_builder) + data["section_path"] = discovery.get_section_path(docname) diff --git a/docs/_ext/json_output/core/json_writer.py b/docs/_ext/json_output/core/json_writer.py new file mode 100644 index 00000000..14eea68d --- /dev/null +++ b/docs/_ext/json_output/core/json_writer.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class JSONWriter:
    """Handles JSON file writing operations."""

    def __init__(self, app: Sphinx):
        """Remember the Sphinx app (for ``outdir``) and its config."""
        self.app = app
        self.config = app.config

    def write_json_file(self, docname: str, data: dict[str, Any]) -> None:
        """Write JSON data to file.

        The target path mirrors the docname under the build output directory;
        directory indexes are written as ``<dir>/index.json``. Any failure is
        logged with its traceback and not re-raised.
        """
        try:
            json_path = self._resolve_json_path(docname)
            json_path.parent.mkdir(parents=True, exist_ok=True)

            is_array_index = docname == "index" and "_documents_array" in data
            if is_array_index:
                # For main index.json, output as array of page objects
                self._write_array_index(json_path, data)
            elif get_setting(self.config, "separate_content", False) and "content" in data:
                # Handle separate content files option
                self._write_separate_content(json_path, data)
            else:
                self._write_single_file(json_path, data)

            logger.debug(f"Generated JSON: {json_path}")

        except Exception:
            logger.exception(f"Failed to write JSON for {docname}")

    def _resolve_json_path(self, docname: str) -> Path:
        """Map a docname to its JSON output path below ``app.outdir``."""
        root = Path(self.app.outdir)
        if docname == "index":
            return root / "index.json"
        if docname.endswith("/index"):
            return root / docname[: -len("/index")] / "index.json"
        return root / f"{docname}.json"

    def _write_array_index(self, json_path: Path, data: dict[str, Any]) -> None:
        """Write main index.json as an array of page objects for search engines."""
        # Only the documents array is written; it becomes the root-level value
        documents = data.get("_documents_array", [])
        self._write_json_data(json_path, documents)
        logger.info(f"Generated search index array with {len(documents)} documents")

    def _write_separate_content(self, json_path: Path, data: dict[str, Any]) -> None:
        """Write content to separate file when separate_content is enabled."""
        # Content goes to a sibling "<name>.content.json" file
        content_path = json_path.with_suffix(".content.json")
        self._write_json_data(
            content_path,
            {
                "id": data["id"],
                "content": data["content"],
                "format": data.get("format", "text"),
                "content_length": data.get("content_length", 0),
                "word_count": data.get("word_count", 0),
            },
        )

        # Main file: everything except the content, plus a reference to it
        main_data = {key: value for key, value in data.items() if key != "content"}
        main_data["content_file"] = str(content_path.name)
        self._write_json_data(json_path, main_data)

    def _write_single_file(self, json_path: Path, data: dict[str, Any]) -> None:
        """Write all data to a single JSON file."""
        self._write_json_data(json_path, data)

    def _write_json_data(self, file_path: Path, data: dict[str, Any]) -> None:
        """Write JSON data to file with appropriate formatting."""
        if get_setting(self.config, "minify_json", False):
            dump_options = {"separators": (",", ":")}  # compact output
        else:
            dump_options = {"indent": 2}  # human-readable output

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, **dump_options)
+ +"""Processing pipeline and orchestration components.""" + +from .cache import JSONOutputCache +from .processor import on_build_finished, process_document, process_documents_parallel, process_documents_sequential + +__all__ = [ + "JSONOutputCache", + "on_build_finished", + "process_document", + "process_documents_parallel", + "process_documents_sequential", +] diff --git a/docs/_ext/json_output/processing/cache.py b/docs/_ext/json_output/processing/cache.py new file mode 100644 index 00000000..bc397dcf --- /dev/null +++ b/docs/_ext/json_output/processing/cache.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class JSONOutputCache:
    """Manages caching and incremental builds for JSON output.

    All instances share the same class-level dictionaries, so data cached by
    one instance is visible to every other instance in the process.
    """

    # Class-level shared caches with thread safety
    _shared_cache_lock = Lock()
    _shared_metadata_cache: ClassVar[dict[str, Any]] = {}
    _shared_frontmatter_cache: ClassVar[dict[str, Any]] = {}
    _shared_content_cache: ClassVar[dict[str, Any]] = {}
    _file_timestamps: ClassVar[dict[str, float]] = {}  # Track file modification times

    def __init__(self):
        """Initialize cache instance with shared caches."""
        with self._shared_cache_lock:
            self._metadata_cache = self._shared_metadata_cache
            self._frontmatter_cache = self._shared_frontmatter_cache
            self._content_cache = self._shared_content_cache
            self._timestamps = self._file_timestamps

    def get_metadata_cache(self) -> dict[str, Any]:
        """Get the metadata cache."""
        return self._metadata_cache

    def get_frontmatter_cache(self) -> dict[str, Any]:
        """Get the frontmatter cache."""
        return self._frontmatter_cache

    def get_content_cache(self) -> dict[str, Any]:
        """Get the content cache."""
        return self._content_cache

    def needs_update(self, docname: str, source_path: Path, incremental_enabled: bool = False) -> bool:
        """Check if document needs to be updated based on modification time.

        This is a pure query: it never records a timestamp. A document only
        counts as processed once ``mark_updated`` has been called for it, so a
        document whose processing failed is reported as needing an update
        again on the next pass.

        Args:
            docname: Document name used as the timestamp key.
            source_path: Source file of the document.
            incremental_enabled: When False every document needs an update.

        Returns:
            True when the document must be (re)processed.
        """
        if not incremental_enabled:
            return True  # Process all files if incremental build is disabled

        try:
            if not source_path or not source_path.exists():
                return True

            recorded_mtime = self._timestamps.get(docname)
            if recorded_mtime is None:
                # Never successfully processed in this process
                return True

            return source_path.stat().st_mtime > recorded_mtime

        except Exception as e:  # noqa: BLE001
            logger.debug(f"Error checking modification time for {docname}: {e}")
            return True  # Process if we can't determine modification time

    def mark_updated(self, docname: str, source_path: Path) -> None:
        """Mark document as processed with current timestamp.

        Best effort: a missing source file or a stat failure leaves the
        recorded timestamp untouched.
        """
        try:
            if source_path and source_path.exists():
                self._timestamps[docname] = source_path.stat().st_mtime
        except Exception:  # noqa: BLE001
            logger.debug(f"Could not update timestamp for {docname}")

    def clear_caches(self) -> None:
        """Clear all caches (useful for testing or memory cleanup)."""
        with self._shared_cache_lock:
            self._metadata_cache.clear()
            self._frontmatter_cache.clear()
            self._content_cache.clear()
            self._timestamps.clear()

    def get_cache_stats(self) -> dict[str, int]:
        """Get cache statistics for debugging."""
        return {
            "metadata_cache_size": len(self._metadata_cache),
            "frontmatter_cache_size": len(self._frontmatter_cache),
            "content_cache_size": len(self._content_cache),
            "timestamps_size": len(self._timestamps),
        }

    def with_cache_lock(self, func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:  # noqa: ANN401
        """Execute function with cache lock held.

        The lock is a plain (non-reentrant) ``threading.Lock``: ``func`` must
        not call back into a method that takes the same lock
        (e.g. ``clear_caches``).
        """
        with self._shared_cache_lock:
            return func(*args, **kwargs)
def on_build_finished(app: Sphinx, exception: Exception | None) -> None:
    """Generate JSON files after the HTML build is complete.

    Args:
        app: The running Sphinx application.
        exception: The exception that ended the build, or None on success.
            Nothing is generated when the build failed.
    """
    if exception is not None:
        return

    verbose = get_setting(app.config, "verbose", False)
    # Progress messages go to INFO only in verbose mode, otherwise to DEBUG.
    log_func = logger.info if verbose else logger.debug
    log_func("Generating JSON output files...")

    # Setup and validation
    json_builder = _setup_json_builder(app)
    if not json_builder:
        return

    # Get and filter documents
    all_docs = _filter_documents(app, json_builder, log_func)

    # Process documents
    generated_count, failed_count = _process_documents(app, json_builder, all_docs, log_func)

    # Final logging
    _log_results(log_func, generated_count, failed_count)


def _setup_json_builder(app: Sphinx) -> JSONOutputBuilder | None:
    """Create the JSON builder, returning None (after logging) if that fails."""
    validate_content_gating_integration(app)

    try:
        return JSONOutputBuilder(app)
    except Exception:
        logger.exception("Failed to initialize JSONOutputBuilder")
        return None


def _filter_documents(app: Sphinx, json_builder: JSONOutputBuilder, log_func: Callable[[str], None]) -> list[str]:
    """Filter documents based on gating, incremental build, and size limits.

    Returns the docnames that remain after the three filters have been
    applied in that order.
    """
    all_docs, gated_docs = _get_initial_documents(app, json_builder)

    if gated_docs:
        log_func(f"Content gating: excluding {len(gated_docs)} documents from JSON generation")
        if get_setting(app.config, "verbose", False):
            logger.debug(f"Gated documents: {', '.join(sorted(gated_docs))}")

    all_docs = _apply_incremental_filtering(app, json_builder, all_docs, log_func)
    return _apply_size_filtering(app, all_docs, log_func)


def _get_initial_documents(app: Sphinx, json_builder: JSONOutputBuilder) -> tuple[list[str], list[str]]:
    """Split ``app.env.all_docs`` into (processable, gated) docname lists.

    A document is processable when ``json_builder.should_generate_json``
    accepts it; every other document is reported as gated.
    """
    all_docs: list[str] = []
    gated_docs: list[str] = []

    for docname in app.env.all_docs:
        target = all_docs if json_builder.should_generate_json(docname) else gated_docs
        target.append(docname)

    return all_docs, gated_docs


def _apply_incremental_filtering(
    app: Sphinx, json_builder: JSONOutputBuilder, all_docs: list[str], log_func: Callable[[str], None]
) -> list[str]:
    """Keep only documents the builder reports as changed, if enabled.

    With the ``incremental_build`` setting off (the default) the input list
    is returned unchanged.
    """
    if not get_setting(app.config, "incremental_build", False):
        return all_docs

    incremental_docs = [docname for docname in all_docs if json_builder.needs_update(docname)]
    skipped_count = len(all_docs) - len(incremental_docs)
    if skipped_count > 0:
        log_func(f"Incremental build: skipping {skipped_count} unchanged files")
    return incremental_docs


def _apply_size_filtering(app: Sphinx, all_docs: list[str], log_func: Callable[[str], None]) -> list[str]:
    """Drop documents whose source file exceeds ``skip_large_files`` bytes.

    A non-positive (or unset) limit disables the filter. Documents whose
    size cannot be determined are kept.
    """
    # Local import: this module does not import pathlib at file level.
    from pathlib import Path

    skip_large_files = get_setting(app.config, "skip_large_files", 0) or 0
    if skip_large_files <= 0:
        return all_docs

    filtered_docs = []
    for docname in all_docs:
        try:
            # doc2path may hand back a plain str depending on the Sphinx
            # version; Path() accepts both and lets us stat exactly once.
            size = Path(app.env.doc2path(docname)).stat().st_size
        except Exception:  # noqa: BLE001, PERF203
            filtered_docs.append(docname)  # Include if we can't check size
            continue

        if size <= skip_large_files:
            filtered_docs.append(docname)
        else:
            log_func(f"Skipping large file: {docname} ({size} bytes)")
    return filtered_docs


def _process_documents(
    app: Sphinx, json_builder: JSONOutputBuilder, all_docs: list[str], log_func: Callable[[str], None]
) -> tuple[int, int]:
    """Process documents in parallel or sequentially per the ``parallel`` setting.

    Returns ``(generated_count, failed_count)``.
    """
    if get_setting(app.config, "parallel", False):
        return process_documents_parallel(json_builder, all_docs, app.config, log_func)
    return process_documents_sequential(json_builder, all_docs)


def _log_results(log_func: Callable[[str], None], generated_count: int, failed_count: int) -> None:
    """Log final processing results; failures are always logged as warnings."""
    log_func(f"Generated {generated_count} JSON files")
    if failed_count > 0:
        logger.warning(f"Failed to generate {failed_count} JSON files")


def process_documents_parallel(
    json_builder: JSONOutputBuilder, all_docs: list[str], config: Config, log_func: Callable[[str], None]
) -> tuple[int, int]:
    """Process documents in parallel batches on a thread pool.

    Settings read from ``config``:
        parallel_workers: ``"auto"`` (CPU count, at most 8) or a number
            (at most 16). Values below 1 are raised to 1.
        batch_size: documents submitted per batch (default 50). Values below
            1 fall back to the default instead of raising.

    Returns:
        ``(generated_count, failed_count)``.
    """
    if not all_docs:
        return 0, 0

    parallel_workers = get_setting(config, "parallel_workers", "auto")
    if parallel_workers == "auto":
        cpu_count = multiprocessing.cpu_count() or 1
        max_workers = min(cpu_count, 8)  # Limit to 8 threads max
    else:
        max_workers = min(int(parallel_workers), 16)  # Cap at 16 for safety
    # ThreadPoolExecutor rejects max_workers <= 0.
    max_workers = max(1, max_workers)

    batch_size = int(get_setting(config, "batch_size", 50) or 0)
    if batch_size < 1:
        batch_size = 50  # range() below rejects a zero step

    total_batches = (len(all_docs) - 1) // batch_size + 1
    generated_count = 0
    failed_count = 0

    # One pool serves every batch; each batch is fully drained before the
    # next one is submitted, which keeps the number of pending results (and
    # therefore memory usage) bounded by batch_size.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for start in range(0, len(all_docs), batch_size):
            batch_docs = all_docs[start : start + batch_size]
            log_func(f"Processing batch {start // batch_size + 1}/{total_batches} ({len(batch_docs)} docs)")

            futures = {executor.submit(process_document, json_builder, docname): docname for docname in batch_docs}

            for future, docname in futures.items():
                try:
                    if future.result():
                        generated_count += 1
                    else:
                        failed_count += 1
                except Exception:  # noqa: PERF203
                    logger.exception(f"Error generating JSON for {docname}")
                    failed_count += 1

    return generated_count, failed_count


def process_documents_sequential(json_builder: JSONOutputBuilder, all_docs: list[str]) -> tuple[int, int]:
    """Process documents one after another.

    Delegates to :func:`process_document` so that sequential and parallel
    runs behave the same, including marking successfully written documents
    as updated for incremental builds.

    Returns:
        ``(generated_count, failed_count)``.
    """
    generated_count = 0
    failed_count = 0

    for docname in all_docs:
        if process_document(json_builder, docname):
            generated_count += 1
        else:
            failed_count += 1

    return generated_count, failed_count


def process_document(json_builder: JSONOutputBuilder, docname: str) -> bool:
    """Build and write the JSON for one document.

    Returns:
        True when the file was written and the document marked as updated;
        False when any step raised (the error is logged, not propagated).
    """
    try:
        json_data = json_builder.build_json_data(docname)
        json_builder.write_json_file(docname, json_data)
        json_builder.mark_updated(docname)  # Mark as processed for incremental builds
    except Exception:
        logger.exception(f"Error generating JSON for {docname}")
        return False
    else:
        return True
def validate_content_gating_integration(app: Sphinx) -> None:
    """Log diagnostics about content gating, exclude patterns, and build tags.

    Purely informational: it only inspects ``app`` and writes log records.
    """
    # Check if content_gating extension is loaded
    if "content_gating" in app.extensions:
        logger.info("Content gating extension detected - JSON output will respect content gating rules")
    else:
        logger.debug("Content gating extension not detected - JSON output will process all documents")

    # Log current exclude patterns for debugging
    exclude_patterns = getattr(app.config, "exclude_patterns", [])
    if exclude_patterns:
        logger.debug(f"Current exclude patterns: {exclude_patterns}")

    # Check current build tags for debugging
    if hasattr(app, "tags"):
        try:
            current_tags = set(app.tags)
            if current_tags:
                logger.info(f"Active build tags: {current_tags}")
            else:
                logger.info("No build tags active")
        except (TypeError, AttributeError):
            # app.tags was not iterable in the way set() needs.
            logger.debug("Could not determine active build tags")


def get_setting(config: Config, key: str, default: Any = None) -> Any:  # noqa: ANN401
    """Get a setting from json_output_settings with fallback to old config names.

    Lookup order:
        1. ``config.json_output_settings[key]`` when the key is present.
        2. The legacy ``json_output_*`` attribute mapped to ``key``, when
           ``config`` has it.
        3. ``default``.

    A missing or ``None`` ``json_output_settings`` is treated as empty.
    """
    settings = getattr(config, "json_output_settings", None) or {}

    # Try new settings format first
    if key in settings:
        return settings[key]

    # Fallback to old config names for backward compatibility
    old_config_map = {
        "enabled": "json_output_enabled",
        "exclude_patterns": "json_output_exclude_patterns",
        "verbose": "json_output_verbose",
        "parallel": "json_output_parallel",
        "include_children": "json_output_include_children",
        "include_child_content": "json_output_include_child_content",
        "main_index_mode": "json_output_main_index_mode",
        "max_main_index_docs": "json_output_max_main_index_docs",
    }

    old_key = old_config_map.get(key)
    if old_key and hasattr(config, old_key):
        return getattr(config, old_key)

    return default


def is_content_gated(config: Config, docname: str) -> bool:
    """
    Check if a document is content gated by checking Sphinx's exclude_patterns.
    This works with the content_gating extension that adds restricted documents
    to exclude_patterns during config-inited event.

    The docname is tested as ``<docname>.md``, ``<docname>.rst`` and bare
    ``<docname>`` against every string pattern using ``fnmatch`` globbing;
    the first match makes the document gated.
    """
    sphinx_exclude_patterns = getattr(config, "exclude_patterns", [])
    if not sphinx_exclude_patterns:
        return False

    # Convert docname to potential file paths that might be in exclude_patterns
    possible_paths = [docname + ".md", docname + ".rst", docname]

    for possible_path in possible_paths:
        # Check if this path matches any exclude pattern using fnmatch (supports glob patterns)
        for pattern in sphinx_exclude_patterns:
            if isinstance(pattern, str) and fnmatch.fnmatch(possible_path, pattern):
                logger.debug(f"Document {docname} is content gated (matches pattern: {pattern})")
                return True

    return False


def should_generate_json(config: Config, docname: str) -> bool:
    """Check if JSON should be generated for this document.

    Returns False when the extension is disabled, ``docname`` is not a
    non-empty string, the document is content gated, or ``docname`` starts
    with one of the extension's own ``exclude_patterns`` (prefix match).
    """
    if not get_setting(config, "enabled", True):
        return False

    if not docname or not isinstance(docname, str):
        logger.warning(f"Invalid docname for JSON generation: {docname}")
        return False

    # CRITICAL: Check content gating first - if document is content gated, don't generate JSON
    if is_content_gated(config, docname):
        logger.info(f"Excluding {docname} from JSON generation due to content gating")
        return False

    # Check JSON output extension's own exclude patterns
    for pattern in get_setting(config, "exclude_patterns", []) or []:
        # An empty pattern is a prefix of every docname; ignore it rather
        # than silently excluding the whole site.
        if isinstance(pattern, str) and pattern and docname.startswith(pattern):
            return False

    return True


def get_document_url(app: Sphinx, docname: str) -> str:
    """Get the URL for a document.

    Uses ``app.builder.get_target_uri`` when available; if that is missing
    or raises, falls back to ``<docname>.html``. An invalid docname yields
    ``"invalid.html"``.
    """
    if not docname or not isinstance(docname, str):
        logger.warning(f"Invalid docname for URL generation: {docname}")
        return "invalid.html"

    try:
        if hasattr(app.builder, "get_target_uri"):
            return app.builder.get_target_uri(docname)
    except Exception as e:  # noqa: BLE001
        logger.warning(f"Failed to get target URI for {docname}: {e}")

    return docname + ".html"
+ +""" +Enhanced Search Extension for Sphinx +Provides enhanced search page functionality without interfering with default search +""" + +import os +import re +import shutil +from typing import Any + +from sphinx.application import Sphinx +from sphinx.config import Config +from sphinx.util import logging + +logger = logging.getLogger(__name__) + + +def bundle_javascript_modules(extension_dir: str, output_path: str, minify: bool = False) -> None: + """Bundle all JavaScript modules into a single file.""" + + # Define the module loading order (dependencies first) + module_files = [ + ("modules", "Utils.js"), + ("modules", "DocumentLoader.js"), + ("modules", "SearchEngine.js"), + ("modules", "SearchInterface.js"), + ("modules", "ResultRenderer.js"), + ("modules", "EventHandler.js"), + ("modules", "SearchPageManager.js"), + ("", "main.js"), # Main file in root + ] + + bundled_content = [] + bundled_content.append("// Enhanced Search Bundle - Generated automatically") + bundled_content.append( + "// Contains: Utils, DocumentLoader, SearchEngine, SearchInterface, ResultRenderer, EventHandler, SearchPageManager, main" + ) + bundled_content.append("") + + for subdir, filename in module_files: + if subdir: + module_path = os.path.join(extension_dir, subdir, filename) + else: + module_path = os.path.join(extension_dir, filename) + + if os.path.exists(module_path): + with open(module_path, encoding="utf-8") as f: + content = f.read() + + # Remove module loading code since everything is bundled + content = content.replace("await this.loadModules();", "// Modules bundled - no loading needed") + content = content.replace( + "await this.loadModuleWithFallback(name)", "// Modules bundled - no loading needed" + ) + + # Simple minification if requested + if minify: + # Remove extra whitespace and comments (basic minification) + # Remove single-line comments but preserve URLs + content = re.sub(r"^\s*//.*$", "", content, flags=re.MULTILINE) + # Remove multi-line comments + content = 
re.sub(r"/\*.*?\*/", "", content, flags=re.DOTALL) + # Remove extra whitespace + content = re.sub(r"\n\s*\n", "\n", content) + content = re.sub(r"^\s+", "", content, flags=re.MULTILINE) + + bundled_content.append(f"// === {filename} ===") + bundled_content.append(content) + bundled_content.append("") + + logger.info(f"Bundled: {filename}") + else: + logger.warning(f"Module not found for bundling: {module_path}") + + # Write the bundled file + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(bundled_content)) + + file_size = os.path.getsize(output_path) + size_kb = file_size / 1024 + logger.info(f"Enhanced Search JavaScript bundle created: {output_path} ({size_kb:.1f}KB)") + + +def add_template_path(_app: Sphinx, config: Config) -> None: + """Add template path during config initialization.""" + extension_dir = os.path.dirname(os.path.abspath(__file__)) + templates_path = os.path.join(extension_dir, "templates") + + if os.path.exists(templates_path): + # Ensure templates_path is a list + if not isinstance(config.templates_path, list): + config.templates_path = list(config.templates_path) if config.templates_path else [] + + # Add our template path if not already present + if templates_path not in config.templates_path: + config.templates_path.append(templates_path) + logger.info(f"Enhanced search templates added: {templates_path}") + + +def copy_assets(app: Sphinx, exc: Exception | None) -> None: + """Copy assets to _static after build.""" + if exc is not None: # Only run if build succeeded + return + + extension_dir = os.path.dirname(os.path.abspath(__file__)) + static_path = os.path.join(app.outdir, "_static") + os.makedirs(static_path, exist_ok=True) + + # Copy CSS file + css_file = os.path.join(extension_dir, "enhanced-search.css") + if os.path.exists(css_file): + shutil.copy2(css_file, os.path.join(static_path, "enhanced-search.css")) + logger.info("Enhanced search CSS copied") + + # Copy main JavaScript file + main_js = 
os.path.join(extension_dir, "main.js") + if os.path.exists(main_js): + shutil.copy2(main_js, os.path.join(static_path, "main.js")) + logger.info("Enhanced search main.js copied") + + # Copy module files + modules_dir = os.path.join(extension_dir, "modules") + if os.path.exists(modules_dir): + modules_static_dir = os.path.join(static_path, "modules") + os.makedirs(modules_static_dir, exist_ok=True) + for module_file in os.listdir(modules_dir): + if module_file.endswith(".js"): + shutil.copy2(os.path.join(modules_dir, module_file), os.path.join(modules_static_dir, module_file)) + logger.info("Enhanced search modules copied") + + +def copy_assets_early(app: Sphinx, _docname: str, _source: list[str]) -> None: + """Copy bundled assets to _static early in the build process.""" + # Only copy once - use a flag to prevent multiple copies + if hasattr(app, "_search_assets_copied"): + return + + extension_dir = os.path.dirname(os.path.abspath(__file__)) + static_path = os.path.join(app.outdir, "_static") + os.makedirs(static_path, exist_ok=True) + + # Copy CSS file + css_file = os.path.join(extension_dir, "enhanced-search.css") + if os.path.exists(css_file): + shutil.copy2(css_file, os.path.join(static_path, "enhanced-search.css")) + logger.info("Enhanced search CSS copied") + + # Create bundled JavaScript file instead of copying individual modules + bundle_path = os.path.join(static_path, "search-assets.bundle.js") + bundle_javascript_modules(extension_dir, bundle_path) + + # Mark as copied + app._search_assets_copied = True + + +def setup(app: Sphinx) -> dict[str, Any]: + """Setup the enhanced search extension.""" + + # Get the directory where this extension is located + extension_dir = os.path.dirname(os.path.abspath(__file__)) + + # Connect to config-inited event to add template path + app.connect("config-inited", add_template_path) + + # Copy assets early in the build process so JS modules are available + app.connect("source-read", copy_assets_early) + + # Add CSS file + 
css_file = os.path.join(extension_dir, "enhanced-search.css") + if os.path.exists(css_file): + app.add_css_file("enhanced-search.css") + logger.info("Enhanced search CSS loaded") + else: + logger.warning(f"Enhanced search CSS not found at {css_file}") + + # Add the bundled JavaScript file (contains all modules) + app.add_js_file("search-assets.bundle.js") + logger.info("Enhanced search bundled JS will be loaded") + + # Connect to build events (backup) + app.connect("build-finished", copy_assets) + + return { + "version": "2.0.0", + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/_ext/search_assets/enhanced-search.css b/docs/_ext/search_assets/enhanced-search.css new file mode 100644 index 00000000..6ae98c88 --- /dev/null +++ b/docs/_ext/search_assets/enhanced-search.css @@ -0,0 +1,1370 @@ +/** + * Enhanced Search Styles + * Aligned with NVIDIA Sphinx theme - full light/dark mode support + * Uses theme variables exclusively - no hardcoded colors + */ + +/* CSS Variables for theming */ +:root { + --search-primary-color: var(--nv-color-green, #76b900); + --search-background: var(--pst-color-background, #ffffff); + --search-surface: var(--pst-color-surface, #f8f9fa); + --search-text-primary: var(--pst-color-text-base, #333333); + --search-text-secondary: var(--pst-color-text-muted, #6c757d); + --search-border: var(--pst-color-border, #e1e4e8); + --search-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); + --search-font-family: var(--pst-font-family-base, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif); +} + +/* ===== SEARCH PAGE STYLES ===== */ + +/* Unified Search Controls Container */ +.search-controls-container { + background: linear-gradient(to bottom, var(--pst-color-background), var(--pst-color-surface)); + border: 1px solid var(--pst-color-on-surface); + border-radius: 1rem; + padding: 1.5rem; + box-shadow: + 0 4px 6px -1px rgba(0, 0, 0, 0.05), + 0 2px 4px -1px rgba(0, 0, 0, 0.03), + inset 0 1px 0 rgba(255, 255, 255, 0.1); 
+} + +/* Search Filters */ +.search-filters { + margin-bottom: 1.25rem; +} + +/* Filter Header */ +.filter-header { + display: flex; + align-items: center; + justify-content: space-between; + margin-bottom: 1rem; + padding-bottom: 0.75rem; + border-bottom: 1px solid var(--pst-color-on-surface); +} + +.filter-header-left { + display: flex; + align-items: center; + gap: 0.5rem; +} + +.filter-header-icon { + color: var(--nv-color-green); + font-size: 0.875rem; +} + +.filter-header-title { + font-size: 0.8125rem; + font-weight: 600; + color: var(--pst-color-text-base); + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.active-filter-count { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 1.25rem; + height: 1.25rem; + padding: 0 0.375rem; + font-size: 0.6875rem; + font-weight: 700; + color: white; + background: var(--nv-color-green); + border-radius: 1rem; +} + +.filter-clear-btn { + display: inline-flex; + align-items: center; + gap: 0.375rem; + padding: 0.375rem 0.75rem; + font-size: 0.75rem; + font-weight: 500; + color: var(--pst-color-text-muted); + background: transparent; + border: 1px solid var(--pst-color-on-surface); + border-radius: 0.375rem; + cursor: pointer; + transition: all 0.2s ease; +} + +.filter-clear-btn:hover { + color: var(--pst-color-text-base); + background: var(--pst-color-surface); + border-color: var(--pst-color-text-muted); +} + +.filter-clear-btn.hidden { + opacity: 0; + pointer-events: none; +} + +/* Filter Grid */ +.filter-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); + gap: 1rem; +} + +.filter-group { + display: flex; + flex-direction: column; + gap: 0.375rem; +} + +.filter-label { + display: flex; + align-items: center; + gap: 0.375rem; + font-size: 0.6875rem; + font-weight: 600; + color: var(--pst-color-text-muted); + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.filter-label i { + font-size: 0.625rem; + color: 
var(--pst-color-text-muted); + opacity: 0.7; +} + +/* Filter Select Wrapper */ +.filter-select-wrapper { + position: relative; + display: flex; + align-items: center; +} + +.filter-select-wrapper.has-value { + --select-border-color: var(--nv-color-green); + --select-bg-color: rgba(118, 185, 0, 0.05); +} + +.filter-select { + width: 100%; + padding: 0.5rem 2rem 0.5rem 0.75rem; + font-size: 0.8125rem; + font-family: var(--pst-font-family-base); + color: var(--pst-color-text-base); + background-color: var(--select-bg-color, var(--pst-color-background)); + border: 1px solid var(--select-border-color, var(--pst-color-on-surface)); + border-radius: 0.5rem; + outline: none; + appearance: none; + cursor: pointer; + transition: all 0.2s ease; + text-overflow: ellipsis; +} + +.filter-select:focus { + border-color: var(--nv-color-green); + box-shadow: 0 0 0 3px rgba(118, 185, 0, 0.15); +} + +.filter-select:hover:not(:focus) { + border-color: var(--pst-color-text-muted); + background-color: var(--pst-color-surface); +} + +.filter-select-arrow { + position: absolute; + right: 0.625rem; + font-size: 0.625rem; + color: var(--pst-color-text-muted); + pointer-events: none; + transition: transform 0.2s ease; +} + +.filter-select:focus+.filter-select-arrow { + color: var(--nv-color-green); +} + +.filter-select option { + background-color: var(--pst-color-background); + color: var(--pst-color-text-base); + padding: 0.5rem; +} + +/* Search Input Wrapper */ +.search-input-wrapper { + position: relative; + display: flex; + align-items: center; +} + +.search-input-icon { + position: absolute; + left: 1rem; + font-size: 1rem; + color: var(--pst-color-text-muted); + pointer-events: none; + transition: color 0.2s ease; + z-index: 1; +} + +.search-input-field { + width: 100%; + padding: 0.875rem 1rem 0.875rem 2.75rem; + font-size: 1rem; + font-family: var(--pst-font-family-base); + font-weight: 400; + line-height: 1.5; + color: var(--pst-color-text-base); + background-color: 
var(--pst-color-background); + border: 2px solid var(--pst-color-on-surface); + border-radius: 0.75rem; + outline: none; + transition: all 0.2s ease; +} + +.search-input-field:focus { + border-color: var(--nv-color-green); + box-shadow: 0 0 0 4px rgba(118, 185, 0, 0.12); +} + +.search-input-field:focus+.search-input-icon, +.search-input-wrapper:focus-within .search-input-icon { + color: var(--nv-color-green); +} + +.search-input-field::placeholder { + color: var(--pst-color-text-muted); + opacity: 0.8; +} + +/* Legacy filter-row support */ +.filter-row { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); + gap: 1rem; + align-items: end; +} + +.filter-actions { + display: flex; + align-items: center; + gap: 0.5rem; + justify-self: end; +} + +.btn { + display: inline-flex; + align-items: center; + gap: 0.5rem; + padding: 0.5rem 1rem; + font-size: 0.875rem; + font-weight: 500; + font-family: var(--pst-font-family-base); + text-decoration: none; + border-radius: 0.25rem; + border: 1px solid transparent; + cursor: pointer; + transition: all 0.15s ease-in-out; +} + +.btn-sm { + padding: 0.375rem 0.75rem; + font-size: 0.8125rem; +} + +.btn-secondary { + color: var(--pst-color-text-base); + background-color: transparent; + border-color: var(--pst-color-on-surface); +} + +.btn-secondary:hover { + color: var(--pst-color-background); + background-color: var(--pst-color-text-base); + border-color: var(--pst-color-text-base); +} + +.btn-secondary:focus { + color: var(--pst-color-text-base); + background-color: transparent; + border-color: var(--nv-color-green); + box-shadow: 0 0 0 0.2rem rgba(118, 185, 0, 0.25); +} + +.btn-outline-secondary { + color: var(--pst-color-text-base); + background-color: transparent; + border-color: var(--pst-color-on-surface); +} + +.btn-outline-secondary:hover { + color: var(--pst-color-background); + background-color: var(--pst-color-text-base); + border-color: var(--pst-color-text-base); +} + +/* Responsive filters */ 
+@media (max-width: 900px) { + .filter-grid { + grid-template-columns: repeat(3, 1fr); + } +} + +@media (max-width: 768px) { + .search-controls-container { + padding: 1rem; + border-radius: 0.75rem; + } + + .search-filters { + margin-bottom: 1rem; + } + + .filter-header { + flex-wrap: wrap; + gap: 0.75rem; + } + + .filter-grid { + grid-template-columns: repeat(2, 1fr); + gap: 0.75rem; + } + + .filter-group { + min-width: auto; + } + + .filter-actions { + grid-column: 1; + justify-self: center; + margin-top: 0.75rem; + } + + .search-input-field { + padding: 0.75rem 1rem 0.75rem 2.5rem; + font-size: 1rem; + } +} + +@media (max-width: 480px) { + .filter-grid { + grid-template-columns: 1fr; + } + + .filter-header-left { + flex: 1; + } +} + +/* Legacy input ID selector - now handled by .search-input-field */ +#enhanced-search-page-input { + width: 100%; + padding: 0.875rem 1rem 0.875rem 2.75rem; + font-size: 1rem; + font-family: var(--pst-font-family-base); + font-weight: 400; + line-height: 1.5; + color: var(--pst-color-text-base); + background-color: var(--pst-color-background); + border: 2px solid var(--pst-color-on-surface); + border-radius: 0.75rem; + outline: none; + transition: all 0.2s ease; +} + +.search-input-unified { + margin-top: 0 !important; +} + +#enhanced-search-page-input:focus { + border-color: var(--nv-color-green); + box-shadow: 0 0 0 4px rgba(118, 185, 0, 0.12); +} + +#enhanced-search-page-input::placeholder { + color: var(--pst-color-text-muted); + opacity: 0.8; +} + +.loading { + display: inline-block; + margin-left: 0.5rem; + color: var(--pst-color-text-muted); +} + +.spinner { + display: inline-block; + width: 1rem; + height: 1rem; + border: 0.125rem solid var(--pst-color-text-muted); + border-radius: 50%; + border-top-color: var(--nv-color-green); + animation: spin 1s ease-in-out infinite; +} + +@keyframes spin { + to { + transform: rotate(360deg); + } +} + +#search-results { + margin-top: 1.5rem; +} + +/* ===== SEARCH RESULTS STYLES ===== */ 
+ +.search-results-header { + margin-bottom: 1.5rem; + padding-bottom: 1rem; + border-bottom: 1px solid var(--pst-color-on-surface); +} + +.search-results-header h3 { + color: var(--pst-color-heading); + font-family: var(--pst-font-family-heading); + font-weight: var(--pst-font-weight-heading); + font-size: var(--pst-font-size-h3); + margin: 0 0 0.5rem 0; +} + +.search-results-header p { + color: var(--pst-color-text-muted); + font-size: 0.875rem; + margin: 0; +} + +/* Search Result Cards */ +.search-result { + background-color: var(--pst-color-background); + border: 1px solid var(--pst-color-on-surface); + border-radius: 0.5rem; + padding: 1.5rem; + margin-bottom: 1.5rem; + transition: all 0.2s ease-in-out; + position: relative; + overflow: hidden; +} + +.search-result::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + height: 4px; + background: linear-gradient(90deg, var(--nv-color-green), var(--nv-color-green-2)); + transform: scaleX(0); + transform-origin: left; + transition: transform 0.2s ease-in-out; +} + +.search-result:hover { + border-color: var(--nv-color-green); + box-shadow: 0 0.5rem 1rem rgba(0, 0, 0, 0.1); + transform: translateY(-0.125rem); +} + +.search-result:hover::before { + transform: scaleX(1); +} + +/* Result Header */ +.result-header { + display: flex; + align-items: flex-start; + gap: 1rem; + margin-bottom: 1rem; +} + +.section-icon { + flex-shrink: 0; + width: 3rem; + height: 3rem; + border-radius: 0.5rem; + display: flex; + align-items: center; + justify-content: center; + font-size: 1.25rem; + font-weight: 700; + color: var(--pst-color-background); + background: var(--nv-color-green); + border: 1px solid var(--pst-color-on-surface); +} + +.result-info { + flex-grow: 1; + min-width: 0; +} + +.result-title { + margin: 0 0 0.5rem 0; + font-family: var(--pst-font-family-heading); + font-weight: var(--pst-font-weight-heading); + font-size: var(--pst-font-size-h4); + line-height: 1.25; +} + +.result-title a { + 
color: var(--pst-color-heading); + text-decoration: none; + transition: color 0.15s ease-in-out; +} + +.result-title a:hover { + color: var(--nv-color-green); + text-decoration: underline; + text-decoration-color: var(--nv-color-green); + text-decoration-thickness: max(3px, 0.1875rem, 0.12em); +} + +/* Breadcrumb */ +.result-breadcrumb { + display: flex; + align-items: center; + gap: 0.5rem; + font-size: 0.875rem; + color: var(--pst-color-text-muted); + margin-bottom: 0.5rem; + font-family: var(--pst-font-family-base); +} + +.result-breadcrumb .breadcrumb-separator { + color: var(--pst-color-text-muted); + font-weight: 400; +} + +/* Meta Information */ +.result-meta { + display: flex; + align-items: center; + gap: 1rem; + flex-wrap: wrap; +} + +.section-badge { + display: inline-flex; + align-items: center; + gap: 0.25rem; + padding: 0.25rem 0.5rem; + background-color: var(--pst-color-background); + border: 1px solid var(--pst-color-on-surface); + border-radius: 1rem; + font-size: 0.75rem; + font-weight: 500; + color: var(--pst-color-text-base); + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.relevance-score { + font-size: 0.75rem; + color: var(--pst-color-text-muted); + font-weight: 500; + font-family: var(--pst-font-family-monospace); +} + +/* Result Content */ +.result-content { + color: var(--pst-color-text-base); + font-family: var(--pst-font-family-base); + line-height: 1.6; + margin-bottom: 1rem; +} + +.result-content p { + margin: 0 0 0.75rem 0; +} + +.result-content p:last-child { + margin-bottom: 0; +} + +.result-summary { + color: var(--pst-color-text-base); + font-size: 0.9rem; + line-height: 1.5; + margin-bottom: 1rem; +} + +/* Matching Sections */ +.matching-sections { + margin-top: 1rem; + padding-top: 1rem; + border-top: 1px solid var(--pst-color-on-surface); +} + +.matching-sections h4, +.matching-sections h5 { + color: var(--pst-color-heading); + font-family: var(--pst-font-family-heading); + font-weight: 500; + font-size: 0.875rem; 
+ text-transform: uppercase; + letter-spacing: 0.05em; + margin: 0 0 0.75rem 0; + display: flex; + align-items: center; + gap: 0.5rem; +} + +.section-links { + background-color: var(--pst-color-background); + border: 1px solid var(--pst-color-on-surface); + border-radius: 0.5rem; + padding: 0.75rem; +} + +.section-link { + display: flex; + align-items: center; + gap: 0.75rem; + padding: 0.5rem 0.75rem; + border-radius: 0.25rem; + font-size: 0.875rem; + color: var(--pst-color-text-base); + text-decoration: none; + transition: all 0.15s ease-in-out; + font-family: var(--pst-font-family-base); + margin-bottom: 0.25rem; +} + +.section-link:last-child { + margin-bottom: 0; +} + +.section-link:hover { + background-color: var(--nv-color-green); + color: var(--pst-color-background); + text-decoration: none; + transform: translateY(-0.0625rem); + box-shadow: 0 0.25rem 0.5rem rgba(118, 185, 0, 0.25); +} + +.section-link .section-icon { + width: 1.5rem; + height: 1.5rem; + font-size: 0.875rem; + background: var(--pst-color-surface); + color: var(--pst-color-primary); +} + +.section-link:hover .section-icon { + background: var(--pst-color-background); + color: var(--nv-color-green); +} + +/* Enhanced Result Features */ +.result-tag, +.result-category { + display: inline-flex; + align-items: center; + padding: 0.25rem 0.5rem; + font-size: 0.75rem; + font-weight: 500; + border-radius: 0.25rem; + text-decoration: none; + margin-right: 0.25rem; + margin-bottom: 0.25rem; +} + +.result-tag { + background-color: var(--pst-color-surface); + color: var(--pst-color-text-base); + border: 1px solid var(--pst-color-on-surface); + font-size: 0.75rem; + padding: 0.25rem 0.5rem; + border-radius: 0.25rem; + display: inline-block; + margin-right: 0.5rem; + margin-bottom: 0.25rem; +} + +.result-category { + background-color: rgba(118, 185, 0, 0.1); + color: var(--nv-color-green); + border: 1px solid rgba(118, 185, 0, 0.2); +} + +.multiple-matches-indicator { + display: inline-flex; + 
align-items: center; + padding: 0.25rem 0.5rem; + font-size: 0.75rem; + font-weight: 500; + color: var(--nv-color-green); + background-color: rgba(118, 185, 0, 0.1); + border-radius: 0.25rem; + border: 1px solid rgba(118, 185, 0, 0.2); + margin-left: 0.5rem; +} + +.more-tags, +.more-categories { + font-size: 0.75rem; + color: var(--pst-color-text-muted); + font-style: italic; + margin-left: 0.25rem; +} + +.result-tags, +.result-categories { + display: flex; + flex-wrap: wrap; + gap: 0.25rem; + align-items: center; +} + +/* Badge styles */ +.badge { + display: inline-flex; + align-items: center; + padding: 0.375rem 0.5rem; + font-size: 0.75rem; + font-weight: 500; + border-radius: 0.25rem; + text-decoration: none; +} + +.bg-secondary { + background-color: var(--pst-color-text-muted) !important; + color: var(--pst-color-background) !important; +} + +.bg-info { + background-color: rgba(118, 185, 0, 0.9) !important; + color: var(--pst-color-background) !important; +} + +.bg-light { + background-color: transparent !important; + color: var(--pst-color-text-muted) !important; + border: 1px solid var(--pst-color-on-surface) !important; +} + +/* Metadata badges */ +.metadata-badge { + display: inline-flex; + align-items: center; + padding: 0.2rem 0.5rem; + margin-right: 0.5rem; + margin-bottom: 0.25rem; + font-size: 0.75rem; + font-weight: 500; + border-radius: 0.375rem; + border: 1px solid; + cursor: help; + transition: all 0.2s ease; +} + +.persona-badge { + background-color: #e8f5e8; + color: #2d5a2d; + border-color: #c3e6c3; +} + +.difficulty-badge { + background-color: #fff3cd; + color: #856404; + border-color: #ffeaa7; +} + +.modality-badge { + background-color: #e2f3ff; + color: #0c5460; + border-color: #b8daff; +} + +.metadata-badge:hover { + transform: translateY(-1px); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); +} + +/* Clickable badge styles */ +.clickable-badge { + cursor: pointer; + transition: all 0.2s ease; + user-select: none; +} + +.clickable-badge:hover { 
+ transform: translateY(-1px); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.15); + filter: brightness(1.1); +} + +.clickable-badge:active { + transform: translateY(0); +} + +.result-tag.clickable-badge:hover { + background-color: var(--nv-color-green); + color: var(--pst-color-background); + border-color: var(--nv-color-green); +} + +/* Active filter display */ +.active-filters-display { + background-color: var(--pst-color-surface); + border: 1px solid var(--pst-color-on-surface); + border-radius: 0.375rem; + padding: 0.75rem; +} + +.active-filter-badge { + display: inline-flex; + align-items: center; + padding: 0.2rem 0.5rem; + margin-right: 0.5rem; + font-size: 0.75rem; + font-weight: 500; + border-radius: 0.25rem; + background-color: var(--nv-color-green); + color: var(--pst-color-background); + border: 1px solid var(--nv-color-green); +} + +/* Utility classes for layout */ +.mb-1 { + margin-bottom: 0.25rem !important; +} + +.mb-2 { + margin-bottom: 0.5rem !important; +} + +.mb-3 { + margin-bottom: 1rem !important; +} + +.mb-4 { + margin-bottom: 1.5rem !important; +} + +.mt-1 { + margin-top: 0.25rem !important; +} + +.mt-3 { + margin-top: 1rem !important; +} + +.me-1 { + margin-right: 0.25rem !important; +} + +.me-2 { + margin-right: 0.5rem !important; +} + +.me-3 { + margin-right: 1rem !important; +} + +.ms-2 { + margin-left: 0.5rem !important; +} + +.ms-4 { + margin-left: 1.5rem !important; +} + +.d-flex { + display: flex !important; +} + +.align-items-center { + align-items: center !important; +} + +.align-items-start { + align-items: flex-start !important; +} + +.flex-grow-1 { + flex-grow: 1 !important; +} + +.flex-wrap { + flex-wrap: wrap !important; +} + +.gap-2 { + gap: 0.5rem !important; +} + +.text-decoration-none { + text-decoration: none !important; +} + +.text-center { + text-align: center !important; +} + +.text-muted { + color: var(--pst-color-text-muted) !important; +} + +.py-4 { + padding-top: 1.5rem !important; + padding-bottom: 1.5rem !important; +} 
+ +.p-2 { + padding: 0.5rem !important; +} + +.border { + border: 1px solid var(--pst-color-on-surface) !important; +} + +.rounded { + border-radius: 0.25rem !important; +} + +.small { + font-size: 0.875rem !important; +} + +/* Empty and Error States */ +.no-results { + text-align: center; + padding: 3rem 1rem; + color: var(--pst-color-text-muted); + font-family: var(--pst-font-family-base); +} + +.no-results h3 { + color: var(--pst-color-heading); + font-family: var(--pst-font-family-heading); + font-weight: var(--pst-font-weight-heading); + font-size: var(--pst-font-size-h3); + margin: 0 0 1rem 0; +} + +.no-results p { + font-size: 1.125rem; + line-height: 1.6; + margin: 0; +} + +.error-message { + background-color: var(--pst-color-surface); + border: 1px solid var(--pst-color-on-surface); + border-left: 4px solid var(--nv-color-green); + border-radius: 0.5rem; + padding: 1rem; + margin: 1rem 0; + color: var(--pst-color-text-base); + font-family: var(--pst-font-family-base); +} + +/* Search Highlighting */ +.search-highlight, +mark { + background-color: rgba(118, 185, 0, 0.2); + color: var(--pst-color-text-base); + padding: 0.0625rem 0.125rem; + border-radius: 0.125rem; + font-weight: 400; + border: 1px solid rgba(118, 185, 0, 0.3); +} + +/* Section-specific icon colors and styles */ +.section-badge.getting-started { + background: linear-gradient(135deg, var(--nv-color-green), var(--nv-color-green-2)); + color: var(--pst-color-background); + border-color: var(--nv-color-green); +} + +.section-badge.admin { + background-color: var(--pst-color-surface); + color: var(--pst-color-text-base); +} + +.section-badge.reference { + background-color: var(--pst-color-surface); + color: var(--pst-color-text-base); +} + +.section-badge.tutorial { + background-color: var(--pst-color-surface); + color: var(--pst-color-text-base); +} + +/* Empty state icons and messaging */ +.search-empty-state, +.search-no-results { + text-align: center; + padding: 2rem; + color: 
var(--pst-color-text-muted); + font-family: var(--pst-font-family-base); +} + +.search-empty-state i, +.search-no-results i { + font-size: 3rem; + color: var(--pst-color-text-muted); + margin-bottom: 1rem; + display: block; +} + +.search-empty-state h4, +.search-no-results h4 { + color: var(--pst-color-heading); + font-family: var(--pst-font-family-heading); + font-size: var(--pst-font-size-h4); + margin-bottom: 0.5rem; +} + +.search-empty-state p, +.search-no-results p { + color: var(--pst-color-text-muted); + font-size: 1rem; + line-height: 1.5; + margin-bottom: 1rem; +} + +/* Responsive Design */ +@media (max-width: 768px) { + .search-result { + padding: 1rem; + margin-bottom: 1rem; + } + + .result-header { + flex-direction: column; + gap: 0.75rem; + } + + .section-icon { + width: 2.5rem; + height: 2.5rem; + font-size: 1rem; + } + + .result-title { + font-size: var(--pst-font-size-h5); + } + + .result-meta { + flex-direction: column; + align-items: flex-start; + gap: 0.5rem; + } + + .section-links { + padding: 0.5rem; + } + + .section-link { + padding: 0.375rem 0.5rem; + font-size: 0.8125rem; + } + + #enhanced-search-page-input { + font-size: 1rem; + padding: 0.875rem 1rem; + } +} + +/* High contrast mode support */ +@media (prefers-contrast: high) { + .search-result { + border-width: 2px; + } + + .search-result:hover { + border-width: 3px; + } + + .search-highlight, + mark { + outline: 1px solid var(--pst-color-text-base); + } +} + +/* Reduced motion support */ +@media (prefers-reduced-motion: reduce) { + + .search-result, + .section-link, + #enhanced-search-page-input, + .search-result::before { + transition: none; + } + + .spinner { + animation: none; + } +} + +/* Print styles */ +@media print { + .search-result { + break-inside: avoid; + box-shadow: none; + border: 1px solid; + margin-bottom: 1rem; + background: transparent !important; + } + + .section-icon { + background: transparent !important; + border: 1px solid; + } + + .section-link { + 
text-decoration: underline !important; + } + + .search-highlight, + mark { + background: transparent !important; + text-decoration: underline; + font-weight: bold; + } +} + +/* Focus states for accessibility */ +#enhanced-search-page-input:focus-visible { + outline: 2px solid var(--nv-color-green); + outline-offset: 2px; +} + +.section-link:focus-visible { + outline: 2px solid var(--nv-color-green); + outline-offset: 2px; +} + +.result-title a:focus-visible { + outline: 2px solid var(--nv-color-green); + outline-offset: 2px; + border-radius: 0.125rem; +} + +/* Dark theme support */ +html[data-theme="dark"] .search-result { + background: var(--pst-color-surface-200, #1f2937); +} + +html[data-theme="dark"] .search-result:hover { + background: var(--pst-color-surface-300, #111827); +} + +html[data-theme="dark"] .search-results-header h3 { + color: var(--pst-color-text-base, #f9fafb); +} + +/* Accessibility enhancements */ +@media (prefers-reduced-motion: reduce) { + + .search-result, + .section-link, + #enhanced-search-page-input { + transition: none; + } +} + +@media (prefers-contrast: high) { + .search-result { + border-color: var(--pst-color-text-base); + } + + .search-highlight, + mark { + background: var(--nv-color-green); + color: var(--pst-color-background); + } +} + +/* AI Assistant container styling */ +.ai-assistant-container { + border: 1px solid var(--pst-color-border); + border-radius: var(--pst-border-radius); + background: var(--pst-color-surface); + padding: 1rem; + margin-top: 1.5rem; +} + +.ai-assistant-container .ai-loading { + text-align: center; + padding: 2rem; + color: var(--pst-color-text-muted); +} + +.ai-assistant-container .ai-response { + line-height: 1.6; +} + +.ai-assistant-container .ai-error { + color: var(--pst-color-danger); + background: var(--pst-color-danger-bg); + padding: 1rem; + border-radius: var(--pst-border-radius); + border-left: 4px solid var(--pst-color-danger); +} + +/* AI Assistant dark theme support */ 
+html[data-theme="dark"] .ai-assistant-container { + background: var(--pst-color-surface-200, #1f2937); + border-color: var(--pst-color-border-dark, #374151); +} + +/* ===== TOPIC BADGES ===== */ +.result-topics { + display: flex; + flex-wrap: wrap; + gap: 0.25rem; + align-items: center; +} + +.topic-badge { + display: inline-flex; + align-items: center; + padding: 0.25rem 0.5rem; + margin-right: 0.25rem; + font-size: 0.75rem; + background: var(--topic-bg, #e8f5e9); + color: var(--topic-text, #2e7d32); + border-radius: 4px; + cursor: pointer; + transition: all 0.15s ease-in-out; + border: 1px solid rgba(46, 125, 50, 0.2); +} + +.topic-badge:hover { + background: var(--topic-bg-hover, #c8e6c9); + transform: translateY(-1px); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); +} + +.topic-badge:active { + transform: translateY(0); +} + +/* Dark theme topic badges */ +html[data-theme="dark"] .topic-badge { + background: rgba(118, 185, 0, 0.15); + color: var(--nv-color-green); + border-color: rgba(118, 185, 0, 0.3); +} + +html[data-theme="dark"] .topic-badge:hover { + background: rgba(118, 185, 0, 0.25); +} + +/* More Topics Indicator */ +.more-topics { + font-size: 0.75rem; + color: var(--pst-color-text-muted); + padding: 0.25rem; + font-style: italic; +} + +/* ===== RESULT BREAKDOWN ===== */ +.result-breakdown { + margin-left: 0.5rem; + font-size: 0.875rem; + color: var(--pst-color-text-muted); +} + +.result-breakdown::before { + content: 'β '; +} + +/* ===== KEYBOARD NAVIGATION FOCUS STATES ===== */ +.search-result.focused { + outline: 2px solid var(--nv-color-green, #76b900); + outline-offset: 2px; + border-radius: 8px; + background-color: rgba(118, 185, 0, 0.05); +} + +.search-result:focus-visible { + outline: 2px solid var(--nv-color-green, #76b900); + outline-offset: 2px; +} + +/* Animation for focus transition */ +.search-result { + transition: outline 0.15s ease-in-out, background-color 0.15s ease-in-out, transform 0.2s ease-in-out, border-color 0.2s ease-in-out, 
box-shadow 0.2s ease-in-out; +} + +/* Dark theme focus states */ +html[data-theme="dark"] .search-result.focused { + background-color: rgba(118, 185, 0, 0.1); + outline-color: var(--nv-color-green); +} + +/* ===== EXTENDED FILTER GRID RESPONSIVE ===== */ +@media (max-width: 1200px) { + .filter-grid { + grid-template-columns: repeat(3, 1fr); + } +} + +/* Dark theme filter enhancements */ +html[data-theme="dark"] .search-controls-container { + background: linear-gradient(to bottom, var(--pst-color-surface-200, #1f2937), var(--pst-color-surface-300, #111827)); + border-color: var(--pst-color-border-dark, #374151); +} + +html[data-theme="dark"] .filter-header { + border-bottom-color: var(--pst-color-border-dark, #374151); +} + +html[data-theme="dark"] .filter-select { + background-color: var(--pst-color-surface-200, #1f2937); + border-color: var(--pst-color-border-dark, #374151); +} + +html[data-theme="dark"] .filter-select:hover:not(:focus) { + background-color: var(--pst-color-surface-300, #111827); +} + +html[data-theme="dark"] .filter-clear-btn { + border-color: var(--pst-color-border-dark, #374151); +} + +html[data-theme="dark"] .filter-clear-btn:hover { + background-color: var(--pst-color-surface-300, #111827); +} + +html[data-theme="dark"] .search-input-field, +html[data-theme="dark"] #enhanced-search-page-input { + background-color: var(--pst-color-surface-200, #1f2937); + border-color: var(--pst-color-border-dark, #374151); +} + +html[data-theme="dark"] .filter-select-wrapper.has-value { + --select-bg-color: rgba(118, 185, 0, 0.1); +} + +/* ===== ACCESSIBILITY SKIP LINK ===== */ +.sr-only { + position: absolute; + width: 1px; + height: 1px; + padding: 0; + margin: -1px; + overflow: hidden; + clip: rect(0, 0, 0, 0); + white-space: nowrap; + border: 0; +} + +/* Visual indicator for keyboard users */ +.search-results-list:focus-within { + outline: 1px dashed var(--pst-color-text-muted); + outline-offset: 4px; + border-radius: 8px; +} + +/* Reduced motion support 
for keyboard navigation */ +@media (prefers-reduced-motion: reduce) { + .search-result.focused { + transition: none; + } +} diff --git a/docs/_ext/search_assets/main.js b/docs/_ext/search_assets/main.js new file mode 100644 index 00000000..31140ef1 --- /dev/null +++ b/docs/_ext/search_assets/main.js @@ -0,0 +1,197 @@ +/** + * Enhanced Search Main Entry Point + * Loads search engine and page manager for enhanced search page + * Does NOT interfere with default search behavior + */ + +// Prevent multiple initializations +if (typeof window.EnhancedSearch !== 'undefined') { +} else { + +// Import modules (will be loaded dynamically) +class EnhancedSearch { + constructor(options = {}) { + this.options = { + placeholder: options.placeholder || 'Search documentation...', + maxResults: options.maxResults || 20, + minQueryLength: 2, + highlightClass: 'search-highlight', + ...options + }; + + this.isLoaded = false; + + // Module instances + this.documentLoader = null; + this.searchEngine = null; + this.searchPageManager = null; + this.utils = null; + + this.init(); + } + + async init() { + try { + // Load required modules + await this.loadModules(); + + // Initialize core modules + this.utils = new Utils(); + this.documentLoader = new DocumentLoader(); + this.searchEngine = new SearchEngine(this.utils); + + // Load documents and initialize search engine (always needed) + await this.documentLoader.loadDocuments(); + await this.searchEngine.initialize(this.documentLoader.getDocuments()); + + // Check if we're on the search page + const isSearchPage = this.isSearchPage(); + + if (isSearchPage) { + this.searchPageManager = new SearchPageManager(); + } + + this.isLoaded = true; + } catch (error) { + this.fallbackToDefaultSearch(); + } + } + + isSearchPage() { + return window.location.pathname.includes('/search') || + window.location.pathname.includes('/search.html') || + window.location.pathname.endsWith('search/') || + document.querySelector('#enhanced-search-page-input') !== 
null || + document.querySelector('#enhanced-search-page-results') !== null; + } + + async loadModules() { + const moduleNames = [ + 'Utils', + 'DocumentLoader', + 'SearchEngine', + 'SearchPageManager' + ]; + + // Load modules with smart path resolution + const modulePromises = moduleNames.map(name => + this.loadModuleWithFallback(name) + ); + + await Promise.all(modulePromises); + } + + async loadModuleWithFallback(moduleName) { + const possiblePaths = this.getModulePaths(moduleName); + + for (const path of possiblePaths) { + try { + await this.loadModule(path); + return; + } catch (error) { + // Continue to next path + } + } + + throw new Error(`Failed to load module ${moduleName} from any path`); + } + + getModulePaths(moduleName) { + const fileName = `${moduleName}.js`; + + // Calculate nesting level to determine correct _static path + const pathParts = window.location.pathname.split('/').filter(part => part.length > 0); + const htmlFile = pathParts[pathParts.length - 1]; + + // Remove the HTML file from the count if it exists + let nestingLevel = pathParts.length; + if (htmlFile && htmlFile.endsWith('.html')) { + nestingLevel--; + } + + // Build the correct _static path based on nesting level + const staticPrefix = nestingLevel > 0 ? '../'.repeat(nestingLevel) : './'; + const staticPath = `${staticPrefix}_static`; + + // Search assets only has modules directory + const moduleDir = 'modules'; + + // Generate paths in order of likelihood + const paths = []; + + // 1. Most likely path based on calculated nesting + paths.push(`${staticPath}/${moduleDir}/${fileName}`); + + // 2. Fallback static paths (for different nesting scenarios) + paths.push(`_static/${moduleDir}/${fileName}`); + paths.push(`./_static/${moduleDir}/${fileName}`); + if (nestingLevel > 1) { + paths.push(`../_static/${moduleDir}/${fileName}`); + } + + // 3. 
/**
 * DocumentLoader
 *
 * Fetches the JSON search index, filters out unusable entries and keeps a
 * sanitized (length-limited) copy of every remaining document, keyed by id.
 */
class DocumentLoader {
    constructor() {
        // Sanitized documents keyed by their `id`.
        this.documents = {};
        // True once loadDocuments() has completed without error.
        this.isLoaded = false;
    }

    /**
     * Load documents from the JSON index and store them on this instance.
     *
     * @returns {Promise<void>}
     * @throws {Error} Re-throws any fetch or processing failure after logging it.
     */
    async loadDocuments() {
        try {
            const data = await this.fetchDocumentData();
            this.processDocuments(data);
            this.isLoaded = true;
            console.log(`β Document loader initialized with ${Object.keys(this.documents).length} documents`);
        } catch (error) {
            console.error('Failed to load search documents:', error);
            throw error;
        }
    }

    /**
     * Fetch the index, probing relative paths of increasing depth.
     *
     * @returns {Promise<*>} Parsed JSON of the first path that answers with an OK response.
     * @throws {Error} When none of the candidate paths yields a usable response.
     */
    async fetchDocumentData() {
        // Pages may live at different depths, so walk up a few directory levels.
        const possiblePaths = [
            './index.json',
            '../index.json',
            '../../index.json',
            '../../../index.json'
        ];

        for (const path of possiblePaths) {
            try {
                const response = await fetch(path);
                if (response.ok) {
                    const data = await response.json();
                    console.log(`β Loaded search index from: ${path}`);
                    return data;
                }
            } catch (error) {
                console.log(`β Failed to load from ${path}: ${error.message}`);
            }
        }

        throw new Error('Failed to load search data from any path');
    }

    /**
     * Filter and store documents from the raw index payload.
     *
     * Accepted payload shapes:
     *   1. an array of documents: [{ id, title, ... }, ...]
     *   2. an object with a `children` array (legacy): { children: [...] }
     *   3. a single document object (fallback): { id, title, ... }
     * Any other payload (null, a primitive) contributes no documents instead of throwing.
     *
     * @param {*} data - Parsed index JSON.
     */
    processDocuments(data) {
        let allDocs;
        if (Array.isArray(data)) {
            allDocs = data;
        } else if (data && Array.isArray(data.children)) {
            allDocs = data.children;
        } else if (data && typeof data === 'object') {
            allDocs = [data];
        } else {
            allDocs = [];
        }

        const filteredDocs = allDocs.filter(doc => this.isValidDocument(doc));

        filteredDocs.forEach(doc => {
            this.documents[doc.id] = this.sanitizeDocument(doc);
        });

        console.log(`Processed ${filteredDocs.length} documents (filtered from ${allDocs.length} total)`);
    }

    /**
     * Decide whether a raw entry should be indexed.
     *
     * An entry is rejected when it is not an object, has no id (it could not
     * be keyed), has an id containing "readme" (case-insensitive) or starting
     * with "_", or lacks a truthy title or content.
     *
     * @param {*} doc - Raw index entry.
     * @returns {boolean}
     */
    isValidDocument(doc) {
        if (!doc || typeof doc !== 'object') return false;
        if (doc.id === undefined || doc.id === null) return false;

        // Ids are compared as text; coercing keeps numeric ids from throwing.
        const docId = String(doc.id);
        if (docId === '') return false;
        if (docId.toLowerCase().includes('readme')) return false;
        if (docId.startsWith('_')) return false;

        return Boolean(doc.title && doc.content);
    }

    /**
     * Build the stored form of a document.
     *
     * All original properties are kept (spread first); the known text and
     * list fields are then overwritten with length-limited versions. `topics`
     * falls back to `categories` and `audience` to `personas` (legacy names).
     *
     * @param {Object} doc - Raw document that passed isValidDocument().
     * @returns {Object} Sanitized copy.
     */
    sanitizeDocument(doc) {
        const sanitized = {
            ...doc,
            title: this.sanitizeText(doc.title, 200),
            description: this.sanitizeText(doc.description, 300),
            content: this.sanitizeText(doc.content, 5000),
            summary: this.sanitizeText(doc.summary, 500),
            headings: this.sanitizeHeadings(doc.headings),
            headings_text: this.sanitizeText(doc.headings_text, 1000),
            keywords: this.sanitizeArray(doc.keywords, 300),
            tags: this.sanitizeArray(doc.tags, 200),
            topics: this.sanitizeArray(doc.topics || doc.categories, 200),
            audience: this.sanitizeArray(doc.audience || doc.personas, 200),
            content_type: this.sanitizeText(doc.content_type, 50),
            difficulty: this.sanitizeText(doc.difficulty, 50),
            doc_type: this.sanitizeText(doc.doc_type, 50),
            section_path: this.sanitizeArray(doc.section_path, 200),
            author: this.sanitizeText(doc.author, 100)
        };

        // Facets carry user-defined keys, so they are sanitized generically.
        if (doc.facets && typeof doc.facets === 'object') {
            sanitized.facets = this.sanitizeFacets(doc.facets);
        }

        // A flat legacy `modality` is only normalized when facets do not provide one.
        if (doc.modality && (!doc.facets || !doc.facets.modality)) {
            sanitized.modality = this.sanitizeText(doc.modality, 50);
        }

        return sanitized;
    }

    /**
     * Sanitize a facets object.
     *
     * Array values become arrays of strings, other truthy values become a
     * string; each string is limited to 100 characters. Falsy non-array
     * values are omitted.
     *
     * @param {Object} facets
     * @returns {Object}
     */
    sanitizeFacets(facets) {
        const sanitized = {};
        Object.entries(facets).forEach(([key, value]) => {
            if (Array.isArray(value)) {
                sanitized[key] = value.map(v => String(v).substring(0, 100));
            } else if (value) {
                sanitized[key] = String(value).substring(0, 100);
            }
        });
        return sanitized;
    }

    /**
     * Limit a string to `maxLength` characters.
     *
     * @param {*} text
     * @param {number} maxLength
     * @returns {string} The truncated text, or '' when `text` is empty or not a string.
     */
    sanitizeText(text, maxLength) {
        if (!text || typeof text !== 'string') return '';
        return text.substring(0, maxLength);
    }

    /**
     * Flatten a list into a single space-separated, length-limited string.
     *
     * Note the asymmetric return type, kept for backward compatibility:
     * array input yields a string, anything else yields an empty array.
     * null/undefined items are skipped so they do not show up as the literal
     * words "null"/"undefined".
     *
     * @param {*} arr
     * @param {number} maxLength
     * @returns {string|Array}
     */
    sanitizeArray(arr, maxLength) {
        if (!Array.isArray(arr)) return [];
        return arr
            .filter(item => item !== null && item !== undefined)
            .map(item => String(item))
            .join(' ')
            .substring(0, maxLength);
    }

    /**
     * Normalize the headings list.
     *
     * Entries that are not objects are skipped. Each kept heading has its text
     * limited to 200 characters and its level coerced to a number (default 1).
     * A string `id` is carried over (limited to 200 characters) when present.
     *
     * @param {*} headings
     * @returns {Array<{text: string, level: number, id?: string}>}
     */
    sanitizeHeadings(headings) {
        if (!Array.isArray(headings)) return [];
        return headings
            .filter(heading => heading && typeof heading === 'object')
            .map(heading => {
                const entry = {
                    text: this.sanitizeText(heading.text, 200),
                    level: Number(heading.level) || 1
                };
                if (typeof heading.id === 'string' && heading.id !== '') {
                    entry.id = heading.id.substring(0, 200);
                }
                return entry;
            });
    }

    /**
     * @returns {Object} All loaded documents keyed by id.
     */
    getDocuments() {
        return this.documents;
    }

    /**
     * @param {string} id
     * @returns {Object|undefined} The document stored under `id`, if any.
     */
    getDocument(id) {
        return this.documents[id];
    }

    /**
     * @returns {number} Number of stored documents.
     */
    getDocumentCount() {
        return Object.keys(this.documents).length;
    }

    /**
     * @returns {boolean} True when loading finished and at least one document is stored.
     */
    isReady() {
        return this.isLoaded && Object.keys(this.documents).length > 0;
    }

    /**
     * @returns {Object[]} Stored documents as an array.
     */
    getDocumentsArray() {
        return Object.values(this.documents);
    }

    /**
     * @param {function(Object): boolean} filterFn - Predicate applied to each document.
     * @returns {Object[]} Documents for which `filterFn` returns a truthy value.
     */
    filterDocuments(filterFn) {
        return this.getDocumentsArray().filter(filterFn);
    }

    /**
     * Summarize the loaded documents.
     *
     * @returns {{totalDocuments: number, documentsWithSummary: number,
     *   documentsWithHeadings: number, documentsWithTags: number,
     *   averageContentLength: number}} `averageContentLength` is 0 (not NaN)
     *   when nothing is loaded.
     */
    getStatistics() {
        const docs = this.getDocumentsArray();
        const totalContentLength = docs.reduce((sum, d) => sum + (d.content?.length || 0), 0);
        return {
            totalDocuments: docs.length,
            documentsWithSummary: docs.filter(d => d.summary).length,
            documentsWithHeadings: docs.filter(d => d.headings && d.headings.length > 0).length,
            documentsWithTags: docs.filter(d => d.tags && d.tags.length > 0).length,
            averageContentLength: docs.length > 0 ? totalContentLength / docs.length : 0
        };
    }
}
this.debouncedSearch(e); + input.addEventListener('input', inputHandler); + this.boundListeners.set('input', inputHandler); + + // Keyboard navigation + const keydownHandler = (e) => this.handleKeyDown(e); + input.addEventListener('keydown', keydownHandler); + this.boundListeners.set('keydown', keydownHandler); + } + + /** + * Bind page-specific events (replaces modal events) + */ + bindModalEvents() { + // Check if we're on the search page + if (!this.searchInterface.isSearchPage()) { + return; + } + + // Get query parameter if we're on search page + const urlParams = new URLSearchParams(window.location.search); + const query = urlParams.get('q'); + + if (query) { + // Perform search immediately with the query from URL + setTimeout(() => { + const input = this.searchInterface.getInput(); + if (input) { + input.value = query; + this.handleSearch({ target: input }); + } + }, 100); + } + } + + /** + * Bind global keyboard shortcuts + */ + bindGlobalEvents() { + const globalKeyHandler = (e) => { + // Ctrl+K or Cmd+K to focus search input + if ((e.ctrlKey || e.metaKey) && e.key === 'k') { + e.preventDefault(); + // Focus the search input if we're on the search page + const searchInput = this.searchInterface.getInput(); + if (searchInput) { + searchInput.focus(); + } else { + // If not on search page, redirect to search page + window.location.href = 'search.html'; + } + return; + } + }; + + document.addEventListener('keydown', globalKeyHandler); + this.boundListeners.set('global', globalKeyHandler); + } + + /** + * Handle search input + */ + async handleSearch(event) { + const query = event.target.value.trim(); + const resultsContainer = this.searchInterface.getResultsContainer(); + + if (query.length < this.enhancedSearch.options.minQueryLength) { + this.searchInterface.showEmptyState(); + this.searchInterface.clearStats(); + return; + } + + try { + // Show loading state + this.resultRenderer.renderLoading(resultsContainer); + + // Perform search + const results = 
this.searchEngine.search(query, this.enhancedSearch.options.maxResults); + const count = results.length; + + // Render results + this.resultRenderer.render(results, query, resultsContainer); + + // Update stats + this.searchInterface.updateStats(query, count); + + // Emit search event for AI Assistant extension if available + this.emitSearchEvent(query, results, count); + + } catch (error) { + console.error('Search error:', error); + this.resultRenderer.renderError(resultsContainer, 'Search temporarily unavailable'); + this.searchInterface.clearStats(); + } + } + + /** + * Handle keyboard navigation + */ + handleKeyDown(event) { + const resultsContainer = this.searchInterface.getResultsContainer(); + + switch (event.key) { + case 'ArrowDown': + event.preventDefault(); + this.resultRenderer.selectNext(resultsContainer); + break; + + case 'ArrowUp': + event.preventDefault(); + this.resultRenderer.selectPrevious(resultsContainer); + break; + + case 'Enter': + event.preventDefault(); + this.resultRenderer.activateSelected(resultsContainer); + break; + + case 'Escape': + event.preventDefault(); + this.enhancedSearch.hide(); + break; + } + } + + /** + * Emit search event for other extensions + */ + emitSearchEvent(query, results, count) { + if (window.AIAssistant && window.aiAssistantInstance) { + const searchEvent = new CustomEvent('enhanced-search-results', { + detail: { query, results, count } + }); + document.dispatchEvent(searchEvent); + } + } + + /** + * Handle window resize + */ + handleResize() { + // Adjust modal positioning if needed + const modal = this.searchInterface.getModal(); + if (modal && this.searchInterface.isModalVisible()) { + // Could add responsive adjustments here + } + } + + /** + * Handle focus management + */ + handleFocus(event) { + // Trap focus within modal when visible + if (this.searchInterface.isModalVisible()) { + const modal = this.searchInterface.getModal(); + const focusableElements = modal.querySelectorAll( + 'button, input, select, 
textarea, [tabindex]:not([tabindex="-1"])' + ); + + const firstFocusable = focusableElements[0]; + const lastFocusable = focusableElements[focusableElements.length - 1]; + + if (event.key === 'Tab') { + if (event.shiftKey) { + // Shift + Tab + if (document.activeElement === firstFocusable) { + event.preventDefault(); + lastFocusable.focus(); + } + } else { + // Tab + if (document.activeElement === lastFocusable) { + event.preventDefault(); + firstFocusable.focus(); + } + } + } + } + } + + /** + * Bind additional event listeners + */ + bindAdditionalEvents() { + // Window resize + const resizeHandler = this.utils.debounce(() => this.handleResize(), 100); + window.addEventListener('resize', resizeHandler); + this.boundListeners.set('resize', resizeHandler); + + // Focus trap + const focusHandler = (e) => this.handleFocus(e); + document.addEventListener('keydown', focusHandler); + this.boundListeners.set('focus', focusHandler); + } + + /** + * Unbind all event listeners + */ + unbindEvents() { + // Remove input events + const input = this.searchInterface.getInput(); + if (input && this.boundListeners.has('input')) { + input.removeEventListener('input', this.boundListeners.get('input')); + input.removeEventListener('keydown', this.boundListeners.get('keydown')); + } + + // Remove modal events + const closeBtn = this.searchInterface.getCloseButton(); + if (closeBtn && this.boundListeners.has('close')) { + closeBtn.removeEventListener('click', this.boundListeners.get('close')); + } + + const backdrop = this.searchInterface.getBackdrop(); + if (backdrop && this.boundListeners.has('backdrop')) { + backdrop.removeEventListener('click', this.boundListeners.get('backdrop')); + } + + // Remove global events + if (this.boundListeners.has('global')) { + document.removeEventListener('keydown', this.boundListeners.get('global')); + } + + if (this.boundListeners.has('resize')) { + window.removeEventListener('resize', this.boundListeners.get('resize')); + } + + if 
(this.boundListeners.has('focus')) { + document.removeEventListener('keydown', this.boundListeners.get('focus')); + } + + // Clear listeners map + this.boundListeners.clear(); + + console.log('β Event handlers unbound'); + } + + /** + * Get event handler statistics + */ + getStatistics() { + return { + boundListeners: this.boundListeners.size, + modalVisible: this.searchInterface.isModalVisible(), + hasInput: !!this.searchInterface.getInput(), + hasModal: !!this.searchInterface.getModal() + }; + } + + /** + * Check if events are properly bound + */ + isReady() { + return this.boundListeners.size > 0 && + this.searchInterface.getInput() !== null && + this.searchInterface.getModal() !== null; + } +} + +// Make EventHandler available globally +window.EventHandler = EventHandler; diff --git a/docs/_ext/search_assets/modules/ResultRenderer.js b/docs/_ext/search_assets/modules/ResultRenderer.js new file mode 100644 index 00000000..5a173a24 --- /dev/null +++ b/docs/_ext/search_assets/modules/ResultRenderer.js @@ -0,0 +1,263 @@ +/** + * ResultRenderer Module + * Handles rendering of search results in the interface + */ + +class ResultRenderer { + constructor(options, utils) { + this.options = options; + this.utils = utils; + } + + /** + * Render search results + */ + render(results, query, container) { + if (!container) { + console.warn('No container provided for rendering results'); + return; + } + + if (results.length === 0) { + container.innerHTML = this.renderNoResults(query); + return; + } + + const html = results.map((result, index) => { + const isSelected = index === 0; + return this.renderResultItem(result, query, isSelected); + }).join(''); + + container.innerHTML = `
No results found for "${this.utils.escapeHtml(query)}"
+Searching...
+${this.utils.escapeHtml(message)}
+Start typing to search documentation...
+No results found for "${this.escapeHtml(query)}"
+${this.escapeHtml(message)}
++ Found ${results.length} result${results.length !== 1 ? 's' : ''} for "${this.escapeHtml(this.currentQuery)}" + ${this.getActiveFiltersText()} + ${resultBreakdown ? `${resultBreakdown}` : ''} +
+${summary}
+ ${matchingSections} +Start typing to search across all documentation pages...
+Enter at least 2 characters to search
+No results found for "${this.escapeHtml(this.currentQuery)}"${this.getActiveFiltersText()}
+Start typing to search across all documentation pages...
+