94 changes: 78 additions & 16 deletions backend/.env.example
@@ -1,32 +1,94 @@

# ==========================================================
# Unified OpenAI-Compatible Provider Config (Primary)
# Use these 4 keys for OpenAI / Xeon / Ollama-compatible endpoints.
# API_ENDPOINT:
# - OpenAI cloud: leave empty
# - Xeon/Gateway: https://api.example.com (without /v1; auto-appended)
# - Ollama: http://localhost:11434 (without /v1; auto-appended)
# PROVIDER_NAME is optional and only used for logs/UI labeling.
# If omitted, provider name is inferred from API_ENDPOINT.
# ==========================================================
API_ENDPOINT=
API_TOKEN=your-api-token-here
MODEL_NAME=gpt-4o-mini
PROVIDER_NAME=

# OpenAI Configuration
OPENAI_API_KEY=your_openai_api_key_here
# Legacy
OPENAI_MODEL=gpt-4o-mini


# Simple embedding config (OpenAI-compatible)
# If embeddings use the same endpoint/token as chat, leave EMBEDDING_ENDPOINT
# and EMBEDDING_API empty.
# Example:
# EMBEDDING_ENDPOINT=http://localhost:11434
# EMBEDDING_API=ollama
# EMBEDDING_MODEL=nomic-embed-text
EMBEDDING_ENDPOINT=
EMBEDDING_API=
EMBEDDING_MODEL=text-embedding-3-small
EMBEDDING_PROVIDER_NAME=
# Optional: fail fast for slow local embedding endpoints
EMBEDDING_TIMEOUT=20
EMBEDDING_MAX_RETRIES=0


RAG_CHUNK_SIZE=800
RAG_CHUNK_OVERLAP=150
RAG_TOP_K=5

RAG_MAX_DOCS=25
RAG_TTL_SECONDS=3600
# Optional per-model context window overrides
# MODEL_CONTEXT_TOKENS_MAP format:
# {"gpt-4o-mini":128000,"llama3.2:3b":32768}
MODEL_CONTEXT_TOKENS=
MODEL_CONTEXT_TOKENS_MAP=
MIN_MODEL_CONTEXT_TOKENS=4096
CONTEXT_RETRY_SHRINK_RATIO=0.75
CONTEXT_RETRY_MARGIN_TOKENS=1200
# Speed knobs (recommended for local models)
# false = heuristic section chips only (faster, fewer LLM calls)
DYNAMIC_SECTIONS_USE_LLM=false
# false = skip second "retry summary" LLM call
SUMMARY_RETRY_ENABLE=false


# ==========================================================
# LLM Tuning
# ==========================================================
LLM_TEMPERATURE=0.2
LLM_MAX_TOKENS=900

CACHE_MAX_DOCS=25
CACHE_TTL_SECONDS=3600

# ==========================================================
# RAG / Chunking
# ==========================================================
RAG_CHUNK_CHARS=1400
RAG_CHUNK_OVERLAP_CHARS=220
# For local embedding models, start with 1-4.
RAG_EMBED_BATCH_SIZE=64
RAG_SUMMARY_TOP_K=8
RAG_SECTION_TOP_K=10
RAG_MIN_SCORE=0.15
RAG_CONTEXT_MAX_CHARS=18000
RETRIEVAL_FIRST_ENABLE=true
# If true, request path can synchronously build index when missing (may be slow).
RETRIEVAL_FORCE_INDEX_ON_DEMAND=false

# ==========================================================
# Map-Reduce Summarization for Large Docs
# ==========================================================
MAP_REDUCE_ENABLE=true
MAP_REDUCE_MIN_CHARS=18000
MAP_REDUCE_MIN_CHUNKS=8
MAP_REDUCE_MAX_CHUNKS=24
MAP_REDUCE_CHUNK_CHARS=2800
MAP_REDUCE_CHUNK_OVERLAP=250
MAP_REDUCE_BATCH_SIZE=6
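# Illustration (not part of this PR): with the defaults above, an ~60k-char
# document maps to roughly 24 chunks of 2,800 chars (250-char overlap, so a
# ~2,550-char stride), summarized 6 chunks per batch, and the partial
# summaries are then reduced into a single final summary.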

# ==========================================================
# Cache
# ==========================================================
CACHE_MAX_DOCS=25
CACHE_TTL_SECONDS=3600

# Service Configuration
# ==========================================================
# Service
# ==========================================================
SERVICE_PORT=8000
LOG_LEVEL=INFO

# CORS Settings
CORS_ORIGINS=*
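Note: a minimal working example of the unified keys for a local Ollama setup, following the file's own comments (values are illustrative, not part of this diff; a local Ollama daemon typically requires no auth, so any placeholder token suffices):

API_ENDPOINT=http://localhost:11434
API_TOKEN=not-needed
MODEL_NAME=llama3.2:3b
PROVIDER_NAME=ollama
EMBEDDING_ENDPOINT=http://localhost:11434
EMBEDDING_API=ollama
EMBEDDING_MODEL=nomic-embed-text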
22 changes: 17 additions & 5 deletions backend/api/routes.py
@@ -4,6 +4,7 @@
"""

from fastapi import APIRouter, Form, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.responses import PlainTextResponse
from fastapi.responses import StreamingResponse
from typing import Optional
import os
@@ -14,7 +15,8 @@
from models import HealthResponse

from services import pdf_service, llm_service
from services.rag_index_service import rag_index_service # <-- ADDED
from services.rag.rag_index_service import rag_index_service
from services.observability_service import observability_service

logger = logging.getLogger(__name__)

@@ -23,18 +25,26 @@

@router.get("/health", response_model=HealthResponse)
async def health_check():
"""Health check endpoint - OpenAI-only"""
"""Health check endpoint"""
llm_health = llm_service.health_check()

response = {
"status": "healthy" if llm_health.get("status") == "healthy" else "unhealthy",
"service": config.APP_TITLE,
"version": config.APP_VERSION,
"llm_provider": "OpenAI",
"llm_provider": llm_health.get("provider", llm_service.get_provider_name()),
}

return response


@router.get("/v1/observability", response_class=PlainTextResponse)
async def observability(limit: int = 100):
"""
Plain-text table of LLM token observability only.
"""
return observability_service.render_table(limit=limit, llm_only=True)

@router.get("/v1/rag/status")
async def rag_status(doc_id: str):
"""
@@ -46,11 +56,13 @@ async def rag_status(doc_id: str):
return {"doc_id": doc_id.strip(), **status}
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))


@router.post("/v1/rag/chat")
async def rag_chat(
doc_id: str = Form(""),
message: str = Form(""),
max_tokens: int = Form(500),
max_tokens: int = Form(220),
temperature: float = Form(0.2),
):
"""
@@ -119,7 +131,7 @@ async def delete_vectors(doc_id: str):
if not doc_id_clean:
raise HTTPException(status_code=400, detail="doc_id is required")

from services.vector_store import vector_store
from services.rag.vector_store import vector_store
vector_store.clear_doc(doc_id_clean)

return {"doc_id": doc_id_clean, "status": "deleted", "message": "Vector data cleared"}
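Note: once the service is running, the new observability endpoint can be exercised directly (port per SERVICE_PORT; the limit value here is illustrative):

curl "http://localhost:8000/v1/observability?limit=50"

It returns plain text (PlainTextResponse), so the table is readable without any JSON tooling.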
33 changes: 32 additions & 1 deletion backend/config.py
@@ -1,5 +1,5 @@
"""
Configuration settings for Doc-Sum Application
Configuration settings for FinSights Application
"""

import os
@@ -10,8 +10,39 @@

# OpenAI Configuration (optional)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Optional fallback model when MODEL_NAME is not set
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")

# Unified Provider Configuration (primary)
API_ENDPOINT = os.getenv("API_ENDPOINT", "")
API_TOKEN = os.getenv("API_TOKEN", "")
MODEL_NAME = os.getenv("MODEL_NAME", "")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "")
PROVIDER_NAME = os.getenv("PROVIDER_NAME", "")
VERIFY_SSL = os.getenv("VERIFY_SSL", "true")
LOCAL_URL_ENDPOINT = os.getenv("LOCAL_URL_ENDPOINT", "not-needed")
EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", "same")
EMBEDDING_ENDPOINT = os.getenv("EMBEDDING_ENDPOINT", "")
EMBEDDING_API = os.getenv("EMBEDDING_API", "")
EMBEDDING_API_ENDPOINT = os.getenv("EMBEDDING_API_ENDPOINT", "")
EMBEDDING_API_TOKEN = os.getenv("EMBEDDING_API_TOKEN", "")
EMBEDDING_PROVIDER_NAME = os.getenv("EMBEDDING_PROVIDER_NAME", "")

# Optional embedding overrides (advanced)
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
INFERENCE_EMBEDDING_MODEL_NAME = os.getenv("INFERENCE_EMBEDDING_MODEL_NAME", "")
OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "")

# Legacy compatibility (optional)
LLM_PROVIDER = os.getenv("LLM_PROVIDER", "")
INFERENCE_API_ENDPOINT = os.getenv("INFERENCE_API_ENDPOINT", "")
INFERENCE_API_TOKEN = os.getenv("INFERENCE_API_TOKEN", "")
INFERENCE_MODEL_NAME = os.getenv("INFERENCE_MODEL_NAME", "")
OLLAMA_ENDPOINT = os.getenv("OLLAMA_ENDPOINT", "")
OLLAMA_TOKEN = os.getenv("OLLAMA_TOKEN", "")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "")
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME", "")



# LLM Configuration (tuned for section summaries)
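Note: this hunk only declares the env keys; the comment "Optional fallback model when MODEL_NAME is not set" implies a resolution order that is not shown here. A minimal sketch of that fallback, assuming a hypothetical helper (not part of the PR):

import config

def effective_model_name() -> str:
    # Unified MODEL_NAME wins; legacy OPENAI_MODEL (default "gpt-4o-mini") is the fallback.
    return config.MODEL_NAME or config.OPENAI_MODEL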
43 changes: 30 additions & 13 deletions backend/server.py
@@ -1,18 +1,21 @@
"""
FastAPI server for Doc-Sum Application (OpenAI-only)
FastAPI server for Doc-Sum Application
"""

import logging
import time
from fastapi import FastAPI
from fastapi import Request
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

import config
from models import HealthResponse
from api.routes import router
from services.observability_service import observability_service

# IMPORTANT: llm_service is inside the "service" package
from services.llm_service import llm_service
# IMPORTANT: import the llm_service singleton object directly
from services.llm.llm_service import llm_service

# Configure logging
logging.basicConfig(
@@ -40,6 +43,22 @@
# Include API routes
app.include_router(router)

@app.middleware("http")
async def request_observability_middleware(request: Request, call_next):
started = time.perf_counter()
ctx_tokens = observability_service.set_request_context(request.url.path, request.method)
status_code = 500
try:
response = await call_next(request)
status_code = response.status_code
return response
finally:
observability_service.record_request(
status_code=status_code,
duration_ms=(time.perf_counter() - started) * 1000.0,
)
observability_service.reset_request_context(ctx_tokens)


@app.get("/")
def root():
@@ -51,17 +70,17 @@ def root():
"docs": "/docs",
"health": "/health",
"config": {
"llm_provider": "OpenAI",
"llm_model": config.OPENAI_MODEL,
"openai_configured": bool(config.OPENAI_API_KEY),
"llm_provider": llm_service.get_provider_name(),
"llm_model": llm_service.model,
"api_token_configured": bool(config.API_TOKEN or config.OPENAI_API_KEY),
},
}
return response


@app.get("/health", response_model=HealthResponse)
def health_check():
"""Detailed health check - OpenAI only"""
"""Detailed health check"""
response_data = {
"status": "healthy",
"service": config.APP_TITLE,
@@ -70,9 +89,7 @@ def health_check():

llm_health = llm_service.health_check()

# Only set fields that likely exist in your HealthResponse model
# Keep the original fields + add llm_provider (as your old code did)
response_data["llm_provider"] = "OpenAI"
response_data["llm_provider"] = llm_health.get("provider", llm_service.get_provider_name())

# If OpenAI isn't configured or health check fails, mark unhealthy
if llm_health.get("status") in ("not_configured", "unhealthy"):
@@ -87,9 +104,9 @@ async def startup_event():
logger.info("=" * 60)
logger.info(f"Starting {config.APP_TITLE} v{config.APP_VERSION}")
logger.info("=" * 60)
logger.info("LLM Provider: OpenAI")
logger.info(f"OpenAI Configured: {bool(config.OPENAI_API_KEY)}")
logger.info(f"Model: {config.OPENAI_MODEL}")
logger.info(f"LLM Provider: {llm_service.get_provider_name()}")
logger.info(f"API Token Configured: {bool(config.API_TOKEN or config.OPENAI_API_KEY)}")
logger.info(f"Model: {llm_service.model}")
logger.info(f"Port: {config.SERVICE_PORT}")
logger.info("=" * 60)

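Note: the middleware relies on observability_service handing back tokens from set_request_context and restoring them in reset_request_context. A minimal sketch of one way to back that with contextvars — an assumption for illustration, not the service's actual implementation:

import contextvars

# Hypothetical context variables mirroring what the middleware passes in.
_request_path = contextvars.ContextVar("request_path", default="")
_request_method = contextvars.ContextVar("request_method", default="")

def set_request_context(path: str, method: str):
    # Each .set() returns a Token so the previous value can be restored.
    return (_request_path.set(path), _request_method.set(method))

def reset_request_context(tokens) -> None:
    path_token, method_token = tokens
    _request_path.reset(path_token)
    _request_method.reset(method_token)

The try/finally in the middleware guarantees that record_request and the context reset run even when call_next raises, which is why status_code is pre-seeded to 500.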
4 changes: 2 additions & 2 deletions backend/services/__init__.py
@@ -1,6 +1,6 @@
"""Services module - Business logic layer"""

from .pdf_service import pdf_service
from .llm_service import llm_service
from .pdf import pdf_service
from .llm.llm_service import llm_service

__all__ = ["pdf_service", "llm_service"]
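Note: because the package still re-exports both singletons, existing call sites keep working unchanged:

from services import pdf_service, llm_service  # same import path as before

llm_service.health_check()  # e.g. as used by the /health route above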
3 changes: 3 additions & 0 deletions backend/services/llm/__init__.py
@@ -0,0 +1,3 @@
from .llm_service import llm_service

__all__ = ["llm_service"]