From 8af7b4337858e113533bd0da5f2d664990f30781 Mon Sep 17 00:00:00 2001
From: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
Date: Tue, 24 Mar 2026 19:39:43 -0500
Subject: [PATCH 1/3] Add inference benchmarks and model capabilities to README

Add benchmark results comparing vLLM local, Intel OPEA Enterprise
Inference, and OpenAI cloud tiers across the full FinSights document
analysis pipeline. Include model capability tables for Llama 3.2 3B
Instruct, BAAI/bge-base-en-v1.5, text-embedding-3-small, and
GPT-4o-mini with a side-by-side comparison summary.
---
 README.md | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/README.md b/README.md
index 197e780..97f31a1 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,13 @@ AI-powered financial document analysis with intelligent section-based summarizat
 - [Project Structure](#project-structure)
 - [Usage Guide](#usage-guide)
 - [Environment Variables](#environment-variables)
+- [Inference Benchmarks](#inference-benchmarks)
+- [Model Capabilities](#model-capabilities)
+  - [Meta Llama 3.2 3B Instruct](#meta-llama-32-3b-instruct)
+  - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
+  - [OpenAI text-embedding-3-small](#openai-text-embedding-3-small)
+  - [GPT-4o-mini](#gpt-4o-mini)
+  - [Comparison Summary](#comparison-summary)
 - [Technology Stack](#technology-stack)
 - [Troubleshooting](#troubleshooting)
 - [License](#license)
@@ -361,6 +368,119 @@ This blueprint uses a **document-cached RAG approach without static chunking**.
 
 
 
+---
+
+## Inference Benchmarks
+
+The table below compares inference performance across different providers, deployment modes, and hardware profiles using a standardized FinSights document analysis workload (averaged over 3 runs of the full pipeline: initial summary, overall summary, section summary, RAG indexing, and RAG chat).
+
+
+| Provider       | LLM Model                      | Embedding Model              | Deployment           | Avg Input Tokens/Gen | Avg Output Tokens/Gen | Avg Total Tokens/Gen | P50 Latency (ms) | P95 Latency (ms) | Throughput (req/s) | Hardware                               |
+| -------------- | ------------------------------ | ---------------------------- | -------------------- | -------------------- | --------------------- | -------------------- | ---------------- | ---------------- | ------------------ | -------------------------------------- |
+| vLLM           | `Llama-3.2-3B-Instruct`       | `BAAI/bge-base-en-v1.5`     | Local                | 441                  | 127                   | 568                  | 15,283           | 59,437           | 0.050              | MacBook Pro M4 (Apple Silicon, Metal)  |
+| [Intel OPEA EI](https://github.com/opea-project/Enterprise-Inference)  | `Llama-3.2-3B-Instruct`       | `BAAI/bge-base-en-v1.5`     | Enterprise (On-Prem) | 444                  | 122                   | 566                  | 4,393            | 23,270           | 0.133              | CPU-only (Xeon)                        |
+| OpenAI (Cloud) | `gpt-4o-mini`                  | `text-embedding-3-small`     | API (Cloud)          | 411                  | 133                   | 544                  | 2,772            | 11,906           | 0.221              | N/A                                    |
+
+
+> **Notes:**
+>
+> - All benchmarks use the same FinSights document analysis pipeline. Token counts may vary slightly per run due to non-deterministic model output.
+> - On Apple Silicon, vLLM runs the LLM with Metal (MPS) GPU acceleration, while the BERT-based embedding model (`BAAI/bge-base-en-v1.5`) is served by a CPU-only vLLM instance.
+> - [Intel OPEA Enterprise Inference](https://github.com/opea-project/Enterprise-Inference) runs on Intel Xeon CPUs without GPU acceleration.
+> - Each benchmark run exercises 5 generations: initial summary, overall summary, section summary, RAG indexing (embeddings), and RAG chat.
+> - Langfuse tracing is used for full observability of each benchmark run.
+
+---
+
+## Model Capabilities
+
+### Meta Llama 3.2 3B Instruct
+
+A 3-billion-parameter open-weight model from Meta's Llama family, optimized for instruction-following and on-device deployment.
+
+
+| Attribute                   | Details                                                                                      |
+| --------------------------- | -------------------------------------------------------------------------------------------- |
+| **Parameters**              | 3.21B                                                                                        |
+| **Architecture**            | Transformer with Grouped Query Attention (GQA) — 28 layers, 24 Q-heads / 8 KV-heads          |
+| **Context Window**          | 128,000 tokens                                                                               |
+| **Instruction Tuning**      | RLHF + supervised fine-tuning on instruction data                                            |
+| **Multilingual**            | English, German, French, Italian, Portuguese, Hindi, Spanish, Thai                           |
+| **Quantization Formats**    | GGUF, AWQ, GPTQ, MLX (4-bit)                                                                |
+| **Inference Runtimes**      | vLLM, Ollama, llama.cpp, LMStudio, SGLang, TGI                                               |
+| **License**                 | Llama 3.2 Community License (permissive, with acceptable use policy)                         |
+| **Deployment**              | Local, on-prem, air-gapped, cloud — full data sovereignty                                    |
+
+
+### BAAI/bge-base-en-v1.5
+
+A 109M-parameter BERT-based embedding model from BAAI, widely used for retrieval and RAG pipelines.
+
+
+| Attribute                   | Details                                                    |
+| --------------------------- | ---------------------------------------------------------- |
+| **Parameters**              | 109M                                                       |
+| **Architecture**            | BERT base (12 layers, 768 hidden dim)                      |
+| **Embedding Dimensions**    | 768                                                        |
+| **Max Sequence Length**      | 512 tokens                                                 |
+| **MTEB Retrieval Score**    | 53.25 (competitive with models 3x its size)                |
+| **Inference Runtimes**      | sentence-transformers, vLLM (CPU), ONNX, TGI               |
+| **License**                 | MIT                                                        |
+| **Deployment**              | Local, on-prem, air-gapped — lightweight enough for CPU    |
+
+
+### OpenAI text-embedding-3-small
+
+OpenAI's compact embedding model, used for RAG indexing and retrieval when running with the OpenAI provider.
+
+
+| Attribute                   | Details                                                    |
+| --------------------------- | ---------------------------------------------------------- |
+| **Parameters**              | Not publicly disclosed                                     |
+| **Embedding Dimensions**    | 1,536 (default); can be shortened (e.g., to 512) via the `dimensions` parameter |
+| **Max Sequence Length**      | 8,191 tokens                                               |
+| **MTEB Retrieval Score**    | 44.0                                                       |
+| **Pricing**                 | $0.02 / 1M tokens                                          |
+| **License**                 | Proprietary (OpenAI Terms of Use)                          |
+| **Deployment**              | Cloud-only — OpenAI API or Azure OpenAI Service            |
+
+
+### GPT-4o-mini
+
+OpenAI's cost-efficient multimodal model, accessible exclusively via cloud API.
+
+
+| Attribute                   | Details                                                                           |
+| --------------------------- | --------------------------------------------------------------------------------- |
+| **Parameters**              | Not publicly disclosed                                                            |
+| **Architecture**            | Multimodal Transformer (text + image input, text output)                          |
+| **Context Window**          | 128,000 tokens input / 16,384 tokens max output                                   |
+| **Tool / Function Calling** | Supported; parallel function calling                                              |
+| **Structured Output**       | JSON mode and strict JSON schema adherence supported                              |
+| **Multilingual**            | Broad multilingual support                                                        |
+| **Pricing**                 | $0.15 / 1M input tokens, $0.60 / 1M output tokens (Batch API: 50% discount)       |
+| **Fine-Tuning**             | Supervised fine-tuning via OpenAI API                                             |
+| **License**                 | Proprietary (OpenAI Terms of Use)                                                 |
+| **Deployment**              | Cloud-only — OpenAI API or Azure OpenAI Service. No self-hosted or on-prem option |
+
+
+### Comparison Summary
+
+
+| Capability                      | Llama 3.2 3B Instruct            | GPT-4o-mini                       |
+| ------------------------------- | -------------------------------- | --------------------------------- |
+| Financial document analysis     | Yes                              | Yes                               |
+| RAG-based document chat         | Yes                              | Yes                               |
+| On-prem / air-gapped deployment | Yes                              | No                                |
+| Data sovereignty                | Full (weights run locally)       | No (data sent to cloud API)       |
+| Open weights                    | Yes (Llama Community License)    | No (proprietary)                  |
+| Custom fine-tuning              | Full fine-tuning + LoRA adapters | Supervised fine-tuning (API only) |
+| Multimodal (image input)        | No                               | Yes                               |
+| Native context window           | 128K                             | 128K                              |
+
+
+> Both models support financial document analysis and RAG-based chat. However, only Llama 3.2 offers open weights, data sovereignty, and local deployment flexibility — making it suitable for air-gapped, regulated, or cost-sensitive environments. In the benchmarks above, GPT-4o-mini delivered lower latency and higher throughput via OpenAI's cloud infrastructure, and it adds multimodal (image input) capabilities.
+
 ---
 
 ## Technology Stack

From baffc9188d9512fea1c8585a8e4542aa0b5109b9 Mon Sep 17 00:00:00 2001
From: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
Date: Tue, 24 Mar 2026 19:47:10 -0500
Subject: [PATCH 2/3] Update trivy-action from 0.24.0 to 0.35.0

Fix CI failure caused by unresolvable trivy-action version 0.24.0.
---
 .github/workflows/code-scans.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/code-scans.yaml b/.github/workflows/code-scans.yaml
index 18e5cd9..5139404 100644
--- a/.github/workflows/code-scans.yaml
+++ b/.github/workflows/code-scans.yaml
@@ -37,7 +37,7 @@ jobs:
         run: mkdir -p trivy-reports
         
       - name: Run Trivy FS Scan
-        uses: aquasecurity/trivy-action@0.24.0
+        uses: aquasecurity/trivy-action@0.35.0
         with:
           scan-type: 'fs'
           scan-ref: '.'

From ce2d1eb373017f0a2a49bcaa06535243b5f25364 Mon Sep 17 00:00:00 2001
From: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
Date: Tue, 24 Mar 2026 19:57:50 -0500
Subject: [PATCH 3/3] Bump jspdf from 4.2.0 to 4.2.1

Fix CVE-2026-31938 (CRITICAL, XSS) and CVE-2026-31898 (HIGH,
arbitrary code execution) flagged by Trivy scan.
---
 frontend/package-lock.json | 19 ++++++++++++++-----
 frontend/package.json      |  4 ++--
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index 5fe35d0..9b30ee7 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -8,7 +8,7 @@
       "name": "finsights-react-ui",
       "version": "1.0.0",
       "dependencies": {
-        "jspdf": "^4.2.0",
+        "jspdf": "^4.2.1",
         "lucide-react": "^0.294.0",
         "react": "^18.2.0",
         "react-dom": "^18.2.0",
@@ -63,6 +63,7 @@
       "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.28.5.tgz",
       "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==",
       "dev": true,
+      "peer": true,
       "dependencies": {
         "@babel/code-frame": "^7.27.1",
         "@babel/generator": "^7.28.5",
@@ -1326,6 +1327,7 @@
           "url": "https://github.com/sponsors/ai"
         }
       ],
+      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.9.0",
         "caniuse-lite": "^1.0.30001759",
@@ -1474,7 +1476,8 @@
     "node_modules/csstype": {
       "version": "3.2.3",
       "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz",
-      "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ=="
+      "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==",
+      "peer": true
     },
     "node_modules/debug": {
       "version": "4.4.3",
@@ -1789,6 +1792,7 @@
       "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz",
       "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==",
       "dev": true,
+      "peer": true,
       "bin": {
         "jiti": "bin/jiti.js"
       }
@@ -1823,9 +1827,9 @@
       }
     },
     "node_modules/jspdf": {
-      "version": "4.2.0",
-      "resolved": "https://registry.npmjs.org/jspdf/-/jspdf-4.2.0.tgz",
-      "integrity": "sha512-hR/hnRevAXXlrjeqU5oahOE+Ln9ORJUB5brLHHqH67A+RBQZuFr5GkbI9XQI8OUFSEezKegsi45QRpc4bGj75Q==",
+      "version": "4.2.1",
+      "resolved": "https://registry.npmjs.org/jspdf/-/jspdf-4.2.1.tgz",
+      "integrity": "sha512-YyAXyvnmjTbR4bHQRLzex3CuINCDlQnBqoSYyjJwTP2x9jDLuKDzy7aKUl0hgx3uhcl7xzg32agn5vlie6HIlQ==",
       "license": "MIT",
       "dependencies": {
         "@babel/runtime": "^7.28.6",
@@ -2047,6 +2051,7 @@
           "url": "https://github.com/sponsors/ai"
         }
       ],
+      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.11",
         "picocolors": "^1.1.1",
@@ -2217,6 +2222,7 @@
       "version": "18.3.1",
       "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz",
       "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -2228,6 +2234,7 @@
       "version": "18.3.1",
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz",
       "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.2"
@@ -2610,6 +2617,7 @@
       "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -2685,6 +2693,7 @@
       "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz",
       "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==",
       "dev": true,
+      "peer": true,
       "dependencies": {
         "esbuild": "^0.21.3",
         "postcss": "^8.4.43",
diff --git a/frontend/package.json b/frontend/package.json
index df7df27..07e883e 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -8,12 +8,12 @@
     "preview": "vite preview"
   },
   "dependencies": {
-    "jspdf": "^4.2.0",
+    "jspdf": "^4.2.1",
     "lucide-react": "^0.294.0",
     "react": "^18.2.0",
     "react-dom": "^18.2.0",
     "react-hot-toast": "^2.4.1",
-    "react-router-dom": "^6.30.3"
+    "react-router-dom": "^6.30.3"
   },
   "devDependencies": {
     "@vitejs/plugin-react": "^4.2.0",