diff --git a/.github/workflows/code-scans.yaml b/.github/workflows/code-scans.yaml index 18e5cd9..5139404 100644 --- a/.github/workflows/code-scans.yaml +++ b/.github/workflows/code-scans.yaml @@ -37,7 +37,7 @@ jobs: run: mkdir -p trivy-reports - name: Run Trivy FS Scan - uses: aquasecurity/trivy-action@0.24.0 + uses: aquasecurity/trivy-action@0.35.0 with: scan-type: 'fs' scan-ref: '.' diff --git a/README.md b/README.md index 197e780..97f31a1 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,13 @@ AI-powered financial document analysis with intelligent section-based summarizat - [Project Structure](#project-structure) - [Usage Guide](#usage-guide) - [Environment Variables](#environment-variables) +- [Inference Benchmarks](#inference-benchmarks) +- [Model Capabilities](#model-capabilities) + - [Meta Llama 3.2 3B Instruct](#meta-llama-32-3b-instruct) + - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15) + - [OpenAI text-embedding-3-small](#openai-text-embedding-3-small) + - [GPT-4o-mini](#gpt-4o-mini) + - [Comparison Summary](#comparison-summary) - [Technology Stack](#technology-stack) - [Troubleshooting](#troubleshooting) - [License](#license) @@ -361,6 +368,119 @@ This blueprint uses a **document-cached RAG approach without static chunking**. +--- + +## Inference Benchmarks + +The table below compares inference performance across different providers, deployment modes, and hardware profiles using a standardized FinSights document analysis workload (averaged over 3 runs of the full pipeline: initial summary, overall summary, section summary, RAG indexing, and RAG chat). 
+ + +| Provider | LLM Model | Embedding Model | Deployment | Avg Input Tokens/Gen | Avg Output Tokens/Gen | Avg Total Tokens/Gen | P50 Latency (ms) | P95 Latency (ms) | Throughput (req/s) | Hardware | +| -------------- | ------------------------------ | ---------------------------- | -------------------- | -------------------- | --------------------- | -------------------- | ---------------- | ---------------- | ------------------ | -------------------------------------- | +| vLLM | `Llama-3.2-3B-Instruct` | `BAAI/bge-base-en-v1.5` | Local | 441 | 127 | 568 | 15,283 | 59,437 | 0.050 | Apple Silicon (Metal) (MacBook Pro M4) | +| [Intel OPEA EI](https://github.com/opea-project/Enterprise-Inference) | `Llama-3.2-3B-Instruct` | `BAAI/bge-base-en-v1.5` | Enterprise (On-Prem) | 444 | 122 | 566 | 4,393 | 23,270 | 0.133 | CPU-only (Xeon) | +| OpenAI (Cloud) | `gpt-4o-mini` | `text-embedding-3-small` | API (Cloud) | 411 | 133 | 544 | 2,772 | 11,906 | 0.221 | N/A | + + +> **Notes:** +> +> - All benchmarks use the same FinSights document analysis pipeline. Token counts may vary slightly per run due to non-deterministic model output. +> - vLLM on Apple Silicon uses Metal (MPS) GPU acceleration for the LLM and CPU-based vLLM for the BERT embedding model (`BAAI/bge-base-en-v1.5`). +> - [Intel OPEA Enterprise Inference](https://github.com/opea-project/Enterprise-Inference) runs on Intel Xeon CPUs without GPU acceleration. +> - Each benchmark run exercises 5 generations: initial summary, overall summary, section summary, RAG indexing (embeddings), and RAG chat. +> - Langfuse tracing is used for full observability of each benchmark run. + +--- + +## Model Capabilities + +### Meta Llama 3.2 3B Instruct + +A 3-billion-parameter open-weight model from Meta's Llama family, optimized for instruction-following and on-device deployment. 
+ + | Attribute | Details | +| --------------------------- | -------------------------------------------------------------------------------------------- | +| **Parameters** | 3.21B | +| **Architecture** | Transformer with Grouped Query Attention (GQA) — 28 layers, 24 Q-heads / 8 KV-heads | +| **Context Window** | 128,000 tokens | +| **Instruction Tuning** | RLHF + supervised fine-tuning on instruction data | +| **Multilingual** | English, German, French, Italian, Portuguese, Hindi, Spanish, Thai | +| **Quantization Formats** | GGUF, AWQ, GPTQ, MLX (4-bit) | +| **Inference Runtimes** | vLLM, Ollama, llama.cpp, LMStudio, SGLang, TGI | +| **License** | Llama 3.2 Community License (permissive, with acceptable use policy) | +| **Deployment** | Local, on-prem, air-gapped, cloud — full data sovereignty | + + +### BAAI/bge-base-en-v1.5 + +A 109M-parameter BERT-based embedding model from BAAI, widely used for retrieval and RAG pipelines. + + +| Attribute | Details | +| --------------------------- | ---------------------------------------------------------- | +| **Parameters** | 109M | +| **Architecture** | BERT base (12 layers, 768 hidden dim) | +| **Embedding Dimensions** | 768 | +| **Max Sequence Length** | 512 tokens | +| **MTEB Retrieval Score** | 53.25 (competitive with models 3x its size) | +| **Inference Runtimes** | sentence-transformers, vLLM (CPU), ONNX, TGI | +| **License** | MIT | +| **Deployment** | Local, on-prem, air-gapped — lightweight enough for CPU | + + +### OpenAI text-embedding-3-small + +OpenAI's compact embedding model, used for RAG indexing and retrieval when running with the OpenAI provider. 
+ + | Attribute | Details | +| --------------------------- | ---------------------------------------------------------- | +| **Parameters** | Not publicly disclosed | +| **Embedding Dimensions** | 1,536 (default) or 512 (with `dimensions` parameter) | +| **Max Sequence Length** | 8,191 tokens | +| **MIRACL Avg Score** | 44.0 (multilingual retrieval; MTEB average: 62.3) | +| **Pricing** | $0.02 / 1M tokens | +| **License** | Proprietary (OpenAI Terms of Use) | +| **Deployment** | Cloud-only — OpenAI API or Azure OpenAI Service | + + +### GPT-4o-mini + +OpenAI's cost-efficient multimodal model, accessible exclusively via cloud API. + + +| Attribute | Details | +| --------------------------- | --------------------------------------------------------------------------------- | +| **Parameters** | Not publicly disclosed | +| **Architecture** | Multimodal Transformer (text + image input, text output) | +| **Context Window** | 128,000 tokens input / 16,384 tokens max output | +| **Tool / Function Calling** | Supported; parallel function calling | +| **Structured Output** | JSON mode and strict JSON schema adherence supported | +| **Multilingual** | Broad multilingual support | +| **Pricing** | $0.15 / 1M input tokens, $0.60 / 1M output tokens (Batch API: 50% discount) | +| **Fine-Tuning** | Supervised fine-tuning via OpenAI API | +| **License** | Proprietary (OpenAI Terms of Use) | +| **Deployment** | Cloud-only — OpenAI API or Azure OpenAI Service. 
No self-hosted or on-prem option | + + +### Comparison Summary + + +| Capability | Llama 3.2 3B Instruct | GPT-4o-mini | +| ------------------------------- | -------------------------------- | --------------------------------- | +| Financial document analysis | Yes | Yes | +| RAG-based document chat | Yes | Yes | +| On-prem / air-gapped deployment | Yes | No | +| Data sovereignty | Full (weights run locally) | No (data sent to cloud API) | +| Open weights | Yes (Llama Community License) | No (proprietary) | +| Custom fine-tuning | Full fine-tuning + LoRA adapters | Supervised fine-tuning (API only) | +| Multimodal (image input) | No | Yes | +| Native context window | 128K | 128K | + + +> Both models support financial document analysis and RAG-based chat. However, only Llama 3.2 offers open weights, data sovereignty, and local deployment flexibility — making it suitable for air-gapped, regulated, or cost-sensitive environments. GPT-4o-mini offers lower latency and higher throughput via OpenAI's cloud infrastructure, with added multimodal capabilities. 
+ --- ## Technology Stack diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 5fe35d0..9b30ee7 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,7 +8,7 @@ "name": "finsights-react-ui", "version": "1.0.0", "dependencies": { - "jspdf": "^4.2.0", + "jspdf": "^4.2.1", "lucide-react": "^0.294.0", "react": "^18.2.0", "react-dom": "^18.2.0", @@ -63,6 +63,7 @@ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.28.5.tgz", "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==", "dev": true, + "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.5", @@ -1326,6 +1327,7 @@ "url": "https://github.com/sponsors/ai" } ], + "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -1474,7 +1476,8 @@ "node_modules/csstype": { "version": "3.2.3", "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", - "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==" + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "peer": true }, "node_modules/debug": { "version": "4.4.3", @@ -1789,6 +1792,7 @@ "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz", "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", "dev": true, + "peer": true, "bin": { "jiti": "bin/jiti.js" } @@ -1823,9 +1827,9 @@ } }, "node_modules/jspdf": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/jspdf/-/jspdf-4.2.0.tgz", - "integrity": "sha512-hR/hnRevAXXlrjeqU5oahOE+Ln9ORJUB5brLHHqH67A+RBQZuFr5GkbI9XQI8OUFSEezKegsi45QRpc4bGj75Q==", + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/jspdf/-/jspdf-4.2.1.tgz", + "integrity": 
"sha512-YyAXyvnmjTbR4bHQRLzex3CuINCDlQnBqoSYyjJwTP2x9jDLuKDzy7aKUl0hgx3uhcl7xzg32agn5vlie6HIlQ==", "license": "MIT", "dependencies": { "@babel/runtime": "^7.28.6", @@ -2047,6 +2051,7 @@ "url": "https://github.com/sponsors/ai" } ], + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -2217,6 +2222,7 @@ "version": "18.3.1", "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", + "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -2228,6 +2234,7 @@ "version": "18.3.1", "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", + "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -2610,6 +2617,7 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, + "peer": true, "engines": { "node": ">=12" }, @@ -2685,6 +2693,7 @@ "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz", "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, + "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", diff --git a/frontend/package.json b/frontend/package.json index df7df27..07e883e 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -8,12 +8,12 @@ "preview": "vite preview" }, "dependencies": { - "jspdf": "^4.2.0", + "jspdf": "^4.2.1", "lucide-react": "^0.294.0", "react": "^18.2.0", "react-dom": "^18.2.0", "react-hot-toast": "^2.4.1", - "react-router-dom": "^6.30.3" + "react-router-dom": "^6.30.3" }, "devDependencies": { "@vitejs/plugin-react": "^4.2.0",