diff --git a/README.md b/README.md index 21788a3..431e984 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,12 @@ An AI-powered application that generates comprehensive system design specificati - [Local Development Setup](#local-development-setup) - [Project Structure](#project-structure) - [Usage Guide](#usage-guide) + - [Performance Tips](#performance-tips) + - [Inference Benchmarks](#inference-benchmarks) + - [Model Capabilities](#model-capabilities) + - [GPT-4o](#gpt-4o) + - [Llama 3.2 3B Instruct](#llama-32-3b-instruct) + - [Comparison Summary](#comparison-summary) - [LLM Provider Configuration](#llm-provider-configuration) - [OpenAI](#openai) - [Groq](#groq) @@ -304,6 +310,98 @@ SpecForge/ --- +## Performance Tips + +- **Use larger context windows for complex projects.** Models with 128K+ context (like GPT-4o) can handle more detailed requirements without truncation. For Llama 3.2 3B — which natively supports 128K but is often served behind gateways that cap the context at ~8K — reduce `LLM_MAX_TOKENS` to leave room for prompts. +- **Lower `LLM_TEMPERATURE`** (e.g., `0.3–0.5`) for more consistent, structured specifications. Raise it slightly (e.g., `0.7–0.9`) for more creative architectural suggestions. +- **Provide detailed answers to clarifying questions.** The more context you provide, the more accurate and comprehensive the generated specification will be. +- **Use the refinement feature iteratively.** Start with a basic spec, then refine specific sections (e.g., "Add Redis caching layer", "Switch to PostgreSQL") rather than regenerating from scratch. +- **On Apple Silicon**, always run Ollama natively — never inside Docker. The MPS (Metal) GPU backend delivers significantly better throughput than CPU-only inference. +- **For enterprise deployments**, choose a model optimized for long-form technical writing. GPT-4o and Claude 3.5 Sonnet excel at structured documentation. 
+ +--- + +## Inference Benchmarks + +The table below compares inference performance across different providers and models using a standardized SpecForge workload (3 runs: questions generation + spec generation with 1000 max output tokens). + +| Provider | Model | Deployment | Context Window | Avg Input Tokens | Avg Output Tokens | Avg Tokens / Request | P50 Latency (ms) | P95 Latency (ms) | Throughput (req/s) | Hardware | +| -------------- | ------------------------------ | -------------------- | -------------- | ---------------- | ----------------- | -------------------- | ---------------- | ---------------- | ------------------ | ---------------- | +| OpenAI (Cloud) | `gpt-4o` | API (Cloud) | 128K | 4,018 | 875 | 4,893 | 13,540 | 24,892 | 0.074 | Cloud GPUs | +| LiteLLM | `meta-llama/Llama-3.2-3B-Instruct` | Enterprise Gateway | 8.1K | 4,158 | 823 | 4,982 | 33,911 | 38,391 | 0.035 | CPU (Xeon) | + +> **Notes:** +> +> - All benchmarks use identical SpecForge workflows: idea input → 5 questions → spec generation with `LLM_MAX_TOKENS=1000`. +> - Token counts are actual values from API responses (not estimates). +> - GPT-4o delivers 2.5x faster P50 latency and 2.1x better throughput compared to Llama 3.2 3B on the tested infrastructure. +> - Llama 3.2 3B performance is limited by CPU-only inference on the test gateway. Local GPU inference would significantly improve these numbers. +> - The 8.1K context window listed for Llama 3.2 3B is the gateway's configured limit, not the model's native capability — the model natively supports 128K (see [Model Capabilities](#model-capabilities)). + +--- + +## Model Capabilities + +### GPT-4o + +OpenAI's flagship multimodal model, optimized for speed and intelligence across text and vision tasks. 
+ +| Attribute | Details | +| --------------------------- | --------------------------------------------------------------------------------- | +| **Parameters** | Not publicly disclosed | +| **Architecture** | Multimodal Transformer (text + image input, text output) | +| **Context Window** | 128,000 tokens input / 16,384 tokens max output | +| **Reasoning Mode** | Standard inference with strong chain-of-thought reasoning | +| **Tool / Function Calling** | Supported; parallel function calling | +| **Structured Output** | JSON mode and strict JSON schema adherence supported | +| **Multilingual** | Broad multilingual support (50+ languages) | +| **Benchmarks** | Strong performance on system design, architectural decision-making, and technical documentation | +| **Pricing** | $2.50 / 1M input tokens, $10.00 / 1M output tokens (as of 2024) | +| **Fine-Tuning** | Supervised fine-tuning via OpenAI API | +| **License** | Proprietary (OpenAI Terms of Use) | +| **Deployment** | Cloud-only — OpenAI API or Azure OpenAI Service. No self-hosted option | +| **Knowledge Cutoff** | October 2023 | + +### Llama 3.2 3B Instruct + +Meta's small-scale open-weight instruction-tuned model, designed for edge and on-premises deployment. 
+ +| Attribute | Details | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------- | +| **Parameters** | 3.21B total parameters | +| **Architecture** | Transformer decoder with Grouped Query Attention (GQA) | +| **Context Window** | 131,072 tokens (128K) native | +| **Reasoning Mode** | Standard instruction-following (no explicit chain-of-thought mode) | +| **Tool / Function Calling** | Limited native support; can be prompted for structured output | +| **Structured Output** | JSON formatting supported via prompting | +| **Multilingual** | Primarily English-focused with limited multilingual capabilities | +| **Benchmarks** | MMLU: 63.4%, strong small-model performance for reasoning tasks | +| **Quantization Formats** | GGUF, GPTQ, AWQ — runs on consumer hardware (4GB+ RAM) | +| **Inference Runtimes** | Ollama, vLLM, llama.cpp, LMStudio, Transformers | +| **Fine-Tuning** | Full fine-tuning and LoRA adapters supported | +| **License** | Llama 3.2 Community License (open for research and commercial use) | +| **Deployment** | Local, on-prem, air-gapped, cloud — full data sovereignty | + +### Comparison Summary + +| Capability | GPT-4o | Llama 3.2 3B Instruct | +| ------------------------------- | -------------------------------- | -------------------------------- | +| System design specifications | Excellent | Good | +| Architectural diagrams | Excellent | Good (requires careful prompting)| +| Technical documentation | Excellent | Good | +| Function / tool calling | Native support | Prompt-based | +| JSON structured output | Native with schema validation | Prompt-based | +| On-prem / air-gapped deployment | No | Yes | +| Data sovereignty | No (cloud API) | Full (weights run locally) | +| Open weights | No (proprietary) | Yes (Llama 3.2 License) | +| Custom fine-tuning | API-based only | Full fine-tuning + LoRA | +| Edge device deployment | N/A | Yes (quantized variants) | +| 
Multimodal (image input) | Yes | No | +| Native context window | 128K | 128K | + +> Both models can generate system design specifications, though GPT-4o produces more comprehensive and detailed output with better architectural reasoning. Llama 3.2 3B excels in air-gapped environments, cost-sensitive deployments, and scenarios requiring data sovereignty. + +--- + ## LLM Provider Configuration All providers are configured via the `.env` file. Set `INFERENCE_PROVIDER=remote` for any cloud or API-based provider, and `INFERENCE_PROVIDER=ollama` for local inference. diff --git a/backend/Dockerfile b/backend/Dockerfile index bbc93a9..e0a344a 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -2,6 +2,9 @@ FROM python:3.11-slim WORKDIR /app +# Upgrade pip, setuptools, and wheel to fix security vulnerabilities +# NOTE: the version specifiers must be quoted — otherwise the shell treats ">=79.1.0" +# as an output redirection, silently dropping the version floors. +RUN pip install --no-cache-dir --upgrade pip "setuptools>=79.1.0" "wheel>=0.46.2" + # Copy requirements first for better caching COPY requirements.txt . diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 18b1c7d..095d52e 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,7 +8,7 @@ "name": "frontend", "version": "0.0.0", "dependencies": { - "mermaid": "^11.13.0", + "mermaid": "^11.14.0", "react": "^19.2.4", "react-dom": "^19.2.4", "react-markdown": "^10.1.0", @@ -635,9 +635,9 @@ } }, "node_modules/@mermaid-js/parser": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@mermaid-js/parser/-/parser-1.0.1.tgz", - "integrity": "sha512-opmV19kN1JsK0T6HhhokHpcVkqKpF+x2pPDKKM2ThHtZAB5F4PROopk0amuVYK5qMrIA4erzpNm8gmPNJgMDxQ==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@mermaid-js/parser/-/parser-1.1.0.tgz", + "integrity": "sha512-gxK9ZX2+Fex5zu8LhRQoMeMPEHbc73UKZ0FQ54YrQtUxE1VVhMwzeNtKRPAu5aXks4FasbMe4xB4bWrmq6Jlxw==", "license": "MIT", "dependencies": { "langium": "^4.0.0" @@ -1434,9 +1434,9 @@ } }, "node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": 
"https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "version": "1.1.13", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.13.tgz", + "integrity": "sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==", "dev": true, "license": "MIT", "dependencies": { @@ -2597,9 +2597,9 @@ } }, "node_modules/flatted": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.1.tgz", - "integrity": "sha512-IxfVbRFVlV8V/yRaGzk0UVIcsKKHMSfYw66T/u4nTwlWteQePsxe//LjudR1AMX4tZW3WFCh3Zqa/sjlqpbURQ==", + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", + "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", "dev": true, "license": "ISC" }, @@ -3309,9 +3309,9 @@ } }, "node_modules/lodash-es": { - "version": "4.17.23", - "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.23.tgz", - "integrity": "sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==", + "version": "4.18.1", + "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.18.1.tgz", + "integrity": "sha512-J8xewKD/Gk22OZbhpOVSwcs60zhd95ESDwezOFuA3/099925PdHJ7OFHNTGtajL3AlZkykD32HykiMo+BIBI8A==", "license": "MIT" }, "node_modules/lodash.merge": { @@ -3646,14 +3646,14 @@ } }, "node_modules/mermaid": { - "version": "11.13.0", - "resolved": "https://registry.npmjs.org/mermaid/-/mermaid-11.13.0.tgz", - "integrity": "sha512-fEnci+Immw6lKMFI8sqzjlATTyjLkRa6axrEgLV2yHTfv8r+h1wjFbV6xeRtd4rUV1cS4EpR9rwp3Rci7TRWDw==", + "version": "11.14.0", + "resolved": "https://registry.npmjs.org/mermaid/-/mermaid-11.14.0.tgz", + "integrity": "sha512-GSGloRsBs+JINmmhl0JDwjpuezCsHB4WGI4NASHxL3fHo3o/BRXTxhDLKnln8/Q0lRFRyDdEjmk1/d5Sn1Xz8g==", "license": "MIT", 
"dependencies": { "@braintree/sanitize-url": "^7.1.1", "@iconify/utils": "^3.0.2", - "@mermaid-js/parser": "^1.0.1", + "@mermaid-js/parser": "^1.1.0", "@types/d3": "^7.4.3", "@upsetjs/venn.js": "^2.0.0", "cytoscape": "^3.33.1", @@ -4435,9 +4435,9 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", - "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "dev": true, "license": "MIT", "engines": { diff --git a/frontend/package.json b/frontend/package.json index b524efb..3b177da 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -10,7 +10,7 @@ "preview": "vite preview" }, "dependencies": { - "mermaid": "^11.13.0", + "mermaid": "^11.14.0", "react": "^19.2.4", "react-dom": "^19.2.4", "react-markdown": "^10.1.0", @@ -26,5 +26,8 @@ "eslint-plugin-react-refresh": "^0.5.2", "globals": "^17.4.0", "vite": "^8.0.0" + }, + "overrides": { + "lodash-es": "^4.18.0" } }