diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..ecba33e --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,39 @@ +## Summary + + + +- + +## Type of Change + + + +- [ ] Bug fix +- [ ] New feature / enhancement +- [ ] Documentation update +- [ ] Refactor (no behavior change) +- [ ] Chore (dependencies, CI, tooling) + +## Changes Made + + + +Resolves # + +## How to Test + + + +1. + +## Checklist + +- [ ] I have read the [Contributing Guide](../CONTRIBUTING.md) +- [ ] My branch is up to date with `main` +- [ ] New environment variables (if any) are documented in `.env.example` and the README +- [ ] No secrets, API keys, or credentials are included in this PR +- [ ] I have tested my changes locally + +## Screenshots (if applicable) + + \ No newline at end of file diff --git a/.github/workflows/code-scans.yaml b/.github/workflows/code-scans.yaml index 8bf3bf4..a362ade 100644 --- a/.github/workflows/code-scans.yaml +++ b/.github/workflows/code-scans.yaml @@ -7,7 +7,7 @@ on: description: 'Pull request number' required: true push: - branches: [ main ] + branches: [ main, finsights/dev ] pull_request: types: [opened, synchronize, reopened, ready_for_review] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2af36a1..0a9d0c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,7 +33,7 @@ git --version New to contributing? 1. Open an issue or pick an existing one to work on. -2. Sync your branch from `dev`. +2. Sync your branch from `finsights/dev`. 3. Follow the local setup guide below. 4. Run the app locally and verify your change before opening a PR. @@ -193,11 +193,11 @@ This starts: ### Start contributing code? 1. Open or choose an issue. -2. Create a feature branch from `dev`. +2. Create a feature branch from `finsights/dev`. 3. Keep the change focused on a single problem. 4. Run the app locally and verify the affected workflow. 5. 
Update docs when behavior, setup, configuration, or architecture changes. -6. Open a pull request back from your feature branch into `dev`. +6. Open a pull request back from your feature branch into `finsights/dev`. ### Improve the documentation? @@ -218,7 +218,7 @@ Follow the checklist below before opening your PR. Your pull request should: - Include screenshots or short recordings for UI changes. - Reference the related GitHub issue when applicable. -Note: pull requests should target the `dev` branch. +Note: pull requests should target the `finsights/dev` branch. --- @@ -245,7 +245,7 @@ Before submitting your pull request, confirm the following: - You kept the pull request scoped to one issue or topic. - You added screenshots for UI changes when relevant. - You did not commit secrets, API keys, or cached documents. -- You are opening the pull request against `dev`. +- You are opening the pull request against `finsights/dev`. If one or more of these are missing, the pull request may be sent back for changes before review. @@ -253,10 +253,10 @@ If one or more of these are missing, the pull request may be sent back for chang ## Branching model -- Base new work from `dev`. -- Open pull requests against `dev`. +- Base new work from `finsights/dev`. +- Open pull requests against `finsights/dev`. - Use descriptive branch names such as `fix/rag-chat-validation` or `docs/update-contributing-guide`. -- Rebase or merge the latest `dev` before opening your PR if your branch has drifted. +- Rebase or merge the latest `finsights/dev` before opening your PR if your branch has drifted. 
--- diff --git a/backend/api/routes.py b/backend/api/routes.py index 6992f69..73144c9 100644 --- a/backend/api/routes.py +++ b/backend/api/routes.py @@ -23,6 +23,47 @@ router = APIRouter() +def _build_upload_warning(filename: str, document_info: dict) -> dict: + over_size_limit = document_info["file_size_bytes"] > config.MAX_PDF_SIZE + over_page_limit = bool(document_info.get("is_pdf")) and (document_info.get("page_count") or 0) > config.MAX_PDF_PAGES + + reasons = [] + if over_size_limit: + reasons.append( + f'"{filename}" is {document_info["file_size_mb"]} MB, above the {document_info["max_file_size_mb"]} MB limit.' + ) + if over_page_limit: + reasons.append( + f'This PDF has {document_info["page_count"]} pages, and only the first {document_info["pages_to_process"]} pages will be processed.' + ) + + if not reasons: + return {} + + summary = " ".join(reasons) + if over_page_limit: + summary += f" If you continue, FinSights will upload the file but use only the first {document_info['pages_to_process']} pages for extraction." + elif over_size_limit: + summary += " If you continue, FinSights will upload the full file and attempt normal extraction."
+ + return { + "code": "upload_confirmation_required", + "message": summary, + "filename": filename, + "requires_confirmation": True, + "over_size_limit": over_size_limit, + "over_page_limit": over_page_limit, + "file_size_bytes": document_info["file_size_bytes"], + "file_size_mb": document_info["file_size_mb"], + "max_file_size_bytes": document_info["max_file_size_bytes"], + "max_file_size_mb": document_info["max_file_size_mb"], + "page_count": document_info.get("page_count"), + "page_limit": document_info.get("page_limit"), + "pages_to_process": document_info.get("pages_to_process"), + "will_trim_pages": document_info.get("will_trim_pages", False), + } + + @router.get("/health", response_model=HealthResponse) async def health_check(): """Health check endpoint""" @@ -153,6 +194,7 @@ async def summarize_document( section: str = Form(""), # required if mode=financial_section # doc_id for cached section requests (frontend should send this on clicks) doc_id: str = Form(""), + ignore_upload_warnings: str = Form("false"), files: Optional[UploadFile] = File(None), ): """ @@ -168,6 +210,7 @@ async def summarize_document( """ try: stream_bool = stream.lower() == "true" + ignore_upload_warnings_bool = ignore_upload_warnings.lower() == "true" # Enforce backend constraints: # streaming supported only for financial_overall (per llm_service) @@ -312,6 +355,13 @@ async def summarize_document( file_type = "PDF" if filename_lower.endswith(".pdf") else "DOCX" logger.info(f"Extracting text from {file_type} file") + document_info = pdf_service.analyze_document(temp_path) + upload_warning = _build_upload_warning(files.filename, document_info) + + if upload_warning and not ignore_upload_warnings_bool: + os.remove(temp_path) + raise HTTPException(status_code=413, detail=upload_warning) + text_content = pdf_service.extract_text(temp_path) os.remove(temp_path) @@ -336,7 +386,7 @@ async def summarize_document( mode="financial_section", section=section.strip(), ) - return { + response = { 
"doc_id": created_doc_id, "text": section_out, "summary": section_out, @@ -346,11 +396,14 @@ async def summarize_document( "section": section.strip(), "sections": llm_service.get_doc_sections(created_doc_id) or [], } + if upload_warning: + response["upload_warning"] = upload_warning + return response # Otherwise return fast initial summary (also discovers sections internally) initial_summary = llm_service.initial_summary_first_chunk(created_doc_id) sections = llm_service.get_doc_sections(created_doc_id) or [] - return { + response = { "doc_id": created_doc_id, "text": initial_summary, "summary": initial_summary, @@ -360,6 +413,9 @@ async def summarize_document( "section": "", "sections": sections, } + if upload_warning: + response["upload_warning"] = upload_warning + return response # TXT if filename_lower.endswith(".txt"): @@ -412,16 +468,14 @@ async def summarize_document( # Unsupported type logger.error(f"Unsupported file type: {files.filename}") - os.remove(temp_path) raise HTTPException( status_code=400, detail="Unsupported file type. Please upload PDF, DOCX, or TXT files.", ) - except Exception: + finally: if os.path.exists(temp_path): os.remove(temp_path) - raise # ========== Invalid Request ========== raise HTTPException(status_code=400, detail="Either text message or file is required") diff --git a/backend/services/pdf/pdf_service.py b/backend/services/pdf/pdf_service.py index 6e1f19c..9657acb 100644 --- a/backend/services/pdf/pdf_service.py +++ b/backend/services/pdf/pdf_service.py @@ -49,6 +49,34 @@ def extract_text(self, file_path: str) -> str: logger.error(f"Document extraction error: {str(e)}") raise Exception(f"Failed to extract text from document: {str(e)}") + def analyze_document(self, file_path: str) -> dict: + """ + Inspect a document before extraction so the API can explain limits clearly. 
+ """ + filename_lower = file_path.lower() + file_size_bytes = os.path.getsize(file_path) + metadata = { + "file_size_bytes": file_size_bytes, + "file_size_mb": round(file_size_bytes / (1024 * 1024), 2), + "max_file_size_bytes": config.MAX_PDF_SIZE, + "max_file_size_mb": round(config.MAX_PDF_SIZE / (1024 * 1024), 2), + "page_limit": config.MAX_PDF_PAGES, + "page_count": None, + "pages_to_process": None, + "will_trim_pages": False, + "is_pdf": filename_lower.endswith(".pdf"), + } + + if metadata["is_pdf"]: + with open(file_path, "rb") as file: + pdf_reader = PdfReader(file) + page_count = len(pdf_reader.pages) + metadata["page_count"] = page_count + metadata["pages_to_process"] = min(page_count, config.MAX_PDF_PAGES) + metadata["will_trim_pages"] = page_count > config.MAX_PDF_PAGES + + return metadata + def _extract_from_pdf(self, pdf_path: str) -> str: """ Extract text from PDF file with automatic OCR fallback for image-based PDFs diff --git a/frontend/src/components/FileUpload.jsx b/frontend/src/components/FileUpload.jsx index 22531f1..6d31bc6 100644 --- a/frontend/src/components/FileUpload.jsx +++ b/frontend/src/components/FileUpload.jsx @@ -1,10 +1,26 @@ import { useState } from 'react'; import { Upload, FileText, X } from 'lucide-react'; -const FileUpload = ({ onSubmit, isLoading, acceptedTypes, fileType, title, maxFileSize }) => { +const FileUpload = ({ + onSubmit, + isLoading, + acceptedTypes, + fileType, + title, + maxFileSize, + maxFileSizeBytes = 50 * 1024 * 1024, + uploadWarning, + processingNotice, + onDismissWarning, +}) => { const [dragActive, setDragActive] = useState(false); const [file, setFile] = useState(null); + const setSelectedFile = (nextFile) => { + setFile(nextFile); + onDismissWarning?.(); + }; + const handleDrag = (e) => { e.preventDefault(); e.stopPropagation(); @@ -25,7 +41,7 @@ const FileUpload = ({ onSubmit, isLoading, acceptedTypes, fileType, title, maxFi const fileExtension = '.' 
+ droppedFile.name.split('.').pop().toLowerCase(); if (acceptedTypes.includes(fileExtension)) { - setFile(droppedFile); + setSelectedFile(droppedFile); } } }; @@ -33,16 +49,15 @@ const FileUpload = ({ onSubmit, isLoading, acceptedTypes, fileType, title, maxFi const handleChange = (e) => { e.preventDefault(); if (e.target.files && e.target.files[0]) { - setFile(e.target.files[0]); + setSelectedFile(e.target.files[0]); } }; const handleRemoveFile = () => { - setFile(null); + setSelectedFile(null); }; - const handleSubmit = (e) => { - e.preventDefault(); + const submitFile = (ignoreUploadWarnings = false) => { if (!file) return; const formData = new FormData(); @@ -53,10 +68,18 @@ const FileUpload = ({ onSubmit, isLoading, acceptedTypes, fileType, title, maxFi formData.append('language', 'en'); formData.append('summary_type', 'auto'); formData.append('stream', 'false'); + formData.append('ignore_upload_warnings', String(ignoreUploadWarnings)); onSubmit(formData, false); }; + const handleSubmit = (e) => { + e.preventDefault(); + submitFile(false); + }; + + const isOverLocalSizeLimit = !!file && file.size > maxFileSizeBytes; + return (
@@ -122,6 +145,58 @@ const FileUpload = ({ onSubmit, isLoading, acceptedTypes, fileType, title, maxFi )}
+ {isOverLocalSizeLimit && !uploadWarning && ( +
+

+ This file is {(file.size / 1024 / 1024).toFixed(2)} MB. The backend limit is {maxFileSize || '50 MB'}, so you will need to confirm the upload before processing continues. +

+
+ )} + + {uploadWarning && ( +
+
+

Upload warning

+

{uploadWarning.message}

+
+ +
+ {uploadWarning.file_size_mb ? ( +

File size: {uploadWarning.file_size_mb} MB of {uploadWarning.max_file_size_mb} MB allowed.

+ ) : null} + {uploadWarning.page_count ? ( +

Pages detected: {uploadWarning.page_count}. Pages that will be processed: {uploadWarning.pages_to_process}.

+ ) : null} +
+ +
+ + +
+
+ )} + + {processingNotice && !uploadWarning && ( +
+

Processing notice

+

{processingNotice.message}

+
+ )} +

📋 Document Limits: Maximum file size is 50 MB, and document is limited to 100 pages. diff --git a/frontend/src/pages/Generate.jsx b/frontend/src/pages/Generate.jsx index a08d2e2..4d17be4 100644 --- a/frontend/src/pages/Generate.jsx +++ b/frontend/src/pages/Generate.jsx @@ -37,6 +37,8 @@ const streamText = (text, onUpdate, targetMs = 900, tickMs = 20) => { export const Generate = () => { const [activeTab, setActiveTab] = useState('text'); + const [uploadWarning, setUploadWarning] = useState(null); + const [processingNotice, setProcessingNotice] = useState(null); // Store last submitted input (for text flow only) const [lastFormData, setLastFormData] = useState(null); @@ -201,11 +203,15 @@ export const Generate = () => { doc.save(filename); }; - const resetRunState = () => { + const resetRunState = ({ preserveUploadState = false } = {}) => { setHistory([]); setDocId(''); setDynamicSections([]); setLastFormData(null); + if (!preserveUploadState) { + setUploadWarning(null); + setProcessingNotice(null); + } // RAG setRagReady(false); @@ -235,13 +241,15 @@ export const Generate = () => { const handleSubmit = async (formData) => { setIsLoadingInitial(true); + setUploadWarning(null); + setProcessingNotice(null); // Clean up old vectors before submitting new document if (docId) { await deleteVectors(docId); } - resetRunState(); + resetRunState({ preserveUploadState: true }); // Store lastFormData only for text flow if (activeTab === 'text') setLastFormData(formData); @@ -251,6 +259,9 @@ export const Generate = () => { fd.set('mode', 'financial_initial'); const json = await generateSummaryJson(fd); + if (json?.upload_warning) { + setProcessingNotice(json.upload_warning); + } if (json.doc_id) { setDocId(json.doc_id); @@ -305,6 +316,11 @@ export const Generate = () => { toast.success('Initial summary generated. 
Select a section below.'); } catch (error) { console.error('Error:', error); + if (error?.detail?.code === 'upload_confirmation_required') { + setUploadWarning(error.detail); + toast.error('Review the upload warning before continuing.'); + return; + } toast.error('Failed to generate summary. Please try again.'); resetRunState(); } finally { @@ -609,6 +625,13 @@ export const Generate = () => { fileType="text" title="Upload Document" maxFileSize="50 MB" + maxFileSizeBytes={50 * 1024 * 1024} + uploadWarning={uploadWarning} + processingNotice={processingNotice} + onDismissWarning={() => { + setUploadWarning(null); + setProcessingNotice(null); + }} /> )}

diff --git a/frontend/src/services/api.js b/frontend/src/services/api.js index b17741b..0725f72 100644 --- a/frontend/src/services/api.js +++ b/frontend/src/services/api.js @@ -1,6 +1,18 @@ // src/services/api.js const BACKEND_ENDPOINT = import.meta.env.VITE_BACKEND_ENDPOINT || ''; +const parseErrorPayload = async (response) => { + const responseText = await response.text(); + + try { + const parsed = JSON.parse(responseText); + if (parsed?.detail !== undefined) return parsed.detail; + return parsed; + } catch { + return responseText; + } +}; + // Helper function to decode Python string escape sequences const decodePythonString = (str) => { return str @@ -110,8 +122,15 @@ export const generateSummaryJson = async (formData) => { }); if (!response.ok) { - const errorText = await response.text(); - throw new Error(`API Error: ${response.status} - ${errorText}`); + const detail = await parseErrorPayload(response); + const error = new Error( + typeof detail === 'string' + ? `API Error: ${response.status} - ${detail}` + : `API Error: ${response.status}` + ); + error.status = response.status; + error.detail = detail; + throw error; } const responseText = await response.text();