cld2labs · geethac2l · Apr 1, 2026 · Apr 1, 2026
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -0,0 +1,39 @@
+## Summary
+
+<!-- What does this PR do? Keep it to 1-3 bullet points. -->
+
+-
+
+## Type of Change
+
+<!-- Check the one that applies. -->
+
+- [ ] Bug fix
+- [ ] New feature / enhancement
+- [ ] Documentation update
+- [ ] Refactor (no behavior change)
+- [ ] Chore (dependencies, CI, tooling)
+
+## Changes Made
+
+<!-- Briefly describe the key changes. Link to relevant issues if applicable. -->
+
+Resolves #<!-- issue number -->
+
+## How to Test
+
+<!-- Steps a reviewer can follow to verify the changes. -->
+
+1.
+
+## Checklist
+
+- [ ] I have read the [Contributing Guide](../CONTRIBUTING.md)
+- [ ] My branch is up to date with `finsights/dev`
+- [ ] New environment variables (if any) are documented in `.env.example` and the README
+- [ ] No secrets, API keys, or credentials are included in this PR
+- [ ] I have tested my changes locally
+
+## Screenshots (if applicable)
+
+<!-- Add screenshots for UI changes. Delete this section if not applicable. -->
diff --git a/.github/workflows/code-scans.yaml b/.github/workflows/code-scans.yaml
@@ -7,7 +7,7 @@ on:
         description: 'Pull request number'
         required: true
   push:
-    branches: [ main ]
+    branches: [ main, finsights/dev ]
   pull_request:
     types: [opened, synchronize, reopened, ready_for_review]
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -33,7 +33,7 @@ git --version
 New to contributing?
 
 1. Open an issue or pick an existing one to work on.
-2. Sync your branch from `dev`.
+2. Sync your branch from `finsights/dev`.
 3. Follow the local setup guide below.
 4. Run the app locally and verify your change before opening a PR.
 
@@ -193,11 +193,11 @@ This starts:
 ### Start contributing code?
 
 1. Open or choose an issue.
-2. Create a feature branch from `dev`.
+2. Create a feature branch from `finsights/dev`.
 3. Keep the change focused on a single problem.
 4. Run the app locally and verify the affected workflow.
 5. Update docs when behavior, setup, configuration, or architecture changes.
-6. Open a pull request back from your feature branch into `dev`.
+6. Open a pull request from your feature branch into `finsights/dev`.
 
 ### Improve the documentation?
 
@@ -218,7 +218,7 @@ Follow the checklist below before opening your PR. Your pull request should:
 - Include screenshots or short recordings for UI changes.
 - Reference the related GitHub issue when applicable.
 
-Note: pull requests should target the `dev` branch.
+Note: pull requests should target the `finsights/dev` branch.
 
 ---
 
@@ -245,18 +245,18 @@ Before submitting your pull request, confirm the following:
 - You kept the pull request scoped to one issue or topic.
 - You added screenshots for UI changes when relevant.
 - You did not commit secrets, API keys, or cached documents.
-- You are opening the pull request against `dev`.
+- You are opening the pull request against `finsights/dev`.
 
 If one or more of these are missing, the pull request may be sent back for changes before review.
 
 ---
 
 ## Branching model
 
-- Base new work from `dev`.
-- Open pull requests against `dev`.
+- Base new work from `finsights/dev`.
+- Open pull requests against `finsights/dev`.
 - Use descriptive branch names such as `fix/rag-chat-validation` or `docs/update-contributing-guide`.
-- Rebase or merge the latest `dev` before opening your PR if your branch has drifted.
+- Rebase or merge the latest `finsights/dev` before opening your PR if your branch has drifted.
 
 ---
 

diff --git a/backend/api/routes.py b/backend/api/routes.py
@@ -23,6 +23,47 @@
 router = APIRouter()
 
 
+def _build_upload_warning(filename: str, document_info: dict) -> dict:
+    """
+    Describe which upload limits a document exceeds, if any.
+
+    Args:
+        filename: Name of the uploaded file; quoted in the size message and
+            echoed back in the payload.
+        document_info: Metadata dict for the file. "file_size_bytes" is always
+            read; the remaining size/page keys are read only once a limit has
+            been exceeded.
+
+    Returns:
+        An empty dict when neither limit is exceeded. Otherwise a payload with
+        code "upload_confirmation_required", a human-readable message, the
+        over-limit flags, and the size/page figures copied from document_info.
+    """
+    too_large = document_info["file_size_bytes"] > config.MAX_PDF_SIZE
+    total_pages = document_info.get("page_count") or 0
+    # The page limit is only evaluated for files flagged as PDFs.
+    too_many_pages = bool(document_info.get("is_pdf")) and total_pages > config.MAX_PDF_PAGES
+
+    # Nothing to confirm: the file is within every limit that applies to it.
+    if not (too_large or too_many_pages):
+        return {}
+
+    sentences = []
+    if too_large:
+        sentences.append(
+            f'"{filename}" is {document_info["file_size_mb"]} MB, above the {document_info["max_file_size_mb"]} MB limit.'
+        )
+    if too_many_pages:
+        sentences.append(
+            f'This PDF has {document_info["page_count"]} pages, and only the first {document_info["pages_to_process"]} pages will be processed.'
+        )
+
+    message = " ".join(sentences)
+    if too_many_pages:
+        message += f" If you continue, FinSights will upload the file but use only the first {document_info['pages_to_process']} pages for extraction."
+    else:
+        # Reaching here means only the size limit was exceeded.
+        message += " If you continue, FinSights will upload the full file and attempt normal extraction."
+
+    payload = {
+        "code": "upload_confirmation_required",
+        "message": message,
+        "filename": filename,
+        "requires_confirmation": True,
+        "over_size_limit": too_large,
+        "over_page_limit": too_many_pages,
+    }
+    # Size figures are mandatory in document_info; page figures are optional.
+    for required_key in ("file_size_bytes", "file_size_mb", "max_file_size_bytes", "max_file_size_mb"):
+        payload[required_key] = document_info[required_key]
+    for optional_key in ("page_count", "page_limit", "pages_to_process"):
+        payload[optional_key] = document_info.get(optional_key)
+    payload["will_trim_pages"] = document_info.get("will_trim_pages", False)
+    return payload
+
+
 @router.get("/health", response_model=HealthResponse)
 async def health_check():
     """Health check endpoint"""
@@ -153,6 +194,7 @@ async def summarize_document(
     section: str = Form(""),                # required if mode=financial_section
     # doc_id for cached section requests (frontend should send this on clicks)
     doc_id: str = Form(""),
+    ignore_upload_warnings: str = Form("false"),
     files: Optional[UploadFile] = File(None),
 ):
     """
@@ -168,6 +210,7 @@ async def summarize_document(
     """
     try:
         stream_bool = stream.lower() == "true"
+        ignore_upload_warnings_bool = ignore_upload_warnings.lower() == "true"
 
         # Enforce backend constraints:
         # streaming supported only for financial_overall (per llm_service)
@@ -312,6 +355,13 @@ async def summarize_document(
                     file_type = "PDF" if filename_lower.endswith(".pdf") else "DOCX"
                     logger.info(f"Extracting text from {file_type} file")
 
+                    document_info = pdf_service.analyze_document(temp_path)
+                    upload_warning = _build_upload_warning(files.filename, document_info)
+
+                    if upload_warning and not ignore_upload_warnings_bool:
+                        os.remove(temp_path)
+                        raise HTTPException(status_code=413, detail=upload_warning)
+
                     text_content = pdf_service.extract_text(temp_path)
                     os.remove(temp_path)
 
@@ -336,7 +386,7 @@ async def summarize_document(
                             mode="financial_section",
                             section=section.strip(),
                         )
-                        return {
+                        response = {
                             "doc_id": created_doc_id,
                             "text": section_out,
                             "summary": section_out,
@@ -346,11 +396,14 @@ async def summarize_document(
                             "section": section.strip(),
                             "sections": llm_service.get_doc_sections(created_doc_id) or [],
                         }
+                        if upload_warning:
+                            response["upload_warning"] = upload_warning
+                        return response
 
                     # Otherwise return fast initial summary (also discovers sections internally)
                     initial_summary = llm_service.initial_summary_first_chunk(created_doc_id)
                     sections = llm_service.get_doc_sections(created_doc_id) or []
-                    return {
+                    response = {
                         "doc_id": created_doc_id,
                         "text": initial_summary,
                         "summary": initial_summary,
@@ -360,6 +413,9 @@ async def summarize_document(
                         "section": "",
                         "sections": sections,
                     }
+                    if upload_warning:
+                        response["upload_warning"] = upload_warning
+                    return response
 
                 # TXT
                 if filename_lower.endswith(".txt"):
@@ -412,16 +468,14 @@ async def summarize_document(
 
                 # Unsupported type
                 logger.error(f"Unsupported file type: {files.filename}")
-                os.remove(temp_path)
                 raise HTTPException(
                     status_code=400,
                     detail="Unsupported file type. Please upload PDF, DOCX, or TXT files.",
                 )
 
-            except Exception:
+            finally:
                 if os.path.exists(temp_path):
                     os.remove(temp_path)
-                raise
 
         # ========== Invalid Request ==========
         raise HTTPException(status_code=400, detail="Either text message or file is required")

diff --git a/backend/services/pdf/pdf_service.py b/backend/services/pdf/pdf_service.py
@@ -49,6 +49,34 @@ def extract_text(self, file_path: str) -> str:
             logger.error(f"Document extraction error: {str(e)}")
             raise Exception(f"Failed to extract text from document: {str(e)}")
 
+    def analyze_document(self, file_path: str) -> dict:
+        """
+        Gather size and page metadata for a document ahead of text extraction.
+
+        Args:
+            file_path: Path of the file on disk. A path ending in ".pdf"
+                (case-insensitive) is treated as a PDF.
+
+        Returns:
+            A dict with the file size (bytes and MB), the configured size and
+            page limits, an "is_pdf" flag and, for PDFs only, the page count,
+            the number of pages that will be processed and whether pages
+            beyond the limit will be dropped.
+        """
+        is_pdf = file_path.lower().endswith(".pdf")
+        size_in_bytes = os.path.getsize(file_path)
+        bytes_per_mb = 1024 * 1024
+
+        # Page fields keep these defaults for non-PDF files.
+        info = dict(
+            file_size_bytes=size_in_bytes,
+            file_size_mb=round(size_in_bytes / bytes_per_mb, 2),
+            max_file_size_bytes=config.MAX_PDF_SIZE,
+            max_file_size_mb=round(config.MAX_PDF_SIZE / bytes_per_mb, 2),
+            page_limit=config.MAX_PDF_PAGES,
+            page_count=None,
+            pages_to_process=None,
+            will_trim_pages=False,
+            is_pdf=is_pdf,
+        )
+        if not is_pdf:
+            return info
+
+        # Count pages while the file handle is still open.
+        with open(file_path, "rb") as pdf_file:
+            total_pages = len(PdfReader(pdf_file).pages)
+
+        info.update(
+            page_count=total_pages,
+            pages_to_process=min(total_pages, config.MAX_PDF_PAGES),
+            will_trim_pages=total_pages > config.MAX_PDF_PAGES,
+        )
+        return info
+
     def _extract_from_pdf(self, pdf_path: str) -> str:
         """
         Extract text from PDF file with automatic OCR fallback for image-based PDFs