From 63f55755a112a0e54c894b0f8f2de5206821592c Mon Sep 17 00:00:00 2001
From: Mothilal-hire10x <mothilal@hire10x.ai>
Date: Mon, 13 Oct 2025 18:10:27 +0530
Subject: [PATCH 1/4] Enhance DOCXHandler for comprehensive text extraction
 from DOCX files

- Improved extraction logic to include text from paragraphs, tables, headers, footers, footnotes, endnotes, and text boxes.
- Added text normalization and duplicate handling to ensure clean output.
---
 textxtract/handlers/docx.py | 138 ++++++++++++++++++++++++++++++++++--
 1 file changed, 134 insertions(+), 4 deletions(-)

diff --git a/textxtract/handlers/docx.py b/textxtract/handlers/docx.py
index dcb47dc..6aa169a 100644
--- a/textxtract/handlers/docx.py
+++ b/textxtract/handlers/docx.py
@@ -1,21 +1,151 @@
-"""DOCX file handler for text extraction."""
+"""DOCX file handler for comprehensive text extraction.
+
+This handler extracts text from:
+- Document paragraphs
+- Tables and cells
+- Headers and footers
+- Text boxes and shapes
+- Footnotes and endnotes (if available)
+"""
 
 from pathlib import Path
 from typing import Optional
+import re
 
 from textxtract.core.base import FileTypeHandler
-from textxtract.core.exceptions import ExtractionError, InvalidFileError
+from textxtract.core.exceptions import ExtractionError
 
 
 class DOCXHandler(FileTypeHandler):
-    """Handler for extracting text from DOCX files."""
+    """Enhanced handler for comprehensive text extraction from DOCX files.
+    
+    Extracts text from all document elements including paragraphs, tables,
+    headers, footers, text boxes, and footnotes to ensure complete content
+    extraction for documents like resumes, reports, and complex layouts.
+    """
+
+    def _clean_text(self, text: str) -> str:
+        """Clean and normalize extracted text."""
+        if not text:
+            return ""
+        
+        # Normalize whitespace
+        text = re.sub(r'\s+', ' ', text)
+        # Remove excessive dots/periods
+        text = re.sub(r'\.{2,}', ' ', text)
+        # Clean up spacing around punctuation
+        text = re.sub(r'\s+([.!?,:;])', r'\1', text)
+        return text.strip()
 
     def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
         try:
             from docx import Document
+            import re
 
             doc = Document(file_path)
-            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
+            text_parts = []
+            processed_text = set()  # To avoid duplicates
+            
+            # Extract text from paragraphs
+            for paragraph in doc.paragraphs:
+                text = paragraph.text.strip()
+                if text and text not in processed_text:
+                    text_parts.append(text)
+                    processed_text.add(text)
+            
+            # Extract text from tables
+            for table in doc.tables:
+                table_texts = []
+                for row in table.rows:
+                    row_text = []
+                    for cell in row.cells:
+                        # Get text from all paragraphs in the cell
+                        cell_paragraphs = []
+                        for paragraph in cell.paragraphs:
+                            text = paragraph.text.strip()
+                            if text and text not in processed_text:
+                                cell_paragraphs.append(text)
+                                processed_text.add(text)
+                        if cell_paragraphs:
+                            row_text.append(" ".join(cell_paragraphs))
+                    if row_text:
+                        table_texts.append(" | ".join(row_text))
+                
+                # Add table content if any
+                if table_texts:
+                    text_parts.extend(table_texts)
+            
+            # Extract text from headers and footers
+            for section in doc.sections:
+                # Header text
+                if section.header:
+                    for paragraph in section.header.paragraphs:
+                        text = paragraph.text.strip()
+                        if text and text not in processed_text:
+                            text_parts.append(text)
+                            processed_text.add(text)
+                
+                # Footer text
+                if section.footer:
+                    for paragraph in section.footer.paragraphs:
+                        text = paragraph.text.strip()
+                        if text and text not in processed_text:
+                            text_parts.append(text)
+                            processed_text.add(text)
+            
+            # Try to extract text from footnotes and endnotes
+            try:
+                # Extract footnotes
+                if hasattr(doc, 'footnotes'):
+                    for footnote in doc.footnotes:
+                        for paragraph in footnote.paragraphs:
+                            text = paragraph.text.strip()
+                            if text and text not in processed_text:
+                                text_parts.append(f"[Footnote: {text}]")
+                                processed_text.add(text)
+                
+                # Extract endnotes
+                if hasattr(doc, 'endnotes'):
+                    for endnote in doc.endnotes:
+                        for paragraph in endnote.paragraphs:
+                            text = paragraph.text.strip()
+                            if text and text not in processed_text:
+                                text_parts.append(f"[Endnote: {text}]")
+                                processed_text.add(text)
+            except Exception:
+                # If footnotes/endnotes extraction fails, continue
+                pass
+            
+            # Try to extract text from text boxes and shapes using xml parsing
+            try:
+                from docx.oxml.ns import qn
+                
+                # Look for drawing elements containing text
+                for element in doc.element.body.iter():
+                    if element.tag.endswith('}txbxContent'):
+                        # Extract text from text boxes
+                        for para in element.iter():
+                            if para.tag.endswith('}t') and para.text:
+                                text = para.text.strip()
+                                if text and text not in processed_text:
+                                    text_parts.append(f"[TextBox: {text}]")
+                                    processed_text.add(text)
+            except Exception:
+                # If text box extraction fails, continue
+                pass
+            
+            # Clean up and join text
+            if text_parts:
+                # Clean each part and join with newlines
+                cleaned_parts = [self._clean_text(part) for part in text_parts if part.strip()]
+                result = "\n".join(cleaned_parts)
+                
+                # Ensure proper sentence breaks for readability
+                result = re.sub(r'([.!?])\s*([A-Z])', r'\1\n\2', result)
+                return result.strip()
+            
+            return ""
+            
         except Exception as e:
             raise ExtractionError(f"DOCX extraction failed: {e}")
 

From 5e35693fe98d0d2df66bcea4eb934c83082bfb71 Mon Sep 17 00:00:00 2001
From: Mothilal-hire10x <mothilal@hire10x.ai>
Date: Mon, 13 Oct 2025 19:09:29 +0530
Subject: [PATCH 2/4] Enhance DOCXHandler with detailed extraction features and
 improved documentation

---
 textxtract/handlers/docx.py | 135 +++++++++++++++++++++++++++++-------
 1 file changed, 109 insertions(+), 26 deletions(-)

diff --git a/textxtract/handlers/docx.py b/textxtract/handlers/docx.py
index 6aa169a..bb1fa61 100644
--- a/textxtract/handlers/docx.py
+++ b/textxtract/handlers/docx.py
@@ -19,47 +19,107 @@
 class DOCXHandler(FileTypeHandler):
     """Enhanced handler for comprehensive text extraction from DOCX files.
     
-    Extracts text from all document elements including paragraphs, tables,
-    headers, footers, text boxes, and footnotes to ensure complete content
-    extraction for documents like resumes, reports, and complex layouts.
+    This handler provides complete text extraction from Microsoft Word documents,
+    including all document elements such as paragraphs, tables, headers, footers,
+    text boxes, and footnotes. It's designed to handle complex document layouts
+    commonly found in resumes, reports, and structured documents.
+    
+    Features:
+        - Extracts text from document body paragraphs
+        - Processes table content with cell-by-cell extraction
+        - Captures header and footer text from all sections
+        - Attempts to extract text from embedded text boxes and shapes
+        - Handles footnotes and endnotes when available
+        - Deduplicates repeated content
+        - Cleans and normalizes extracted text
+    
+    Example:
+        >>> handler = DOCXHandler()
+        >>> text = handler.extract(Path("document.docx"))
+        >>> text.splitlines()[0]
+        'Document title'
+        
+        >>> import asyncio  # Async extraction from synchronous code
+        >>> text = asyncio.run(handler.extract_async(Path("document.docx")))
     """
 
     def _clean_text(self, text: str) -> str:
-        """Clean and normalize extracted text."""
+        """Clean and normalize extracted text.
+        
+        Performs various text cleaning operations to improve readability
+        and consistency of extracted content.
+        
+        Args:
+            text (str): Raw text to be cleaned.
+            
+        Returns:
+            str: Cleaned and normalized text with proper spacing and formatting.
+            
+        Note:
+            - Normalizes multiple whitespace characters to single spaces
+            - Replaces every run of two or more dots (including ellipses) with a single space
+            - Removes whitespace that precedes any of the characters . ! ? , : ;
+            - Strips leading and trailing whitespace
+        """
         if not text:
             return ""
         
-        # Normalize whitespace
+        # Normalize whitespace (replace multiple spaces, tabs, newlines with single space)
         text = re.sub(r'\s+', ' ', text)
-        # Remove excessive dots/periods
+        # Remove excessive dots/periods (likely formatting artifacts)
         text = re.sub(r'\.{2,}', ' ', text)
-        # Clean up spacing around punctuation
+        # Clean up spacing around punctuation (remove spaces before punctuation)
         text = re.sub(r'\s+([.!?,:;])', r'\1', text)
         return text.strip()
 
     def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
+        """Extract text from a DOCX file with comprehensive content capture.
+        
+        Performs thorough text extraction from all available document elements
+        including body text, tables, headers, footers, and embedded content.
+        
+        Args:
+            file_path (Path): Path to the DOCX file to extract text from.
+            config (Optional[dict], optional): Configuration options for extraction.
+                Currently not used but reserved for future enhancements.
+                
+        Returns:
+            str: Extracted and cleaned text from the document with proper formatting.
+                Returns empty string if no text is found.
+                
+        Raises:
+            ExtractionError: If the file cannot be read or processed, or if the
+                python-docx library is not available.
+                
+        Note:
+            - Text is deduplicated document-wide: a paragraph whose stripped text was already seen anywhere is dropped, including legitimate repeats such as identical table cells
+            - Table content is formatted with pipe separators between columns
+            - Special content (footnotes, text boxes) is labeled with descriptive tags
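+            - A newline is inserted after every '.', '!' or '?' that is followed by an uppercase letter; this also splits abbreviations and dotted names (e.g. "U.S." or "John.Doe@example.com")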
+        """
         try:
             from docx import Document
             import re
 
+            # Load the document
             doc = Document(file_path)
             text_parts = []
-            processed_text = set()  # To avoid duplicates
+            processed_text = set()  # Track processed text to avoid duplicates
             
-            # Extract text from paragraphs
+            # Extract text from main document paragraphs
             for paragraph in doc.paragraphs:
                 text = paragraph.text.strip()
                 if text and text not in processed_text:
                     text_parts.append(text)
                     processed_text.add(text)
             
-            # Extract text from tables
+            # Extract text from all tables in the document
             for table in doc.tables:
                 table_texts = []
                 for row in table.rows:
                     row_text = []
                     for cell in row.cells:
-                        # Get text from all paragraphs in the cell
+                        # Process each paragraph within the cell
                         cell_paragraphs = []
                         for paragraph in cell.paragraphs:
                             text = paragraph.text.strip()
@@ -69,15 +129,16 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
                         if cell_paragraphs:
                             row_text.append(" ".join(cell_paragraphs))
                     if row_text:
+                        # Join cell contents with pipe separator for table structure
                         table_texts.append(" | ".join(row_text))
                 
-                # Add table content if any
+                # Add table content to main text collection
                 if table_texts:
                     text_parts.extend(table_texts)
             
-            # Extract text from headers and footers
+            # Extract text from headers and footers across all document sections
             for section in doc.sections:
-                # Header text
+                # Process header content
                 if section.header:
                     for paragraph in section.header.paragraphs:
                         text = paragraph.text.strip()
@@ -85,7 +146,7 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
                             text_parts.append(text)
                             processed_text.add(text)
                 
-                # Footer text
+                # Process footer content
                 if section.footer:
                     for paragraph in section.footer.paragraphs:
                         text = paragraph.text.strip()
@@ -93,9 +154,9 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
                             text_parts.append(text)
                             processed_text.add(text)
             
-            # Try to extract text from footnotes and endnotes
+            # Attempt to extract footnotes and endnotes (only when the Document object exposes these attributes)
             try:
-                # Extract footnotes
+                # Extract footnotes if present
                 if hasattr(doc, 'footnotes'):
                     for footnote in doc.footnotes:
                         for paragraph in footnote.paragraphs:
@@ -104,7 +165,7 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
                                 text_parts.append(f"[Footnote: {text}]")
                                 processed_text.add(text)
                 
-                # Extract endnotes
+                # Extract endnotes if present
                 if hasattr(doc, 'endnotes'):
                     for endnote in doc.endnotes:
                         for paragraph in endnote.paragraphs:
@@ -113,17 +174,17 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
                                 text_parts.append(f"[Endnote: {text}]")
                                 processed_text.add(text)
             except Exception:
-                # If footnotes/endnotes extraction fails, continue
+                # Footnote/endnote extraction is optional - continue if it fails
                 pass
             
-            # Try to extract text from text boxes and shapes using xml parsing
+            # Attempt to extract text from embedded text boxes and shapes using XML parsing
             try:
                 from docx.oxml.ns import qn
                 
-                # Look for drawing elements containing text
+                # Iterate through document XML elements to find drawing content
                 for element in doc.element.body.iter():
                     if element.tag.endswith('}txbxContent'):
-                        # Extract text from text boxes
+                        # Emit each text run (any element whose tag ends in '}t') as its own [TextBox: ...] entry
                         for para in element.iter():
                             if para.tag.endswith('}t') and para.text:
                                 text = para.text.strip()
@@ -131,16 +192,16 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
                                     text_parts.append(f"[TextBox: {text}]")
                                     processed_text.add(text)
             except Exception:
-                # If text box extraction fails, continue
+                # Text box extraction is optional - continue if it fails
                 pass
             
-            # Clean up and join text
+            # Process and format the final output
             if text_parts:
-                # Clean each part and join with newlines
+                # Clean each text part and filter out empty content
                 cleaned_parts = [self._clean_text(part) for part in text_parts if part.strip()]
                 result = "\n".join(cleaned_parts)
                 
-                # Ensure proper sentence breaks for readability
+                # Add proper sentence breaks for improved readability
                 result = re.sub(r'([.!?])\s*([A-Z])', r'\1\n\2', result)
                 return result.strip()
             
@@ -152,6 +213,28 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
     async def extract_async(
         self, file_path: Path, config: Optional[dict] = None
     ) -> str:
+        """Asynchronously extract text from a DOCX file.
+        
+        Provides non-blocking text extraction by running the synchronous
+        extraction method in a separate thread.
+        
+        Args:
+            file_path (Path): Path to the DOCX file to extract text from.
+            config (Optional[dict], optional): Configuration options for extraction.
+                Currently not used but reserved for future enhancements.
+                
+        Returns:
+            str: Extracted and cleaned text from the document with proper formatting.
+                Returns empty string if no text is found.
+                
+        Raises:
+            ExtractionError: If the file cannot be read or processed, or if the
+                python-docx library is not available.
+                
+        Note:
+            This method uses asyncio.to_thread() to run the synchronous extraction
+            in a thread pool, making it suitable for async/await usage patterns.
+        """
         import asyncio
 
         return await asyncio.to_thread(self.extract, file_path, config)

From 36ebc44d1a2cfe8917408dec6ca77e52fb0ef46b Mon Sep 17 00:00:00 2001
From: Mothilal-hire10x <mothilal@hire10x.ai>
Date: Thu, 30 Oct 2025 12:02:23 +0530
Subject: [PATCH 3/4] Refactor dependency installation in CI workflow to remove
 unnecessary extras

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6ea6669..5133c9c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install .[dev,all]
+          pip install .[dev]
           pip install pytest-cov
 
       - name: Run tests with coverage

From 650c519c8c46fd28504989121554f37ddb34920d Mon Sep 17 00:00:00 2001
From: Mothilal-hire10x <mothilal@hire10x.ai>
Date: Thu, 30 Oct 2025 12:36:15 +0530
Subject: [PATCH 4/4] Update dependency installation in CI workflow to include
 all extras

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5133c9c..6ea6669 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install .[dev]
+          pip install .[dev,all]
           pip install pytest-cov
 
       - name: Run tests with coverage