From 63f55755a112a0e54c894b0f8f2de5206821592c Mon Sep 17 00:00:00 2001 From: Mothilal-hire10x Date: Mon, 13 Oct 2025 18:10:27 +0530 Subject: [PATCH 1/4] Enhance DOCXHandler for comprehensive text extraction from DOCX files - Improved extraction logic to include text from paragraphs, tables, headers, footers, footnotes, endnotes, and text boxes. - Added text normalization and duplicate handling to ensure clean output. --- textxtract/handlers/docx.py | 138 ++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 4 deletions(-) diff --git a/textxtract/handlers/docx.py b/textxtract/handlers/docx.py index dcb47dc..6aa169a 100644 --- a/textxtract/handlers/docx.py +++ b/textxtract/handlers/docx.py @@ -1,21 +1,151 @@ -"""DOCX file handler for text extraction.""" +"""DOCX file handler for comprehensive text extraction. + +This handler extracts text from: +- Document paragraphs +- Tables and cells +- Headers and footers +- Text boxes and shapes +- Footnotes and endnotes (if available) +""" from pathlib import Path from typing import Optional +import re from textxtract.core.base import FileTypeHandler -from textxtract.core.exceptions import ExtractionError, InvalidFileError +from textxtract.core.exceptions import ExtractionError class DOCXHandler(FileTypeHandler): - """Handler for extracting text from DOCX files.""" + """Enhanced handler for comprehensive text extraction from DOCX files. + + Extracts text from all document elements including paragraphs, tables, + headers, footers, text boxes, and footnotes to ensure complete content + extraction for documents like resumes, reports, and complex layouts. + """ + + def _clean_text(self, text: str) -> str: + """Clean and normalize extracted text.""" + if not text: + return "" + + # Normalize whitespace + text = re.sub(r'\s+', ' ', text) + # Remove excessive dots/periods + text = re.sub(r'\.{2,}', ' ', text) + # Clean up spacing around punctuation + text = re.sub(r'\s+([.!?,:;])', r'\1', text) + return text.strip() def extract(self, file_path: Path, config: Optional[dict] = None) -> str: try: from docx import Document + import re doc = Document(file_path) - return "\n".join(paragraph.text for paragraph in doc.paragraphs) + text_parts = [] + processed_text = set() # To avoid duplicates + + # Extract text from paragraphs + for paragraph in doc.paragraphs: + text = paragraph.text.strip() + if text and text not in processed_text: + text_parts.append(text) + processed_text.add(text) + + # Extract text from tables + for table in doc.tables: + table_texts = [] + for row in table.rows: + row_text = [] + for cell in row.cells: + # Get text from all paragraphs in the cell + cell_paragraphs = [] + for paragraph in cell.paragraphs: + text = paragraph.text.strip() + if text and text not in processed_text: + cell_paragraphs.append(text) + processed_text.add(text) + if cell_paragraphs: + row_text.append(" ".join(cell_paragraphs)) + if row_text: + table_texts.append(" | ".join(row_text)) + + # Add table content if any + if table_texts: + text_parts.extend(table_texts) + + # Extract text from headers and footers + for section in doc.sections: + # Header text + if section.header: + for paragraph in section.header.paragraphs: + text = paragraph.text.strip() + if text and text not in processed_text: + text_parts.append(text) + processed_text.add(text) + + # Footer text + if section.footer: + for paragraph in section.footer.paragraphs: + text = paragraph.text.strip() + if text and text not in processed_text: + text_parts.append(text) + processed_text.add(text) + + # Try to extract text from footnotes and endnotes + try: + # Extract footnotes + if hasattr(doc, 'footnotes'): + for footnote in doc.footnotes: + for paragraph in footnote.paragraphs: + text = paragraph.text.strip() + if text and text not in processed_text: + text_parts.append(f"[Footnote: {text}]") + processed_text.add(text) + + # Extract endnotes + if hasattr(doc, 'endnotes'): + for endnote in doc.endnotes: + for paragraph in endnote.paragraphs: + text = paragraph.text.strip() + if text and text not in processed_text: + text_parts.append(f"[Endnote: {text}]") + processed_text.add(text) + except Exception: + # If footnotes/endnotes extraction fails, continue + pass + + # Try to extract text from text boxes and shapes using xml parsing + try: + from docx.oxml.ns import qn + + # Look for drawing elements containing text + for element in doc.element.body.iter(): + if element.tag.endswith('}txbxContent'): + # Extract text from text boxes + for para in element.iter(): + if para.tag.endswith('}t') and para.text: + text = para.text.strip() + if text and text not in processed_text: + text_parts.append(f"[TextBox: {text}]") + processed_text.add(text) + except Exception: + # If text box extraction fails, continue + pass + + # Clean up and join text + if text_parts: + # Clean each part and join with newlines + cleaned_parts = [self._clean_text(part) for part in text_parts if part.strip()] + result = "\n".join(cleaned_parts) + + # Ensure proper sentence breaks for readability + result = re.sub(r'([.!?])\s*([A-Z])', r'\1\n\2', result) + return result.strip() + + return "" + except Exception as e: raise ExtractionError(f"DOCX extraction failed: {e}") From 5e35693fe98d0d2df66bcea4eb934c83082bfb71 Mon Sep 17 00:00:00 2001 From: Mothilal-hire10x Date: Mon, 13 Oct 2025 19:09:29 +0530 Subject: [PATCH 2/4] Enhance DOCXHandler with detailed extraction features and improved documentation --- textxtract/handlers/docx.py | 135 +++++++++++++++++++++++++++++------- 1 file changed, 109 insertions(+), 26 deletions(-) diff --git a/textxtract/handlers/docx.py b/textxtract/handlers/docx.py index 6aa169a..bb1fa61 100644 --- a/textxtract/handlers/docx.py +++ b/textxtract/handlers/docx.py @@ -19,47 +19,107 @@ class DOCXHandler(FileTypeHandler): """Enhanced handler for comprehensive text extraction from DOCX files. - Extracts text from all document elements including paragraphs, tables, - headers, footers, text boxes, and footnotes to ensure complete content - extraction for documents like resumes, reports, and complex layouts. + This handler provides complete text extraction from Microsoft Word documents, + including all document elements such as paragraphs, tables, headers, footers, + text boxes, and footnotes. It's designed to handle complex document layouts + commonly found in resumes, reports, and structured documents. + + Features: + - Extracts text from document body paragraphs + - Processes table content with cell-by-cell extraction + - Captures header and footer text from all sections + - Attempts to extract text from embedded text boxes and shapes + - Handles footnotes and endnotes when available + - Deduplicates repeated content + - Cleans and normalizes extracted text + + Example: + >>> handler = DOCXHandler() + >>> text = handler.extract(Path("document.docx")) + >>> print(text) + "Document title\nParagraph content...\nTable data | Column 2..." + + >>> # Async extraction + >>> text = await handler.extract_async(Path("document.docx")) """ def _clean_text(self, text: str) -> str: - """Clean and normalize extracted text.""" + """Clean and normalize extracted text. + + Performs various text cleaning operations to improve readability + and consistency of extracted content. + + Args: + text (str): Raw text to be cleaned. + + Returns: + str: Cleaned and normalized text with proper spacing and formatting. + + Note: + - Normalizes multiple whitespace characters to single spaces + - Removes excessive consecutive dots/periods + - Fixes spacing around punctuation marks + - Strips leading and trailing whitespace + """ if not text: return "" - # Normalize whitespace + # Normalize whitespace (replace multiple spaces, tabs, newlines with single space) text = re.sub(r'\s+', ' ', text) - # Remove excessive dots/periods + # Remove excessive dots/periods (likely formatting artifacts) text = re.sub(r'\.{2,}', ' ', text) - # Clean up spacing around punctuation + # Clean up spacing around punctuation (remove spaces before punctuation) text = re.sub(r'\s+([.!?,:;])', r'\1', text) return text.strip() def extract(self, file_path: Path, config: Optional[dict] = None) -> str: + """Extract text from a DOCX file with comprehensive content capture. + + Performs thorough text extraction from all available document elements + including body text, tables, headers, footers, and embedded content. + + Args: + file_path (Path): Path to the DOCX file to extract text from. + config (Optional[dict], optional): Configuration options for extraction. + Currently not used but reserved for future enhancements. + + Returns: + str: Extracted and cleaned text from the document with proper formatting. + Returns empty string if no text is found. + + Raises: + ExtractionError: If the file cannot be read or processed, or if the + python-docx library is not available. + + Note: + - Text is deduplicated to avoid repeated content from overlapping elements + - Table content is formatted with pipe separators between columns + - Special content (footnotes, text boxes) is labeled with descriptive tags + - Sentence breaks are automatically inserted for better readability + """ try: from docx import Document import re + # Load the document doc = Document(file_path) text_parts = [] - processed_text = set() # To avoid duplicates + processed_text = set() # Track processed text to avoid duplicates - # Extract text from paragraphs + # Extract text from main document paragraphs for paragraph in doc.paragraphs: text = paragraph.text.strip() if text and text not in processed_text: text_parts.append(text) processed_text.add(text) - # Extract text from tables + # Extract text from all tables in the document for table in doc.tables: table_texts = [] for row in table.rows: row_text = [] for cell in row.cells: - # Get text from all paragraphs in the cell + # Process each paragraph within the cell cell_paragraphs = [] for paragraph in cell.paragraphs: text = paragraph.text.strip() @@ -69,15 +129,16 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: if cell_paragraphs: row_text.append(" ".join(cell_paragraphs)) if row_text: + # Join cell contents with pipe separator for table structure table_texts.append(" | ".join(row_text)) - # Add table content if any + # Add table content to main text collection if table_texts: text_parts.extend(table_texts) - # Extract text from headers and footers + # Extract text from headers and footers across all document sections for section in doc.sections: - # Header text + # Process header content if section.header: for paragraph in section.header.paragraphs: text = paragraph.text.strip() @@ -85,7 +146,7 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: text_parts.append(text) processed_text.add(text) - # Footer text + # Process footer content if section.footer: for paragraph in section.footer.paragraphs: text = paragraph.text.strip() @@ -93,9 +154,9 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: text_parts.append(text) processed_text.add(text) - # Try to extract text from footnotes and endnotes + # Attempt to extract footnotes and endnotes (may not be available in all documents) try: - # Extract footnotes + # Extract footnotes if present if hasattr(doc, 'footnotes'): for footnote in doc.footnotes: for paragraph in footnote.paragraphs: @@ -104,7 +165,7 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: text_parts.append(f"[Footnote: {text}]") processed_text.add(text) - # Extract endnotes + # Extract endnotes if present if hasattr(doc, 'endnotes'): for endnote in doc.endnotes: for paragraph in endnote.paragraphs: @@ -113,17 +174,17 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: text_parts.append(f"[Endnote: {text}]") processed_text.add(text) except Exception: - # If footnotes/endnotes extraction fails, continue + # Footnote/endnote extraction is optional - continue if it fails pass - # Try to extract text from text boxes and shapes using xml parsing + # Attempt to extract text from embedded text boxes and shapes using XML parsing try: from docx.oxml.ns import qn - # Look for drawing elements containing text + # Iterate through document XML elements to find drawing content for element in doc.element.body.iter(): if element.tag.endswith('}txbxContent'): - # Extract text from text boxes + # Extract text from text box elements for para in element.iter(): if para.tag.endswith('}t') and para.text: text = para.text.strip() @@ -131,16 +192,16 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: text_parts.append(f"[TextBox: {text}]") processed_text.add(text) except Exception: - # If text box extraction fails, continue + # Text box extraction is optional - continue if it fails pass - # Clean up and join text + # Process and format the final output if text_parts: - # Clean each part and join with newlines + # Clean each text part and filter out empty content cleaned_parts = [self._clean_text(part) for part in text_parts if part.strip()] result = "\n".join(cleaned_parts) - # Ensure proper sentence breaks for readability + # Add proper sentence breaks for improved readability result = re.sub(r'([.!?])\s*([A-Z])', r'\1\n\2', result) return result.strip() @@ -152,6 +213,28 @@ def extract(self, file_path: Path, config: Optional[dict] = None) -> str: async def extract_async( self, file_path: Path, config: Optional[dict] = None ) -> str: + """Asynchronously extract text from a DOCX file. + + Provides non-blocking text extraction by running the synchronous + extraction method in a separate thread. + + Args: + file_path (Path): Path to the DOCX file to extract text from. + config (Optional[dict], optional): Configuration options for extraction. + Currently not used but reserved for future enhancements. + + Returns: + str: Extracted and cleaned text from the document with proper formatting. + Returns empty string if no text is found. + + Raises: + ExtractionError: If the file cannot be read or processed, or if the + python-docx library is not available. + + Note: + This method uses asyncio.to_thread() to run the synchronous extraction + in a thread pool, making it suitable for async/await usage patterns. + """ import asyncio return await asyncio.to_thread(self.extract, file_path, config) From 36ebc44d1a2cfe8917408dec6ca77e52fb0ef46b Mon Sep 17 00:00:00 2001 From: Mothilal-hire10x Date: Thu, 30 Oct 2025 12:02:23 +0530 Subject: [PATCH 3/4] Refactor dependency installation in CI workflow to remove unnecessary extras --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6ea6669..5133c9c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install .[dev,all] + pip install .[dev] pip install pytest-cov - name: Run tests with coverage From 650c519c8c46fd28504989121554f37ddb34920d Mon Sep 17 00:00:00 2001 From: Mothilal-hire10x Date: Thu, 30 Oct 2025 12:36:15 +0530 Subject: [PATCH 4/4] Update dependency installation in CI workflow to include all extras --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5133c9c..6ea6669 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install .[dev] + pip install .[dev,all] pip install pytest-cov - name: Run tests with coverage