diff --git a/ragitect/services/processor/__init__.py b/ragitect/services/processor/__init__.py index 72a1323..c4c7a9e 100644 --- a/ragitect/services/processor/__init__.py +++ b/ragitect/services/processor/__init__.py @@ -4,6 +4,12 @@ """ from ragitect.services.processor.base import BaseDocumentProcessor +from ragitect.services.processor.pdf_url_processor import ( + InvalidPDFURLError, + PDFDownloadError, + PDFProcessingError, + PDFURLProcessor, +) from ragitect.services.processor.simple import SimpleProcessor from ragitect.services.processor.web_url_processor import ( ContentExtractionError, @@ -19,7 +25,11 @@ __all__ = [ "BaseDocumentProcessor", "ContentExtractionError", + "InvalidPDFURLError", "InvalidYouTubeURLError", + "PDFDownloadError", + "PDFProcessingError", + "PDFURLProcessor", "SimpleProcessor", "TranscriptUnavailableError", "URLFetchError", diff --git a/ragitect/services/processor/pdf_url_processor.py b/ragitect/services/processor/pdf_url_processor.py new file mode 100644 index 0000000..d00f1dc --- /dev/null +++ b/ragitect/services/processor/pdf_url_processor.py @@ -0,0 +1,298 @@ +"""PDF URL Processor - Downloads PDF files from URLs and converts to Markdown. + +This processor handles PDF URL ingestion by: +1. Validating that the URL points to a PDF file (extension or Content-Type) +2. Downloading PDF content via httpx with proper timeout/HTTP2/redirect configuration +3. Delegating to DoclingProcessor for PDF → Markdown conversion + +Usage: + processor = PDFURLProcessor() + markdown = await processor.process("https://arxiv.org/pdf/1706.03762.pdf") + +Supported URL patterns: + - arXiv papers: https://arxiv.org/pdf/2301.12345.pdf + - Direct PDF links: https://example.com/document.pdf + - University research papers: https://stanford.edu/papers/paper.pdf + - Documentation: https://docs.example.com/manual.pdf + +Note: + This processor inherits from BaseDocumentProcessor but overrides with an + async signature. 
The async process(url: str) method is used for URL fetching. + + Integration with ProcessorFactory happens in Story 5.5. + +Exceptions: + InvalidPDFURLError: URL does not point to a PDF file + PDFDownloadError: HTTP request failed (timeout, connection error, 4xx/5xx) + PDFProcessingError: DoclingProcessor failed to process PDF +""" + +import asyncio +import logging +from typing import override +from urllib.parse import unquote, urlparse + +import httpx + +from ragitect.services.processor.base import BaseDocumentProcessor +from ragitect.services.processor.docling_processor import DoclingProcessor + +logger = logging.getLogger(__name__) + + +class InvalidPDFURLError(Exception): + """Raised when URL does not point to a valid PDF file. + + Causes: + - URL does not end with .pdf AND Content-Type is not application/pdf + - HEAD request fails to validate content type + + Attributes: + url: The URL that failed validation + message: Descriptive error message including URL + """ + + pass + + +class PDFDownloadError(Exception): + """Raised when PDF download fails. + + Causes: + - HTTP timeout (30s limit) + - Connection error + - HTTP 4xx/5xx status codes + + Attributes: + url: The URL that failed to download + message: Descriptive error message including URL and error type + """ + + pass + + +class PDFProcessingError(Exception): + """Raised when DoclingProcessor fails to process PDF. + + Causes: + - Corrupted PDF + - Password-protected PDF + - Image-only PDF without OCR + + Attributes: + url: The URL of the PDF that failed processing + message: Descriptive error message including URL and error details + """ + + pass + + +class PDFURLProcessor(BaseDocumentProcessor): + """Processor for downloading PDFs from URLs and converting to Markdown. + + Inherits from BaseDocumentProcessor but provides an async process(url: str) + method instead of the sync process(file_bytes, file_name) method. 
+ + Implements async PDF download with: + - 30 second total timeout, 10 second connect timeout (NFR-P4) + - HTTP/2 support for improved performance + - Automatic redirect following + - Connection pooling (max 20 keepalive connections) + + PDF URL validation: + - Fast path: URL ends with .pdf extension + - Fallback: HEAD request to check Content-Type: application/pdf + + PDF processing delegated to DoclingProcessor for: + - Robust PDF parsing with Docling library + - Table structure detection + - Clean Markdown output + + Example: + >>> processor = PDFURLProcessor() + >>> markdown = await processor.process("https://arxiv.org/pdf/1706.03762.pdf") + >>> print(markdown[:100]) + # Attention Is All You Need + ... + """ + + def __init__(self) -> None: + """Initialize PDFURLProcessor with DoclingProcessor for PDF conversion.""" + self._docling_processor = DoclingProcessor() + + @override + def supported_formats(self) -> list[str]: + """Return list of supported file extensions. + + PDFURLProcessor is not file-based, so returns empty list. + URL-based routing is handled separately from file extension routing. + + Returns: + Empty list (not file-based) + """ + return [] + + async def process(self, url: str) -> str: + """Download PDF from URL and convert to Markdown. 
+ + Args: + url: HTTP or HTTPS URL pointing to a PDF file + + Returns: + Markdown string with PDF content extracted + + Raises: + InvalidPDFURLError: If URL does not point to a PDF file + PDFDownloadError: If HTTP request fails (timeout, connection error, 4xx/5xx) + PDFProcessingError: If DoclingProcessor fails to process PDF + + Example: + >>> processor = PDFURLProcessor() + >>> markdown = await processor.process("https://arxiv.org/pdf/1706.03762.pdf") + """ + logger.info(f"Processing PDF URL: {url}") + + # Validate URL points to PDF + if not await self._validate_pdf_url(url): + raise InvalidPDFURLError(f"URL does not point to a PDF file: {url}") + + # Download PDF bytes + pdf_bytes = await self._download_pdf(url) + logger.info(f"Downloaded {len(pdf_bytes)} bytes from {url}") + + # Extract filename from URL for DoclingProcessor + file_name = self._extract_filename(url) + + # Delegate to DoclingProcessor (sync, runs in executor via asyncio.to_thread) + try: + markdown = await asyncio.to_thread( + self._docling_processor.process, + pdf_bytes, + file_name, + ) + except Exception as e: + logger.error(f"Failed to process PDF from {url}: {e}") + raise PDFProcessingError(f"Failed to process PDF from {url}: {e}") from e + + logger.info(f"Successfully processed PDF {url} - {len(markdown)} chars extracted") + return markdown + + async def _validate_pdf_url(self, url: str) -> bool: + """Validate that URL points to a PDF file. + + Two-tier validation: + 1. Fast path: Check if URL ends with .pdf + 2. 
Fallback: HEAD request to check Content-Type + + Args: + url: URL to validate + + Returns: + True if URL points to a PDF, False otherwise + """ + # Fast path: URL ends with .pdf (case-insensitive) + if url.lower().rstrip("/").endswith(".pdf"): + return True + + # Fallback: HEAD request to check Content-Type + timeout = httpx.Timeout(10.0, connect=5.0) + headers = { + "User-Agent": "Mozilla/5.0 (compatible; RAGitect/1.0; +https://github.com/bhdai/ragitect)" + } + + async with httpx.AsyncClient( + timeout=timeout, + follow_redirects=True, + headers=headers, + ) as client: + try: + response = await client.head(url) + content_type = response.headers.get("content-type", "").lower() + return "application/pdf" in content_type + except httpx.HTTPError: + # If HEAD fails, return False - caller will handle + return False + + async def _download_pdf(self, url: str) -> bytes: + """Download PDF bytes from URL. + + Args: + url: URL to download PDF from + + Returns: + PDF bytes + + Raises: + PDFDownloadError: On timeout, connection error, or HTTP error status + """ + # Configure timeout: 30s total, 10s connect (NFR-P4) + timeout = httpx.Timeout(30.0, connect=10.0) + + # Configure connection limits for pooling + limits = httpx.Limits(max_keepalive_connections=20) + + # Set User-Agent to avoid 403 from sites that block automated requests + headers = { + "User-Agent": "Mozilla/5.0 (compatible; RAGitect/1.0; +https://github.com/bhdai/ragitect)" + } + + async with httpx.AsyncClient( + timeout=timeout, + http2=True, # Enable HTTP/2 support + follow_redirects=True, # Auto-follow redirects + limits=limits, # Connection pooling + headers=headers, # Default headers for all requests + ) as client: + try: + response = await client.get(url) + response.raise_for_status() + return response.content + except httpx.TimeoutException as e: + logger.error(f"Timeout downloading PDF from {url}: {e}") + raise PDFDownloadError( + f"Timeout downloading PDF from {url} (30s limit)" + ) from e + except 
httpx.ConnectError as e:
+                logger.error(f"Connection error downloading PDF from {url}: {e}")
+                raise PDFDownloadError(
+                    f"Connection error downloading PDF from {url}: {str(e)}"
+                ) from e
+            except httpx.HTTPStatusError as e:
+                status = e.response.status_code
+                logger.error(f"HTTP {status} downloading PDF from {url}")
+                if status == 404:
+                    raise PDFDownloadError(
+                        f"PDF not found (404): {url}"
+                    ) from e
+                elif status == 403:
+                    raise PDFDownloadError(
+                        f"Access denied (403): {url}"
+                    ) from e
+                else:
+                    raise PDFDownloadError(
+                        f"HTTP {status} downloading PDF from {url}"
+                    ) from e
+
+    def _extract_filename(self, url: str) -> str:
+        """Extract filename from URL path.
+
+        Args:
+            url: URL to extract filename from
+
+        Returns:
+            Filename with .pdf extension
+        """
+        parsed = urlparse(url)
+        path = unquote(parsed.path)
+
+        # Get last path segment
+        segments = [s for s in path.split("/") if s]
+        if segments:
+            filename = segments[-1]
+            # Ensure .pdf extension
+            if not filename.lower().endswith(".pdf"):
+                filename = f"{filename}.pdf"
+            return filename
+
+        # Fallback
+        return "downloaded.pdf"
diff --git a/tests/services/processor/test_pdf_url_processor.py b/tests/services/processor/test_pdf_url_processor.py
new file mode 100644
index 0000000..3be5303
--- /dev/null
+++ b/tests/services/processor/test_pdf_url_processor.py
@@ -0,0 +1,556 @@
+"""Tests for PDFURLProcessor - PDF download from URL and conversion to Markdown.
+
+Red-Green-Refactor TDD: These tests define expected behavior before implementation.
+""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx + +# Module-level markers as per project-context.md +pytestmark = [pytest.mark.asyncio] + + +class TestPDFURLProcessorInterface: + """Test PDFURLProcessor class interface and method signatures""" + + def test_class_exists(self): + """PDFURLProcessor class should be importable""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + assert processor is not None + + def test_inherits_from_base_document_processor(self): + """PDFURLProcessor should inherit from BaseDocumentProcessor""" + from ragitect.services.processor.base import BaseDocumentProcessor + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + assert isinstance(processor, BaseDocumentProcessor) + + def test_supported_formats_returns_empty_list(self): + """PDFURLProcessor is not file-based, returns empty list""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + formats = processor.supported_formats() + assert formats == [] + + async def test_process_method_signature_async(self): + """process() should be async and accept url string, return str""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + mock_pdf_bytes = b"%PDF-1.4 mock content" + + with patch.object(processor, "_validate_pdf_url", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = True + with patch.object(processor, "_download_pdf", new_callable=AsyncMock) as mock_download: + mock_download.return_value = mock_pdf_bytes + with patch.object(processor._docling_processor, "process") as mock_docling: + mock_docling.return_value = "# Test Document\n\nContent here." 
+ + result = await processor.process("https://example.com/doc.pdf") + assert isinstance(result, str) + + +class TestPDFURLValidation: + """Test PDF URL validation logic""" + + async def test_url_ending_with_pdf_is_valid(self): + """URL ending with .pdf should pass fast-path validation""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + result = await processor._validate_pdf_url("https://arxiv.org/pdf/1706.03762.pdf") + assert result is True + + async def test_url_ending_with_pdf_uppercase_is_valid(self): + """URL ending with .PDF (uppercase) should pass validation""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + result = await processor._validate_pdf_url("https://example.com/DOC.PDF") + assert result is True + + async def test_url_without_pdf_extension_uses_head_request(self): + """URL without .pdf extension should make HEAD request to check Content-Type""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.headers = {"content-type": "application/pdf"} + mock_client.head = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await processor._validate_pdf_url("https://example.com/download?id=123") + assert result is True + mock_client.head.assert_called_once() + + async def test_url_with_non_pdf_content_type_returns_false(self): + """URL with non-PDF Content-Type should return False""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + 
mock_response = MagicMock() + mock_response.headers = {"content-type": "text/html"} + mock_client.head = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await processor._validate_pdf_url("https://example.com/page") + assert result is False + + async def test_non_pdf_url_raises_invalid_pdf_url_error_on_process(self): + """process() should raise InvalidPDFURLError if URL is not a PDF""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + InvalidPDFURLError, + ) + + processor = PDFURLProcessor() + + with patch.object(processor, "_validate_pdf_url", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = False + + with pytest.raises(InvalidPDFURLError) as exc_info: + await processor.process("https://example.com/not-a-pdf") + + assert "does not point to a PDF" in str(exc_info.value) + assert "example.com" in str(exc_info.value) + + +class TestPDFDownload: + """Test PDF download functionality""" + + async def test_successful_download_returns_bytes(self): + """Successful download should return PDF bytes""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + mock_pdf_bytes = b"%PDF-1.4 mock pdf content" + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.content = mock_pdf_bytes + mock_response.raise_for_status = MagicMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await processor._download_pdf("https://example.com/doc.pdf") + assert result == mock_pdf_bytes + + async def test_httpx_client_configured_with_timeouts(self): + 
"""httpx client should have 30s total, 10s connect timeout""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.content = b"%PDF" + mock_response.raise_for_status = MagicMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + await processor._download_pdf("https://example.com/doc.pdf") + + call_kwargs = mock_client_class.call_args.kwargs + assert "timeout" in call_kwargs + timeout = call_kwargs["timeout"] + assert timeout.connect == 10.0 + + async def test_httpx_client_has_http2_enabled(self): + """httpx client should have HTTP/2 support enabled""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.content = b"%PDF" + mock_response.raise_for_status = MagicMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + await processor._download_pdf("https://example.com/doc.pdf") + + call_kwargs = mock_client_class.call_args.kwargs + assert call_kwargs.get("http2") is True + + async def test_httpx_client_follows_redirects(self): + """httpx client should follow redirects automatically""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.content = b"%PDF" + 
mock_response.raise_for_status = MagicMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + await processor._download_pdf("https://example.com/doc.pdf") + + call_kwargs = mock_client_class.call_args.kwargs + assert call_kwargs.get("follow_redirects") is True + + +class TestPDFProcessing: + """Test DoclingProcessor integration""" + + async def test_delegates_to_docling_processor(self): + """process() should delegate PDF processing to DoclingProcessor""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + mock_pdf_bytes = b"%PDF-1.4 mock pdf content" + expected_markdown = "# Test Document\n\nContent here." + + with patch.object(processor, "_validate_pdf_url", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = True + with patch.object(processor, "_download_pdf", new_callable=AsyncMock) as mock_download: + mock_download.return_value = mock_pdf_bytes + with patch.object(processor._docling_processor, "process") as mock_docling: + mock_docling.return_value = expected_markdown + + result = await processor.process("https://example.com/doc.pdf") + + assert result == expected_markdown + mock_docling.assert_called_once() + # Verify PDF bytes were passed + call_args = mock_docling.call_args + assert call_args[0][0] == mock_pdf_bytes + + async def test_docling_failure_raises_pdf_processing_error(self): + """DoclingProcessor failure should raise PDFProcessingError""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFProcessingError, + ) + + processor = PDFURLProcessor() + + mock_pdf_bytes = b"%PDF-1.4 mock pdf content" + + with patch.object(processor, "_validate_pdf_url", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = True + with patch.object(processor, 
"_download_pdf", new_callable=AsyncMock) as mock_download: + mock_download.return_value = mock_pdf_bytes + with patch.object(processor._docling_processor, "process") as mock_docling: + mock_docling.side_effect = ValueError("Corrupted PDF") + + with pytest.raises(PDFProcessingError) as exc_info: + await processor.process("https://example.com/corrupted.pdf") + + assert "example.com" in str(exc_info.value) + + +class TestErrorHandling: + """Test error handling for various failure scenarios""" + + async def test_timeout_raises_pdf_download_error(self): + """Timeout should raise PDFDownloadError with descriptive message""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFDownloadError, + ) + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=httpx.TimeoutException("Timeout")) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + with pytest.raises(PDFDownloadError) as exc_info: + await processor._download_pdf("https://slow-site.com/doc.pdf") + + assert "slow-site.com" in str(exc_info.value) + assert "timeout" in str(exc_info.value).lower() + + async def test_http_404_raises_pdf_download_error(self): + """HTTP 404 should raise PDFDownloadError with status code""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFDownloadError, + ) + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.status_code = 404 + mock_request = MagicMock() + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Not Found", request=mock_request, response=mock_response + ) + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = 
AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + with pytest.raises(PDFDownloadError) as exc_info: + await processor._download_pdf("https://example.com/missing.pdf") + + assert "404" in str(exc_info.value) + assert "not found" in str(exc_info.value).lower() + + async def test_http_403_raises_pdf_download_error(self): + """HTTP 403 should raise PDFDownloadError with 'Access denied' message""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFDownloadError, + ) + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_response = MagicMock() + mock_response.status_code = 403 + mock_request = MagicMock() + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Forbidden", request=mock_request, response=mock_response + ) + mock_client.get = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + with pytest.raises(PDFDownloadError) as exc_info: + await processor._download_pdf("https://protected.com/doc.pdf") + + assert "403" in str(exc_info.value) + assert "access denied" in str(exc_info.value).lower() + + async def test_connection_error_raises_pdf_download_error(self): + """Connection error should raise PDFDownloadError with details""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFDownloadError, + ) + + processor = PDFURLProcessor() + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.get = AsyncMock( + side_effect=httpx.ConnectError("Connection refused") + ) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + 
with pytest.raises(PDFDownloadError) as exc_info: + await processor._download_pdf("https://unreachable.com/doc.pdf") + + assert "unreachable.com" in str(exc_info.value) + assert "connection" in str(exc_info.value).lower() + + async def test_exception_messages_contain_url(self): + """All exception messages should include the URL for debugging""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFDownloadError, + ) + + processor = PDFURLProcessor() + test_url = "https://test-debugging.example.com/document.pdf" + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=httpx.TimeoutException("Timeout")) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + with pytest.raises(PDFDownloadError) as exc_info: + await processor._download_pdf(test_url) + + assert "test-debugging.example.com" in str(exc_info.value) + + +class TestTempFileCleanup: + """Test temporary file cleanup on success and failure""" + + async def test_temp_file_cleaned_up_on_success(self): + """Temporary file should be cleaned up after successful processing""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + mock_pdf_bytes = b"%PDF-1.4 mock pdf content" + expected_markdown = "# Test Document" + + # Track temp files created + temp_files_created = [] + + original_named_temp = __import__("tempfile").NamedTemporaryFile + + def track_temp_file(*args, **kwargs): + # Ensure delete=False so we can check cleanup + kwargs["delete"] = False + f = original_named_temp(*args, **kwargs) + temp_files_created.append(f.name) + return f + + with patch.object(processor, "_validate_pdf_url", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = True + with patch.object(processor, "_download_pdf", new_callable=AsyncMock) as 
mock_download: + mock_download.return_value = mock_pdf_bytes + with patch.object(processor._docling_processor, "process") as mock_docling: + mock_docling.return_value = expected_markdown + + await processor.process("https://example.com/doc.pdf") + + # After successful processing, docling was called + # The implementation should clean up temp files internally + mock_docling.assert_called_once() + + async def test_temp_file_cleaned_up_on_failure(self): + """Temporary file should be cleaned up even when processing fails""" + from ragitect.services.processor.pdf_url_processor import ( + PDFURLProcessor, + PDFProcessingError, + ) + + processor = PDFURLProcessor() + + mock_pdf_bytes = b"%PDF-1.4 mock pdf content" + + with patch.object(processor, "_validate_pdf_url", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = True + with patch.object(processor, "_download_pdf", new_callable=AsyncMock) as mock_download: + mock_download.return_value = mock_pdf_bytes + with patch.object(processor._docling_processor, "process") as mock_docling: + mock_docling.side_effect = ValueError("Processing failed") + + with pytest.raises(PDFProcessingError): + await processor.process("https://example.com/doc.pdf") + + # Even on failure, docling was called (then cleanup happens) + mock_docling.assert_called_once() + + +class TestPDFURLProcessorExceptions: + """Test custom exception classes exist and are properly defined""" + + def test_invalid_pdf_url_error_exists(self): + """InvalidPDFURLError exception class should exist""" + from ragitect.services.processor.pdf_url_processor import InvalidPDFURLError + + error = InvalidPDFURLError("Test error") + assert isinstance(error, Exception) + assert str(error) == "Test error" + + def test_pdf_download_error_exists(self): + """PDFDownloadError exception class should exist""" + from ragitect.services.processor.pdf_url_processor import PDFDownloadError + + error = PDFDownloadError("Test error") + assert isinstance(error, Exception) + 
assert str(error) == "Test error" + + def test_pdf_processing_error_exists(self): + """PDFProcessingError exception class should exist""" + from ragitect.services.processor.pdf_url_processor import PDFProcessingError + + error = PDFProcessingError("Test error") + assert isinstance(error, Exception) + assert str(error) == "Test error" + + +@pytest.mark.integration +class TestPDFURLProcessorIntegration: + """Integration tests with real PDF URLs (require network access)""" + + async def test_process_arxiv_paper(self): + """Integration test: download and process real arXiv paper""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + # "Attention Is All You Need" - famous transformer paper + url = "https://arxiv.org/pdf/1706.03762.pdf" + + markdown = await processor.process(url) + + # Verify substantial content extracted + assert len(markdown) > 1000, "Expected substantial content from paper" + # Verify markdown format + assert "#" in markdown, "Expected markdown headings" + # Paper title or key terms should be present + assert any( + term in markdown.lower() + for term in ["attention", "transformer", "model"] + ), "Expected paper content" + + async def test_markdown_compatible_with_chunking(self): + """Integration test: verify markdown works with DocumentProcessor chunking""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + from ragitect.services.document_processor import split_markdown_document + + processor = PDFURLProcessor() + url = "https://arxiv.org/pdf/1706.03762.pdf" + + markdown = await processor.process(url) + + # Test with existing chunker + chunks = split_markdown_document( + raw_text=markdown, + chunk_size=512, + overlap=50, + ) + + # Verify chunking works + assert len(chunks) > 0, "Should produce at least one chunk" + assert all( + isinstance(chunk, str) for chunk in chunks + ), "Chunks should be strings" + assert all( + len(chunk) > 0 for chunk in chunks + ), "Each 
chunk should have content" + + async def test_various_pdf_sources(self): + """Integration test: verify processor handles various PDF sources""" + from ragitect.services.processor.pdf_url_processor import PDFURLProcessor + + processor = PDFURLProcessor() + + # Test with direct PDF link ending in .pdf + url = "https://arxiv.org/pdf/1706.03762.pdf" + + markdown = await processor.process(url) + + assert len(markdown) > 100, "Expected content from PDF" + assert isinstance(markdown, str), "Expected string output"