diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e6fcf67 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,46 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [3.0.0] - 2026-01-30 + +### Security + +- **CRITICAL**: Removed client-side URL fetching to prevent SSRF vulnerabilities +- URLs are now passed to the server for secure server-side fetching +- Restricted `sign()` method to local files only (API limitation) + +### Changed + +- **BREAKING**: `sign()` only accepts local files (paths, bytes, file objects) - no URLs +- **BREAKING**: Most methods now accept `FileInputWithUrl` - URLs passed to server +- **BREAKING**: Removed client-side PDF parsing - leverage API's negative index support +- Methods like `rotate()`, `split()`, `delete_pages()` now support negative indices (-1 = last page) +- All methods except `sign()` and `create_redactions_ai()` accept URLs that are passed securely to the server + +### Removed + +- **BREAKING**: Removed `process_remote_file_input()` from public API (security risk) +- **BREAKING**: Removed `get_pdf_page_count()` from public API (client-side PDF parsing) +- **BREAKING**: Removed `is_valid_pdf()` from public API (internal use only) +- Removed ~200 lines of client-side PDF parsing code + +### Added + +- SSRF protection documentation in README +- Migration guide (docs/MIGRATION.md) +- Security best practices for handling remote files +- Support for negative page indices in all page-based methods + +## [2.0.0] - 2025-01-09 + +- Initial stable release with full API coverage +- Async-first design with httpx and aiofiles +- Comprehensive type hints and mypy strict mode +- Workflow builder with staged pattern +- Error hierarchy with typed exceptions diff --git a/docs/MIGRATION.md b/docs/MIGRATION.md new file mode 
100644 index 0000000..fbefc9a --- /dev/null +++ b/docs/MIGRATION.md @@ -0,0 +1,75 @@ +# Migration Guide: v2.x to v3.0 + +## Overview + +Version 3.0.0 introduces SSRF protection and removes client-side PDF parsing. + +## Key Changes + +### 1. `sign()` No Longer Accepts URLs (API Limitation) + +**Before (v2.x)**: +```python +result = await client.sign('https://example.com/document.pdf', {...}) +``` + +**After (v3.0)** - Fetch file first: +```python +import httpx + +async with httpx.AsyncClient() as http: + url = 'https://trusted-domain.com/document.pdf' + + # IMPORTANT: Validate URL + if not url.startswith('https://trusted-domain.com/'): + raise ValueError('URL not from trusted domain') + + response = await http.get(url, timeout=10.0) + response.raise_for_status() + pdf_bytes = response.content + +result = await client.sign(pdf_bytes, {...}) +``` + +### 2. Most Methods Now Accept URLs (Passed directly to DWS) + +Good news! These methods now support URLs passed securely to the DWS: +- `rotate()`, `split()`, `add_page()`, `duplicate_pages()`, `delete_pages()` +- `set_page_labels()`, `set_metadata()`, `optimize()` +- `flatten()`, `apply_instant_json()`, `apply_xfdf()` +- All redaction methods except `create_redactions_ai()` (local files only) +- `convert()`, `ocr()`, `watermark_*()`, `extract_*()`, `merge()`, `password_protect()` + +**Example**: +```python +# This now works! +result = await client.rotate('https://example.com/doc.pdf', 90, pages={'start': 0, 'end': 5}) +``` + +### 3. Negative Page Indices Now Supported + +Use negative indices for "from end" references: +- `-1` = last page +- `-2` = second-to-last page +- etc. + +**Examples**: +```python +# Rotate last 3 pages +await client.rotate(pdf, 90, pages={'start': -3, 'end': -1}) + +# Delete first and last pages +await client.delete_pages(pdf, [0, -1]) + +# Split: keep middle pages, excluding first and last +await client.split(pdf, [{'start': 1, 'end': -2}]) +``` + +### 4. 
Removed from Public API + +- `process_remote_file_input()` - No longer needed (URLs passed to server) +- `get_pdf_page_count()` - Use negative indices instead +- `is_valid_pdf()` - Let server validate (internal use only) + +**Still Available:** +- `is_remote_file_input()` - Helper to detect if input is a URL (still public) diff --git a/pyproject.toml b/pyproject.toml index 507caaa..164a32d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ nutrient_dws_scripts = [ [project] name = "nutrient-dws" -version = "2.0.0" +version = "3.0.0" description = "Python client library for Nutrient Document Web Services API" readme = "README.md" requires-python = ">=3.10" @@ -112,7 +112,7 @@ ignore = [ convention = "google" [tool.ruff.lint.per-file-ignores] -"tests/*" = [] # Don't require docstrings in tests, allow asserts +"tests/*" = ["D102"] # Don't require docstrings in tests [tool.mypy] python_version = "3.10" diff --git a/src/nutrient_dws/__init__.py b/src/nutrient_dws/__init__.py index 0cab185..f14eb6c 100644 --- a/src/nutrient_dws/__init__.py +++ b/src/nutrient_dws/__init__.py @@ -12,9 +12,11 @@ ValidationError, ) from nutrient_dws.inputs import ( + FileInput, + LocalFileInput, + UrlFileInput, is_remote_file_input, process_file_input, - process_remote_file_input, validate_file_input, ) from nutrient_dws.utils import get_library_version, get_user_agent @@ -22,14 +24,16 @@ __all__ = [ "APIError", "AuthenticationError", + "FileInput", + "LocalFileInput", "NetworkError", "NutrientClient", "NutrientError", + "UrlFileInput", "ValidationError", "get_library_version", "get_user_agent", "is_remote_file_input", "process_file_input", - "process_remote_file_input", "validate_file_input", ] diff --git a/src/nutrient_dws/builder/builder.py b/src/nutrient_dws/builder/builder.py index 5497e88..8064b68 100644 --- a/src/nutrient_dws/builder/builder.py +++ b/src/nutrient_dws/builder/builder.py @@ -85,7 +85,7 @@ def _register_asset(self, asset: FileInput) -> str: """Register an 
asset in the workflow and return its key for use in actions. Args: - asset: The asset to register + asset: The asset to register (must be local, not URL) Returns: The asset key that can be used in BuildActions diff --git a/src/nutrient_dws/client.py b/src/nutrient_dws/client.py index 38e2b2e..670878b 100644 --- a/src/nutrient_dws/client.py +++ b/src/nutrient_dws/client.py @@ -26,11 +26,9 @@ ) from nutrient_dws.inputs import ( FileInput, - get_pdf_page_count, - is_remote_file_input, + LocalFileInput, is_valid_pdf, process_file_input, - process_remote_file_input, ) from nutrient_dws.types.account_info import AccountInfo from nutrient_dws.types.build_actions import ( @@ -299,14 +297,18 @@ def _process_typed_workflow_result( async def sign( self, - pdf: FileInput, + pdf: LocalFileInput, data: CreateDigitalSignature | None = None, options: SignRequestOptions | None = None, ) -> BufferOutput: """Sign a PDF document. + **Security Note**: This method only accepts local files (paths, bytes, file objects) + due to an API limitation. URLs are not supported. For remote files, fetch them first + with proper URL validation. 
+ Args: - pdf: The PDF file to sign + pdf: The local PDF file to sign (no URLs) data: Signature data options: Additional options (image, graphicImage) @@ -315,12 +317,29 @@ async def sign( Example: ```python + # Example 1: Sign a local file result = await client.sign('document.pdf', { 'signatureType': 'cms', 'flatten': False, 'cadesLevel': 'b-lt' }) + # Example 2: Sign a remote file (fetch first) + import httpx + async with httpx.AsyncClient() as http: + # Validate URL before fetching + url = 'https://trusted-domain.com/document.pdf' + if not url.startswith('https://trusted-domain.com/'): + raise ValueError('URL not from trusted domain') + + response = await http.get(url, timeout=10.0) + response.raise_for_status() + pdf_bytes = response.content + + result = await client.sign(pdf_bytes, { + 'signatureType': 'cms' + }) + # Access the signed PDF buffer pdf_buffer = result['buffer'] @@ -332,35 +351,24 @@ async def sign( f.write(pdf_buffer) ``` """ - # Normalize the file input - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) + # Process as local file only (no URL support) + normalized_file = await process_file_input(pdf) if not is_valid_pdf(normalized_file[0]): raise ValidationError("Invalid pdf file", {"input": pdf}) - # Prepare optional files + # Prepare optional files (local files only) normalized_image = None normalized_graphic_image = None if options: if "image" in options: image = options["image"] - if is_remote_file_input(image): - normalized_image = await process_remote_file_input(str(image)) - else: - normalized_image = await process_file_input(image) + normalized_image = await process_file_input(image) if "graphicImage" in options: graphic_image = options["graphicImage"] - if is_remote_file_input(graphic_image): - normalized_graphic_image = await process_remote_file_input( - str(graphic_image) - ) - else: - normalized_graphic_image = await 
process_file_input(graphic_image) + normalized_graphic_image = await process_file_input(graphic_image) request_data = { "file": normalized_file, @@ -399,8 +407,10 @@ async def watermark_text( """Add a text watermark to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The input file to watermark + file: The input file to watermark (URLs supported) text: The watermark text options: Watermark options @@ -414,6 +424,9 @@ async def watermark_text( 'fontSize': 24 }) + # Works with URLs too + result = await client.watermark_text('https://example.com/doc.pdf', 'CONFIDENTIAL') + # Access the watermarked PDF buffer pdf_buffer = result['buffer'] @@ -438,8 +451,10 @@ async def watermark_image( """Add an image watermark to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The input file to watermark + file: The input file to watermark (URLs supported) image: The watermark image. Can be a file path (string or Path), bytes, file-like object, or a URL to a remote image. options: Watermark options @@ -488,8 +503,10 @@ async def convert( """Convert a document to a different format. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Args: - file: The input file to convert + file: The input file to convert (URLs supported) target_format: The target format to convert to Returns: @@ -508,6 +525,9 @@ async def convert( # Convert to HTML html_result = await client.convert('document.pdf', 'html') html_content = html_result['content'] + + # Works with URLs + pdf_result = await client.convert('https://example.com/document.docx', 'pdf') ``` """ builder = self.workflow().add_file_part(file) @@ -546,6 +566,8 @@ async def ocr( """Perform OCR (Optical Character Recognition) on a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: file: The input file to perform OCR on language: The language(s) to use for OCR. Can be a single language @@ -581,8 +603,10 @@ async def extract_text( """Extract text content from a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The file to extract text from + file: The file to extract text from (URLs supported) pages: Optional page range to extract text from Returns: @@ -596,6 +620,9 @@ async def extract_text( # Extract text from specific pages result = await client.extract_text('document.pdf', {'start': 0, 'end': 2}) + # Works with URLs + result = await client.extract_text('https://example.com/doc.pdf') + # Access the extracted text content text_content = result['data']['pages'][0]['plainText'] ``` @@ -627,8 +654,10 @@ async def extract_table( """Extract table content from a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Args: - file: The file to extract table from + file: The file to extract table from (URLs supported) pages: Optional page range to extract tables from Returns: @@ -638,6 +667,9 @@ async def extract_table( ```python result = await client.extract_table('document.pdf') + # Works with URLs + result = await client.extract_table('https://example.com/doc.pdf') + # Access the extracted tables tables = result['data']['pages'][0]['tables'] @@ -674,8 +706,10 @@ async def extract_key_value_pairs( """Extract key value pair content from a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The file to extract KVPs from + file: The file to extract KVPs from (URLs supported) pages: Optional page range to extract KVPs from Returns: @@ -685,6 +719,9 @@ async def extract_key_value_pairs( ```python result = await client.extract_key_value_pairs('document.pdf') + # Works with URLs + result = await client.extract_key_value_pairs('https://example.com/doc.pdf') + # Access the extracted key-value pairs kvps = result['data']['pages'][0]['keyValuePairs'] @@ -724,8 +761,11 @@ async def set_page_labels( """Set page labels for a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. 
+ Args: - pdf: The PDF file to modify + pdf: The PDF file to modify (URLs supported) labels: Array of label objects with pages and label properties Returns: @@ -737,17 +777,12 @@ async def set_page_labels( {'pages': [0, 1, 2], 'label': 'Cover'}, {'pages': [3, 4, 5], 'label': 'Chapter 1'} ]) + + # Works with URLs + result = await client.set_page_labels('https://example.com/doc.pdf', labels) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it result = ( await self.workflow() .add_file_part(pdf) @@ -767,8 +802,10 @@ async def password_protect( """Password protect a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - file: The file to protect + file: The file to protect (URLs supported) user_password: Password required to open the document owner_password: Password required to modify the document permissions: Optional array of permissions granted when opened with user password @@ -811,8 +848,11 @@ async def set_metadata( """Set metadata for a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. 
+ Args: - pdf: The PDF file to modify + pdf: The PDF file to modify (URLs supported) metadata: The metadata to set (title and/or author) Returns: @@ -824,17 +864,12 @@ async def set_metadata( 'title': 'My Document', 'author': 'John Doe' }) + + # Works with URLs + result = await client.set_metadata('https://example.com/doc.pdf', metadata) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it result = ( await self.workflow() .add_file_part(pdf) @@ -852,9 +887,12 @@ async def apply_instant_json( """Apply Instant JSON to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: - pdf: The PDF file to modify - instant_json_file: The Instant JSON file to apply + pdf: The PDF file to modify (URLs supported) + instant_json_file: The Instant JSON file to apply (URLs supported) Returns: The modified document @@ -862,17 +900,15 @@ async def apply_instant_json( Example: ```python result = await client.apply_instant_json('document.pdf', 'annotations.json') + + # Works with URLs + result = await client.apply_instant_json( + 'https://example.com/doc.pdf', + 'https://example.com/annotations.json' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it apply_json_action = BuildActions.apply_instant_json(instant_json_file) result = ( @@ -893,9 +929,12 @@ async def apply_xfdf( 
"""Apply XFDF to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: - pdf: The PDF file to modify - xfdf_file: The XFDF file to apply + pdf: The PDF file to modify (URLs supported) + xfdf_file: The XFDF file to apply (URLs supported) options: Optional settings for applying XFDF Returns: @@ -904,22 +943,21 @@ async def apply_xfdf( Example: ```python result = await client.apply_xfdf('document.pdf', 'annotations.xfdf') - # Or with options: + + # With options: result = await client.apply_xfdf( 'document.pdf', 'annotations.xfdf', {'ignorePageRotation': True, 'richTextEnabled': False} ) + + # Works with URLs + result = await client.apply_xfdf( + 'https://example.com/doc.pdf', + 'https://example.com/annotations.xfdf' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it apply_xfdf_action = BuildActions.apply_xfdf(xfdf_file, options) result = ( @@ -935,8 +973,10 @@ async def merge(self, files: list[FileInput]) -> BufferOutput: """Merge multiple documents into a single document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Args: - files: The files to merge + files: The files to merge (URLs supported) Returns: The merged document @@ -945,6 +985,13 @@ async def merge(self, files: list[FileInput]) -> BufferOutput: ```python result = await client.merge(['doc1.pdf', 'doc2.pdf', 'doc3.pdf']) + # Works with URLs + result = await client.merge([ + 'https://example.com/doc1.pdf', + 'doc2.pdf', + 'https://example.com/doc3.pdf' + ]) + # Access the merged PDF buffer pdf_buffer = result['buffer'] ``` @@ -972,6 +1019,9 @@ async def flatten( """Flatten annotations in a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. + Args: pdf: The PDF file to flatten annotation_ids: Optional list of specific annotation IDs to flatten. @@ -996,15 +1046,7 @@ async def flatten( result = await client.flatten('annotated-document.pdf', ['note1', 2, 'highlight3']) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it flatten_action = BuildActions.flatten(annotation_ids) result = ( @@ -1018,7 +1060,7 @@ async def flatten( async def create_redactions_ai( self, - pdf: FileInput, + pdf: LocalFileInput, criteria: str, redaction_state: Literal["stage", "apply"] = "stage", pages: PageRange | None = None, @@ -1026,8 +1068,11 @@ async def create_redactions_ai( ) -> BufferOutput: """Use AI to redact sensitive information in a document. + **Security Note**: This method only accepts local files (direct API call). + For remote files, fetch them first with proper validation. 
+ Args: - pdf: The PDF file to redact + pdf: The PDF file to redact (local files only, no URLs) criteria: AI redaction criteria redaction_state: Whether to stage or apply redactions (default: 'stage') pages: Optional pages to redact @@ -1052,17 +1097,11 @@ async def create_redactions_ai( ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) + # Process local files only + normalized_file = await process_file_input(pdf) - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # Use pages directly - no page count computation needed + normalized_pages = normalize_page_params(pages) if pages else None document_data: dict[str, Any] = { "file": "file", @@ -1118,11 +1157,14 @@ async def create_redactions_preset( """Create redaction annotations based on a preset pattern. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). 
+ Args: - pdf: The PDF file to create redactions in + pdf: The PDF file to create redactions in (URLs supported) preset: The preset pattern to search for (e.g., 'email-address', 'social-security-number') redaction_state: Whether to stage or apply redactions (default: 'stage') - pages: Optional page range to create redactions in + pages: Optional page range to create redactions in (supports negative indices) preset_options: Optional settings for the preset strategy options: Optional settings for creating redactions @@ -1132,29 +1174,25 @@ async def create_redactions_preset( Example: ```python result = await client.create_redactions_preset('document.pdf', 'email-address') + + # Works with URLs + result = await client.create_redactions_preset( + 'https://example.com/doc.pdf', + 'social-security-number' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get page count for handling negative indices - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # No client-side PDF validation - let server handle it + # Use negative indices: -1 means "to end", calculate limit accordingly + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 # Prepare strategy options with pages strategy_options = preset_options.copy() if preset_options else {} - if normalized_pages: - strategy_options["start"] = normalized_pages["start"] - if normalized_pages["end"] >= 0: - strategy_options["limit"] = ( - normalized_pages["end"] - normalized_pages["start"] + 1 - ) + strategy_options["start"] = start + # If end is -1, omit limit (search to end); otherwise calculate count + if end != -1: + strategy_options["limit"] = end - start + 1 
create_redactions_action = BuildActions.create_redactions_preset( preset, options, strategy_options @@ -1185,11 +1223,14 @@ async def create_redactions_regex( r"""Create redaction annotations based on a regular expression. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). + Args: - pdf: The PDF file to create redactions in + pdf: The PDF file to create redactions in (URLs supported) regex: The regular expression to search for redaction_state: Whether to stage or apply redactions (default: 'stage') - pages: Optional page range to create redactions in + pages: Optional page range to create redactions in (supports negative indices) regex_options: Optional settings for the regex strategy options: Optional settings for creating redactions @@ -1199,29 +1240,25 @@ async def create_redactions_regex( Example: ```python result = await client.create_redactions_regex('document.pdf', r'Account:\s*\d{8,12}') + + # Works with URLs + result = await client.create_redactions_regex( + 'https://example.com/doc.pdf', + r'\b\d{3}-\d{2}-\d{4}\b' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get page count for handling negative indices - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # No client-side PDF validation - let server handle it + # Use negative indices: -1 means "to end", calculate limit accordingly + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 # Prepare strategy options with pages strategy_options = regex_options.copy() if regex_options else {} - if normalized_pages: - 
strategy_options["start"] = normalized_pages["start"] - if normalized_pages["end"] >= 0: - strategy_options["limit"] = ( - normalized_pages["end"] - normalized_pages["start"] + 1 - ) + strategy_options["start"] = start + # If end is -1, omit limit (search to end); otherwise calculate count + if end != -1: + strategy_options["limit"] = end - start + 1 create_redactions_action = BuildActions.create_redactions_regex( regex, options, strategy_options @@ -1252,11 +1289,14 @@ async def create_redactions_text( """Create redaction annotations based on text. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). + Args: - pdf: The PDF file to create redactions in + pdf: The PDF file to create redactions in (URLs supported) text: The text to search for redaction_state: Whether to stage or apply redactions (default: 'stage') - pages: Optional page range to create redactions in + pages: Optional page range to create redactions in (supports negative indices) text_options: Optional settings for the text strategy options: Optional settings for creating redactions @@ -1266,29 +1306,25 @@ async def create_redactions_text( Example: ```python result = await client.create_redactions_text('document.pdf', 'email@example.com') + + # Works with URLs + result = await client.create_redactions_text( + 'https://example.com/doc.pdf', + 'CONFIDENTIAL' + ) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get page count for handling negative indices - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) if pages else None + # No client-side PDF validation - let server 
handle it + # Use negative indices: -1 means "to end", calculate limit accordingly + start = pages.get("start", 0) if pages else 0 + end = pages.get("end", -1) if pages else -1 # Prepare strategy options with pages strategy_options = text_options.copy() if text_options else {} - if normalized_pages: - strategy_options["start"] = normalized_pages["start"] - if normalized_pages["end"] >= 0: - strategy_options["limit"] = ( - normalized_pages["end"] - normalized_pages["start"] + 1 - ) + strategy_options["start"] = start + # If end is -1, omit limit (search to end); otherwise calculate count + if end != -1: + strategy_options["limit"] = end - start + 1 create_redactions_action = BuildActions.create_redactions_text( text, options, strategy_options @@ -1310,8 +1346,10 @@ async def create_redactions_text( async def apply_redactions(self, pdf: FileInput) -> BufferOutput: """Apply staged redaction into the PDF. + **Note**: URLs are passed to the server for secure server-side fetching. + Args: - pdf: The PDF file with redaction annotations to apply + pdf: The PDF file with redaction annotations to apply (URLs supported) Returns: The document with applied redactions @@ -1330,15 +1368,7 @@ async def apply_redactions(self, pdf: FileInput) -> BufferOutput: """ apply_redactions_action = BuildActions.apply_redactions() - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it result = ( await self.workflow() .add_file_part(pdf, None, [apply_redactions_action]) @@ -1357,61 +1387,60 @@ async def rotate( """Rotate pages in a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. 
+ Supports negative page indices (-1 = last page, -2 = second-to-last, etc.). + Args: - pdf: The PDF file to rotate + pdf: The PDF file to rotate (URLs supported) angle: Rotation angle (90, 180, or 270 degrees) - pages: Optional page range to rotate + pages: Optional page range to rotate (supports negative indices) Returns: The entire document with specified pages rotated Example: ```python + # Rotate entire document result = await client.rotate('document.pdf', 90) - # Rotate specific pages: + # Rotate specific pages result = await client.rotate('document.pdf', 90, {'start': 1, 'end': 3}) + + # Rotate with URL + result = await client.rotate('https://example.com/doc.pdf', 90) + + # Rotate last 3 pages using negative indices + result = await client.rotate('document.pdf', 90, {'start': -3, 'end': -1}) ``` """ rotate_action = BuildActions.rotate(angle) - - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - workflow = self.workflow() if pages: - page_count = get_pdf_page_count(normalized_file[0]) - normalized_pages = normalize_page_params(pages, page_count) + # Use negative index support (-1 = last page) + # No need for client-side PDF parsing + start = pages.get("start", 0) + end = pages.get("end", -1) - # Add pages before the range to rotate - if normalized_pages["start"] > 0: + # Add pages before the rotation range + if start != 0: part_options = cast( "FilePartOptions", - {"pages": {"start": 0, "end": normalized_pages["start"] - 1}}, + {"pages": {"start": 0, "end": start - 1}}, ) workflow = workflow.add_file_part(pdf, part_options) - # Add the specific pages with rotation action - part_options = cast("FilePartOptions", {"pages": normalized_pages}) + # Add the rotation range + part_options = cast( + "FilePartOptions", {"pages": {"start": start, 
"end": end}} + ) workflow = workflow.add_file_part(pdf, part_options, [rotate_action]) - # Add pages after the range to rotate - if normalized_pages["end"] < page_count - 1: + # Add pages after the rotation range (unless end is -1) + if end != -1: part_options = cast( "FilePartOptions", - { - "pages": { - "start": normalized_pages["end"] + 1, - "end": page_count - 1, - } - }, + {"pages": {"start": end + 1, "end": -1}}, ) workflow = workflow.add_file_part(pdf, part_options) else: @@ -1427,10 +1456,15 @@ async def add_page( """Add blank pages to a document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Index must be non-negative. If the index exceeds the document's page count, + the server will return an error. + Args: - pdf: The PDF file to add pages to + pdf: The PDF file to add pages to (URLs supported) count: The number of blank pages to add index: Optional index where to add the blank pages (0-based). If not provided, pages are added at the end. + Must be non-negative. 
Returns: The document with added pages @@ -1442,57 +1476,38 @@ async def add_page( # Add 1 blank page after the first page (at index 1) result = await client.add_page('document.pdf', 1, 1) + + # Works with URLs + result = await client.add_page('https://example.com/doc.pdf', 3) ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # If no index is provided or it's the end of the document, simply add pages at the end + # No client-side PDF validation - let server handle it + # If no index is provided, simply add pages at the end if index is None: builder = self.workflow() - builder.add_file_part(pdf) - - # Add the specified number of blank pages builder = builder.add_new_page({"pageCount": count}) - result = await builder.output_pdf().execute() else: - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Validate that the index is within range - if index < 0 or index > page_count: - raise ValidationError( - f"Index {index} is out of range (document has {page_count} pages)" - ) + # Validate index is non-negative + if index < 0: + raise ValidationError(f"Index must be non-negative, got: {index}") builder = self.workflow() # Add pages before the specified index if index > 0: - before_pages = normalize_page_params( - {"start": 0, "end": index - 1}, page_count - ) + before_pages = normalize_page_params({"start": 0, "end": index - 1}) part_options = cast("FilePartOptions", {"pages": before_pages}) builder = builder.add_file_part(pdf, part_options) # Add the blank pages builder = builder.add_new_page({"pageCount": count}) - # Add pages after the specified index - if index < page_count: - after_pages = normalize_page_params( - {"start": index, "end": page_count - 1}, page_count - ) - part_options = 
cast("FilePartOptions", {"pages": after_pages}) - builder = builder.add_file_part(pdf, part_options) + # Add pages after the specified index (use -1 for "to end") + after_pages = normalize_page_params({"start": index, "end": -1}) + part_options = cast("FilePartOptions", {"pages": after_pages}) + builder = builder.add_file_part(pdf, part_options) result = await builder.output_pdf().execute() @@ -1504,9 +1519,12 @@ async def split( """Split a PDF document into multiple parts based on page ranges. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page). + Args: - pdf: The PDF file to split - page_ranges: Array of page ranges to extract + pdf: The PDF file to split (URLs supported) + page_ranges: Array of page ranges to extract (supports negative indices) Returns: An array of PDF documents, one for each page range @@ -1517,42 +1535,27 @@ async def split( {'start': 0, 'end': 2}, # Pages 0, 1, 2 {'start': 3, 'end': 5} # Pages 3, 4, 5 ]) + + # Works with URLs and negative indices + results = await client.split('https://example.com/doc.pdf', [ + {'start': 0, 'end': 4}, # First 5 pages + {'start': 5, 'end': -1} # Remaining pages to end + ]) ``` """ if not page_ranges or len(page_ranges) == 0: raise ValidationError("At least one page range is required for splitting") - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Normalize and validate all page ranges - normalized_ranges = [ - normalize_page_params(page_range, page_count) for page_range in page_ranges - ] - - # Validate that all page ranges are within bounds - for 
page_range in normalized_ranges: - if page_range["start"] > page_range["end"]: - raise ValidationError( - f"Page range {page_range} is invalid (start > end)" - ) - - # Create a separate workflow for each page range + # No client-side PDF validation - server handles it + # Use negative indices directly - no page count needed import asyncio from typing import cast as typing_cast - async def create_split_pdf(page_range: Pages) -> BufferOutput: + async def create_split_pdf(page_range: PageRange) -> BufferOutput: builder = self.workflow() - part_options = cast("FilePartOptions", {"pages": page_range}) + # Normalize pages to ensure we have start/end + normalized = normalize_page_params(page_range) + part_options = cast("FilePartOptions", {"pages": normalized}) builder = builder.add_file_part(pdf, part_options) result = await builder.output_pdf().execute() return typing_cast( @@ -1560,7 +1563,7 @@ async def create_split_pdf(page_range: Pages) -> BufferOutput: ) # Execute all workflows in parallel and process the results - tasks = [create_split_pdf(page_range) for page_range in normalized_ranges] + tasks = [create_split_pdf(page_range) for page_range in page_ranges] results = await asyncio.gather(*tasks) return results @@ -1571,8 +1574,11 @@ async def duplicate_pages( """Create a new PDF containing only the specified pages in the order provided. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page, -2 = second-to-last, etc.). 
+ Args: - pdf: The PDF file to extract pages from + pdf: The PDF file to extract pages from (URLs supported) page_indices: Array of page indices to include in the new PDF (0-based) Negative indices count from the end of the document (e.g., -1 is the last page) @@ -1592,45 +1598,20 @@ async def duplicate_pages( # Create a new PDF with the first and last pages result = await client.duplicate_pages('document.pdf', [0, -1]) + + # Works with URLs + result = await client.duplicate_pages('https://example.com/doc.pdf', [0, -1]) ``` """ if not page_indices or len(page_indices) == 0: raise ValidationError("At least one page index is required for duplication") - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Normalize negative indices - normalized_indices = [] - for index in page_indices: - if index < 0: - # Handle negative indices (e.g., -1 is the last page) - normalized_indices.append(page_count + index) - else: - normalized_indices.append(index) - - # Validate that all page indices are within range - for i, original_index in enumerate(page_indices): - normalized_index = normalized_indices[i] - if normalized_index < 0 or normalized_index >= page_count: - raise ValidationError( - f"Page index {original_index} is out of range (document has {page_count} pages)" - ) - + # No client-side PDF validation - let server handle it + # Use negative indices directly - server interprets them builder = self.workflow() # Add each page in the order specified - for page_index in normalized_indices: - # Use normalize_page_params to ensure consistent handling + for page_index in page_indices: page_range = normalize_page_params({"start": page_index, "end": page_index}) 
part_options = cast("FilePartOptions", {"pages": page_range}) builder = builder.add_file_part(pdf, part_options) @@ -1644,8 +1625,11 @@ async def delete_pages( """Delete pages from a PDF document. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + Supports negative page indices (-1 = last page, -2 = second-to-last, etc.). + Args: - pdf: The PDF file to modify + pdf: The PDF file to modify (URLs supported) page_indices: Array of page indices to delete (0-based) Negative indices count from the end of the document (e.g., -1 is the last page) @@ -1662,54 +1646,29 @@ async def delete_pages( # Delete the first and last two pages result = await client.delete_pages('document.pdf', [0, -1, -2]) + + # Works with URLs + result = await client.delete_pages('https://example.com/doc.pdf', [0, -1]) ``` """ if not page_indices or len(page_indices) == 0: raise ValidationError("At least one page index is required for deletion") - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - - # Get the actual page count of the PDF - page_count = get_pdf_page_count(normalized_file[0]) - - # Normalize negative indices - normalized_indices = [] - for index in page_indices: - if index < 0: - # Handle negative indices (e.g., -1 is the last page) - normalized_indices.append(page_count + index) - else: - normalized_indices.append(index) - - # Remove duplicates and sort the deleteIndices - delete_indices = sorted(set(normalized_indices)) - - # Validate that all page indices are within range - for original_index in page_indices: - if original_index >= 0: - normalized_index = original_index - else: - normalized_index = page_count + original_index - - if normalized_index < 0 or normalized_index >= 
page_count: - raise ValidationError( - f"Page index {original_index} is out of range (document has {page_count} pages)" - ) + # No client-side PDF validation or page count computation + # Use negative indices directly - server interprets them + # Remove duplicates and sort the delete indices + delete_indices = sorted(set(page_indices)) builder = self.workflow() - # Group consecutive pages that should be kept into ranges - current_page = 0 + positive_deletes = [i for i in delete_indices if i >= 0] + negative_deletes = [i for i in delete_indices if i < 0] + page_ranges = [] + current_page = 0 - for delete_index in delete_indices: + # Build keep ranges for positive delete indices + for delete_index in positive_deletes: if current_page < delete_index: page_ranges.append( normalize_page_params( @@ -1718,11 +1677,33 @@ async def delete_pages( ) current_page = delete_index + 1 - if ( - current_page > 0 or (current_page == 0 and len(delete_indices) == 0) - ) and current_page < page_count: + if negative_deletes: + # Add keep range from current position up to just before the first negative delete + trailing_end = negative_deletes[0] - 1 # e.g. 
-1 -> -2, -2 -> -3 page_ranges.append( - normalize_page_params({"start": current_page, "end": page_count - 1}) + normalize_page_params({"start": current_page, "end": trailing_end}) + ) + # Add keep ranges between consecutive negative delete indices + for i in range(len(negative_deletes) - 1): + between_start = negative_deletes[i] + 1 + between_end = negative_deletes[i + 1] - 1 + if between_start <= between_end: + page_ranges.append( + normalize_page_params( + {"start": between_start, "end": between_end} + ) + ) + # Add keep range after the last negative delete, if it isn't the last page + if negative_deletes[-1] != -1: + page_ranges.append( + normalize_page_params( + {"start": negative_deletes[-1] + 1, "end": -1} + ) + ) + else: + # All deletes are positive: keep remaining pages to end of document + page_ranges.append( + normalize_page_params({"start": current_page, "end": -1}) ) if len(page_ranges) == 0: @@ -1743,8 +1724,11 @@ async def optimize( """Optimize a PDF document for size reduction. This is a convenience method that uses the workflow builder. + **Note**: URLs are passed to the server for secure server-side fetching. + PDF validation is performed server-side. 
+ Args: - pdf: The PDF file to optimize + pdf: The PDF file to optimize (URLs supported) options: Optimization options Returns: @@ -1757,17 +1741,12 @@ async def optimize( 'mrcCompression': True, 'imageOptimizationQuality': 2 }) + + # Works with URLs + result = await client.optimize('https://example.com/large.pdf') ``` """ - # Validate PDF - if is_remote_file_input(pdf): - normalized_file = await process_remote_file_input(str(pdf)) - else: - normalized_file = await process_file_input(pdf) - - if not is_valid_pdf(normalized_file[0]): - raise ValidationError("Invalid pdf file", {"input": pdf}) - + # No client-side PDF validation - let server handle it if options is None: options = {"imageOptimizationQuality": 2} diff --git a/src/nutrient_dws/http.py b/src/nutrient_dws/http.py index 6aae822..ee92942 100644 --- a/src/nutrient_dws/http.py +++ b/src/nutrient_dws/http.py @@ -14,7 +14,7 @@ NutrientError, ValidationError, ) -from nutrient_dws.inputs import FileInput, NormalizedFileData +from nutrient_dws.inputs import LocalFileInput, NormalizedFileData from nutrient_dws.types.account_info import AccountInfo from nutrient_dws.types.analyze_response import AnalyzeBuildResponse from nutrient_dws.types.build_instruction import BuildInstructions @@ -38,8 +38,8 @@ class AnalyzeBuildRequestData(TypedDict): class SignRequestOptions(TypedDict): - image: NotRequired[FileInput] - graphicImage: NotRequired[FileInput] + image: NotRequired[LocalFileInput] + graphicImage: NotRequired[LocalFileInput] class SignRequestData(TypedDict): diff --git a/src/nutrient_dws/inputs.py b/src/nutrient_dws/inputs.py index 5acb5cf..24cce38 100644 --- a/src/nutrient_dws/inputs.py +++ b/src/nutrient_dws/inputs.py @@ -1,15 +1,15 @@ import contextlib import io import os -import re from pathlib import Path from typing import BinaryIO, TypeGuard from urllib.parse import urlparse import aiofiles -import httpx -FileInput = str | Path | bytes | BinaryIO +LocalFileInput = Path | bytes | BinaryIO +UrlFileInput = str 
+FileInput = UrlFileInput | LocalFileInput NormalizedFileData = tuple[bytes, str] @@ -48,7 +48,9 @@ def is_remote_file_input(file_input: FileInput) -> TypeGuard[str]: return isinstance(file_input, str) and is_url(file_input) -async def process_file_input(file_input: FileInput) -> NormalizedFileData: +async def process_file_input( + file_input: FileInput, +) -> NormalizedFileData: """Convert various file input types to bytes. Args: @@ -140,25 +142,9 @@ async def process_file_input(file_input: FileInput) -> NormalizedFileData: raise ValueError(f"Unsupported file input type: {type(file_input)}") -async def process_remote_file_input(url: str) -> NormalizedFileData: - """Convert various file input types to bytes.""" - async with httpx.AsyncClient() as client: - response = await client.get(url) - # This will raise an exception for bad responses (4xx or 5xx status codes) - response.raise_for_status() - # The .content attribute holds the raw bytes of the response - file_bytes = response.content - - filename = "downloaded_file" - # Try to get filename from 'Content-Disposition' header first - header = response.headers.get("content-disposition") - if header: - # Use regex to find a filename in the header - match = re.search(r'filename="?([^"]+)"?', header) - if match: - filename = match.group(1) - - return file_bytes, filename +# process_remote_file_input() has been removed in v3.0.0 +# URLs are now passed to the server for secure server-side fetching +# This function was removed to prevent SSRF vulnerabilities def validate_file_input(file_input: FileInput) -> bool: @@ -179,45 +165,7 @@ def validate_file_input(file_input: FileInput) -> bool: return False -def get_pdf_page_count(pdf_bytes: bytes) -> int: - """Zero dependency way to get the number of pages in a PDF. - - Args: - pdf_bytes: PDF file bytes - - Returns: - Number of pages in a PDF. 
- """ - # Find all PDF objects - objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) - - # Get the Catalog Object - catalog_obj = None - for _obj_num, _gen_num, obj_data in objects: - if b"/Type" in obj_data and b"/Catalog" in obj_data: - catalog_obj = obj_data - break - - if not catalog_obj: - raise ValueError("Could not find /Catalog object in PDF.") - - # Extract /Pages reference (e.g. 3 0 R) - pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) - if not pages_ref_match: - raise ValueError("Could not find /Pages reference in /Catalog.") - pages_obj_num = pages_ref_match.group(1).decode() - pages_obj_gen = pages_ref_match.group(2).decode() - - # Step 3: Find the referenced /Pages object - pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() - pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) - if not pages_obj_match: - raise ValueError("Could not find root /Pages object.") - pages_obj_data = pages_obj_match.group(1) - - # Step 4: Extract /Count - count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) - if not count_match: - raise ValueError("Could not find /Count in root /Pages object.") - - return int(count_match.group(1)) +# get_pdf_page_count() has been removed in v3.0.0 +# The API natively supports negative indices (-1 = last page) +# Client-side PDF parsing is no longer needed +# This removes ~40 lines of code and improves security diff --git a/tests/test_integration.py b/tests/test_integration.py index 3e57352..922ebd4 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -306,7 +306,8 @@ async def test_invalid_api_key(self): @pytest.mark.asyncio async def test_network_timeout(self): """Test handling of network timeouts.""" - timeout_client = NutrientClient(api_key=os.getenv("NUTRIENT_API_KEY", ""), timeout=1) + # Use an extremely short timeout (1ms) to guarantee a timeout error + timeout_client = 
NutrientClient(api_key=os.getenv("NUTRIENT_API_KEY", ""), timeout=0.001) with pytest.raises(NutrientError): await timeout_client.convert(sample_docx, "pdf") diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index c3873b7..995c16a 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -737,7 +737,7 @@ async def test_password_protect_pdf_with_permissions( owner_password = "owner456" permissions = ["printing", "extract_accessibility"] - result = await unit_client.password_protect( + await unit_client.password_protect( file, user_password, owner_password, permissions ) @@ -871,22 +871,14 @@ async def test_delete_token(self, mock_send_request, valid_client_options, unit_ class TestNutrientClientFlatten: """Tests for NutrientClient flatten functionality.""" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_all_annotations( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening all annotations (no annotation_ids specified).""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -927,22 +919,14 @@ async def test_flatten_all_annotations( assert result["buffer"] == b"test-buffer" assert result["mimeType"] == "application/pdf" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_specific_annotations_by_string_ids( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - 
mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening specific annotations by string IDs.""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -977,22 +961,14 @@ async def test_flatten_specific_annotations_by_string_ids( # Verify the result assert result["buffer"] == b"test-buffer" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_specific_annotations_by_integer_ids( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening specific annotations by integer IDs.""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -1027,22 +1003,14 @@ async def test_flatten_specific_annotations_by_integer_ids( # Verify the result assert result["buffer"] == b"test-buffer" - @patch("nutrient_dws.client.is_remote_file_input", return_value=False) - @patch("nutrient_dws.client.process_file_input") - @patch("nutrient_dws.client.is_valid_pdf", return_value=True) @patch("nutrient_dws.client.StagedWorkflowBuilder") @pytest.mark.asyncio async def test_flatten_specific_annotations_by_mixed_ids( self, mock_staged_workflow_builder, - mock_is_valid_pdf, - mock_process_file_input, - mock_is_remote, unit_client, ): """Test flattening specific annotations with mixed string and integer IDs.""" - mock_process_file_input.return_value = (b"%PDF-test", "test.pdf") - # Setup mock workflow mock_workflow_instance = MagicMock() mock_output_stage = MagicMock() @@ -1234,3 +1202,111 @@ async def test_ocr_with_iso_language_codes( 
mock_workflow_instance.add_file_part.assert_called_once_with( file, None, [{"type": "ocr", "language": ["eng", "deu", "fra"]}] ) + + +class TestNutrientClientRotate: + """Tests for NutrientClient rotate functionality.""" + + def _make_mock_workflow(self, mock_staged_workflow_builder): + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + return mock_workflow_instance + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_rotate_with_negative_start_includes_prefix_pages( + self, mock_staged_workflow_builder, unit_client + ): + """Rotating last N pages with negative start must keep all preceding pages.""" + mock_workflow = self._make_mock_workflow(mock_staged_workflow_builder) + + await unit_client.rotate("document.pdf", 90, {"start": -3, "end": -1}) + + # Expect 2 add_file_part calls: prefix pages then rotated pages + assert mock_workflow.add_file_part.call_count == 2 + # First call: prefix pages before the rotation range (pages 0 to -4) + first_call_pages = mock_workflow.add_file_part.call_args_list[0][0][1] + assert first_call_pages == {"pages": {"start": 0, "end": -4}} + # Second call: the rotated range with the rotate action + second_call_pages = mock_workflow.add_file_part.call_args_list[1][0][1] + assert second_call_pages == {"pages": {"start": -3, "end": -1}} + + +class TestNutrientClientDeletePages: + """Tests for NutrientClient delete_pages functionality.""" + + def _make_mock_workflow(self, mock_staged_workflow_builder): + mock_workflow_instance = MagicMock() + mock_output_stage = MagicMock() + 
mock_output_stage.execute = AsyncMock( + return_value={ + "success": True, + "output": { + "buffer": b"test-buffer", + "mimeType": "application/pdf", + "filename": "output.pdf", + }, + } + ) + mock_workflow_instance.add_file_part.return_value = mock_workflow_instance + mock_workflow_instance.output_pdf.return_value = mock_output_stage + mock_staged_workflow_builder.return_value = mock_workflow_instance + return mock_workflow_instance + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_delete_last_page_using_negative_index( + self, mock_staged_workflow_builder, unit_client + ): + """delete_pages([-1]) must keep all pages except the last, not raise an error.""" + mock_workflow = self._make_mock_workflow(mock_staged_workflow_builder) + + result = await unit_client.delete_pages("document.pdf", [-1]) + + assert result["buffer"] == b"test-buffer" + assert mock_workflow.add_file_part.call_count == 1 + kept_pages = mock_workflow.add_file_part.call_args_list[0][0][1] + assert kept_pages == {"pages": {"start": 0, "end": -2}} + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_delete_multiple_pages_using_only_negative_indices( + self, mock_staged_workflow_builder, unit_client + ): + """delete_pages([-2, -1]) must keep all pages except the last two.""" + mock_workflow = self._make_mock_workflow(mock_staged_workflow_builder) + + result = await unit_client.delete_pages("document.pdf", [-2, -1]) + + assert result["buffer"] == b"test-buffer" + assert mock_workflow.add_file_part.call_count == 1 + kept_pages = mock_workflow.add_file_part.call_args_list[0][0][1] + assert kept_pages == {"pages": {"start": 0, "end": -3}} + + @patch("nutrient_dws.client.StagedWorkflowBuilder") + @pytest.mark.asyncio + async def test_delete_pages_with_mixed_positive_and_negative_indices( + self, mock_staged_workflow_builder, unit_client + ): + """delete_pages([0, -1]) must keep middle pages, not include the deleted 
last page.""" + mock_workflow = self._make_mock_workflow(mock_staged_workflow_builder) + + result = await unit_client.delete_pages("document.pdf", [0, -1]) + + assert result["buffer"] == b"test-buffer" + assert mock_workflow.add_file_part.call_count == 1 + kept_pages = mock_workflow.add_file_part.call_args_list[0][0][1] + assert kept_pages == {"pages": {"start": 1, "end": -2}} diff --git a/tests/unit/test_inputs.py b/tests/unit/test_inputs.py index e8f126f..05a901f 100644 --- a/tests/unit/test_inputs.py +++ b/tests/unit/test_inputs.py @@ -5,13 +5,12 @@ import pytest from nutrient_dws.inputs import ( - get_pdf_page_count, - is_remote_file_input, - is_valid_pdf, + is_remote_file_input, # Still used internally process_file_input, - process_remote_file_input, validate_file_input, FileInput, + LocalFileInput, + UrlFileInput, ) from tests.helpers import sample_pdf, TestDocumentGenerator @@ -170,126 +169,6 @@ async def test_throw_for_none(self): await process_file_input(None) -class TestProcessRemoteFileInput: - @pytest.mark.asyncio - async def test_process_url_string_input(self): - mock_response_data = b"test pdf content" - - with patch("httpx.AsyncClient") as mock_client: - mock_response = AsyncMock() - mock_response.content = mock_response_data - mock_response.headers = {} - mock_response.raise_for_status = Mock(return_value=None) - mock_client.return_value.__aenter__.return_value.get.return_value = ( - mock_response - ) - - result = await process_remote_file_input("https://example.com/test.pdf") - - assert result[0] == mock_response_data - assert result[1] == "downloaded_file" - - @pytest.mark.asyncio - async def test_process_url_with_content_disposition_header(self): - mock_response_data = b"test pdf content" - - with patch("httpx.AsyncClient") as mock_client: - mock_response = AsyncMock() - mock_response.content = mock_response_data - mock_response.headers = { - "content-disposition": 'attachment; filename="document.pdf"' - } - mock_response.raise_for_status = 
Mock(return_value=None) - mock_client.return_value.__aenter__.return_value.get.return_value = ( - mock_response - ) - - result = await process_remote_file_input("https://example.com/test.pdf") - - assert result[0] == mock_response_data - assert result[1] == "document.pdf" - - @pytest.mark.asyncio - async def test_throw_error_for_http_error(self): - with patch("httpx.AsyncClient") as mock_client: - mock_response = AsyncMock() - mock_response.raise_for_status = Mock(side_effect=Exception("HTTP 404")) - mock_client.return_value.__aenter__.return_value.get.return_value = ( - mock_response - ) - - with pytest.raises(Exception): - await process_remote_file_input("https://example.com/test.pdf") - - -class TestGetPdfPageCount: - def test_pdf_with_1_page(self): - pdf_bytes = TestDocumentGenerator.generate_simple_pdf_content("Text") - result = get_pdf_page_count(pdf_bytes) - assert result == 1 - - def test_pdf_with_6_pages(self): - result = get_pdf_page_count(sample_pdf) - assert result == 6 - - def test_throw_for_invalid_pdf_no_objects(self): - invalid_pdf = b"%PDF-1.4\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Catalog object"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_invalid_pdf_no_catalog(self): - invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /NotCatalog >>\nendobj\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Catalog object"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_catalog_without_pages_reference(self): - invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Pages reference"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_missing_pages_object(self): - invalid_pdf = ( - b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n%%EOF" - ) - - with pytest.raises(ValueError, match="Could not find root /Pages object"): - get_pdf_page_count(invalid_pdf) - - def test_throw_for_pages_object_without_count(self): - 
invalid_pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages >>\nendobj\n%%EOF" - - with pytest.raises(ValueError, match="Could not find /Count"): - get_pdf_page_count(invalid_pdf) - - -class TestIsValidPdf: - def test_return_true_for_valid_pdf_files(self): - # Test with generated PDF - valid_pdf_bytes = TestDocumentGenerator.generate_simple_pdf_content( - "Test content" - ) - result = is_valid_pdf(valid_pdf_bytes) - assert result is True - - # Test with sample PDF - result = is_valid_pdf(sample_pdf) - assert result is True - - def test_return_false_for_non_pdf_files(self): - # Test with non-PDF bytes - non_pdf_bytes = b"This is not a PDF file" - result = is_valid_pdf(non_pdf_bytes) - assert result is False - - def test_return_false_for_partial_pdf_header(self): - # Test with partial PDF header - partial_pdf = b"%PD" - result = is_valid_pdf(partial_pdf) - assert result is False - - def test_return_false_for_empty_bytes(self): - result = is_valid_pdf(b"") - assert result is False +# Tests for process_remote_file_input, get_pdf_page_count, and is_valid_pdf removed in v3.0.0 +# These functions were removed from the public API for security reasons (SSRF protection) +# and to eliminate client-side PDF parsing (leveraging server-side negative index support)