From 72c356fc94498b67f5ed92718212ae7e89debf1c Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Thu, 16 Apr 2026 23:44:11 +0100 Subject: [PATCH 1/4] updates --- docs/API.md | 452 +++++++++++++++--- docs/Configuration/Configuration.md | 20 + docs/Configuration/Diagnostics.md | 173 +++++++ .../LibreOffice-Parallel-Processing.md | 191 ++++++++ docs/Configuration/Process-Limits.md | 2 + docs/FAQ.md | 11 +- docs/Server-Admin-Onboarding.md | 254 ++++++---- 7 files changed, 927 insertions(+), 176 deletions(-) create mode 100644 docs/Configuration/Diagnostics.md create mode 100644 docs/Configuration/LibreOffice-Parallel-Processing.md diff --git a/docs/API.md b/docs/API.md index 9bdcea06..51bb6bb7 100644 --- a/docs/API.md +++ b/docs/API.md @@ -2,7 +2,7 @@ sidebar_position: 7 id: API title: API -description: Overview of API offering in S-PDF +description: REST API documentation, authentication, and usage examples for Stirling PDF tags: - API --- @@ -11,99 +11,419 @@ import TabItem from '@theme/TabItem'; # Stirling PDF API -Stirling PDF exposes a simple API for easy integration with external scripts. You can access the API documentation in two ways: +Stirling PDF provides a REST API for all PDF operations available in the web UI. The API uses multipart form data for file uploads and returns processed files directly in the response body. -1. Local Swagger UI at `/swagger-ui/index.html` on your Stirling PDF instance -2. Online [Swagger Documentation](https://app.swaggerhub.com/apis-docs/Frooodle/Stirling-PDF/) +--- + +## API Documentation + +Every Stirling PDF instance hosts its own interactive API documentation that exactly matches the installed version. + +### Local Swagger UI (Recommended) + +Navigate to your instance's Swagger UI to browse, test, and experiment with all endpoints directly in your browser: + +``` +http://<your-instance>/swagger-ui/index.html +``` + +The path `/swagger-ui.html` also works as a redirect. 
For example: `http://localhost:8080/swagger-ui/index.html` + +The Swagger UI lets you fill in parameters and execute requests against your running instance, which is the fastest way to learn how each endpoint works. You can also reach it from the Settings menu (gear icon in the top-right corner). -You can also access the documentation through the settings menu (gear icon in the top-right corner). +### OpenAPI Specification (Machine-Readable) -## Accessing API Documentation +The raw OpenAPI 3.0 JSON specification is available at: -### Local Swagger UI -Your Stirling PDF instance includes built-in API documentation: -1. Navigate to `http://your-instance:port/swagger-ui/index.html` -2. Or append `/swagger-ui/index.html` to your Stirling PDF URL -3. This provides an interactive documentation interface where you can: - - View all available endpoints - - Test API calls directly - - See request/response schemas - - View authentication requirements +``` +http://<your-instance>/v1/api-docs +``` + +You can import this into tools like Postman or Insomnia, or use it to generate client libraries in any language. -### Settings Menu Access -1. Click the gear icon (⚙️) in the top-right corner -2. Look for the "API Documentation" or "API" link -3. This will take you to the local Swagger UI +:::caution Always Use Your Local Swagger UI +Always reference the Swagger UI on your own instance rather than external API documentation links. The endpoints and parameters may differ between versions, and your local Swagger UI is always accurate for your installed version. +::: + +--- ## API Authentication -When security is enabled, all API requests require authentication. There are two ways to handle API authentication: +When security is enabled, all API requests require authentication via the `X-API-KEY` header. ### User-Specific API Keys -1. Obtain your API key: - - Log into Stirling PDF - - Go to Account Settings (via the gear icon) - - Find your API key in the account details + +1. 
Log into Stirling PDF +2. Go to Account Settings (via the gear icon) +3. Find your API key in the account details ### Global API Key -You can set a custom global API key using the environment variable: + +You can set a custom global API key via environment variable: + ```bash SECURITY_CUSTOMGLOBALAPIKEY=your-custom-api-key ``` + This allows you to set a single API key that works regardless of user authentication. -2. Include the API key in all requests: - ```http - X-API-KEY: your-api-key-here - ``` +### Using the API Key + +Include the API key in every request using the `X-API-KEY` header: -3. Example authenticated request: - ```bash - curl -X POST "http://localhost:8080/add-watermark" \ - -H "X-API-KEY: your-api-key-here" \ - -H "Content-Type: multipart/form-data" \ - ... - ``` +```bash +curl -H "X-API-KEY: your-api-key-here" ... +``` + +If login/security is not enabled, the API endpoints are accessible without authentication. + +--- + +## Basic Request Pattern + +All PDF processing endpoints follow the same pattern: + +1. Send a `POST` request with `Content-Type: multipart/form-data` +2. Attach the PDF file as `fileInput` +3. Include operation-specific parameters as form fields +4. Receive the processed PDF (or other output format) in the response body + +Successful responses return the processed file directly with appropriate content headers (`Content-Type: application/pdf` and `Content-Disposition`). Error responses return JSON with details about what went wrong. + +--- ## API Limitations -Stirling PDF's feature set is not entirely confined to the backend, hence not all functionalities are accessible via the API. Certain operations, such as the "view-pdf" or "visually sign", are executed exclusively on the front-end, and as such, they are only available through the Web-UI. If you encounter a situation where some API endpoints appear to be absent, it is likely attributable to these front-end exclusive features. 
+Not all Stirling PDF features are available through the API. Some operations (such as "view-pdf" or "visually sign") run exclusively on the front end and are only available through the Web UI. If you find that some API endpoints appear to be missing, this is likely the reason. + +Stirling PDF also provides health and statistics endpoints for integration with monitoring and dashboard applications. + +--- + +## Example API Requests + +### Merge Multiple PDFs + + + + ```bash + curl -X POST "http://localhost:8080/api/v1/general/merge-pdfs" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@file1.pdf" \ + -F "fileInput=@file2.pdf" \ + -F "sortType=orderProvided" \ + > merged_output.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/general/merge-pdfs" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@file1.pdf" ^ + -F "fileInput=@file2.pdf" ^ + -F "sortType=orderProvided" ^ + > merged_output.pdf + ``` + + + +### Split a PDF by Pages + + + + ```bash + curl -X POST "http://localhost:8080/api/v1/general/split-pages" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@document.pdf" \ + -F "pageNumbers=1,3,5-10" \ + > split_output.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/general/split-pages" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@document.pdf" ^ + -F "pageNumbers=1,3,5-10" ^ + > split_output.pdf + ``` + + + +### Convert Office Document to PDF + + + + ```bash + curl -X POST "http://localhost:8080/api/v1/convert/file/pdf" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@document.docx" \ + > converted.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/convert/file/pdf" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@document.docx" ^ + > converted.pdf + ``` + + + +### Add Watermark + + + + ```bash + curl -X POST "http://localhost:8080/api/v1/security/add-watermark" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@document.pdf" \ + -F "watermarkType=text" \ + -F "watermarkText=CONFIDENTIAL" 
\ + -F "fontSize=30" \ + -F "rotation=45" \ + -F "opacity=0.5" \ + > watermarked.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/security/add-watermark" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@document.pdf" ^ + -F "watermarkType=text" ^ + -F "watermarkText=CONFIDENTIAL" ^ + -F "fontSize=30" ^ + -F "rotation=45" ^ + -F "opacity=0.5" ^ + > watermarked.pdf + ``` + + + +### OCR a Scanned PDF -Stirling PDF also has statistic and health endpoints to integrate with monitoring/dashboard applications. + + + ```bash + curl -X POST "http://localhost:8080/api/v1/misc/ocr-pdf" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@scanned_document.pdf" \ + -F "languages=eng" \ + -F "ocrType=force-ocr" \ + > searchable_document.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/misc/ocr-pdf" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@scanned_document.pdf" ^ + -F "languages=eng" ^ + -F "ocrType=force-ocr" ^ + > searchable_document.pdf + ``` + + + +### Compress a PDF + + + + ```bash + curl -X POST "http://localhost:8080/api/v1/general/optimize-pdf" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@large_document.pdf" \ + -F "optimizeLevel=2" \ + > compressed_document.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/general/optimize-pdf" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@large_document.pdf" ^ + -F "optimizeLevel=2" ^ + > compressed_document.pdf + ``` + + + +--- + +## Password Protection & Permissions + +The add-password endpoint uses **prevent** flags (not allow flags). To restrict specific actions, set the corresponding `prevent*` parameter to `true`. 
+ + + + ```bash + curl -X POST "http://localhost:8080/api/v1/security/add-password" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@document.pdf" \ + -F "ownerPassword=OwnerPass123" \ + -F "keyLength=256" \ + -F "preventPrinting=false" \ + -F "preventModify=true" \ + -F "preventAssembly=true" \ + -F "preventExtractContent=true" \ + -F "preventExtractForAccessibility=false" \ + -F "preventFillInForm=false" \ + -F "preventModifyAnnotations=true" \ + -F "preventPrintingFaithful=false" \ + > protected_file.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/security/add-password" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@document.pdf" ^ + -F "ownerPassword=OwnerPass123" ^ + -F "keyLength=256" ^ + -F "preventPrinting=false" ^ + -F "preventModify=true" ^ + -F "preventAssembly=true" ^ + -F "preventExtractContent=true" ^ + -F "preventExtractForAccessibility=false" ^ + -F "preventFillInForm=false" ^ + -F "preventModifyAnnotations=true" ^ + -F "preventPrintingFaithful=false" ^ + > protected_file.pdf + ``` + + + +### Permission Flags Reference + +| Flag | What it prevents when `true` | +|---|---| +| `preventPrinting` | Standard quality printing | +| `preventPrintingFaithful` | High fidelity printing | +| `preventModify` | Modifying document content | +| `preventModifyAnnotations` | Modifying annotations and comments | +| `preventAssembly` | Assembling the document (merge, rearrange pages) | +| `preventExtractContent` | Copying text and graphics | +| `preventExtractForAccessibility` | Extracting content for accessibility (screen readers) | +| `preventFillInForm` | Filling in form fields | + +Supported key lengths: `128` (AES-128) and `256` (AES-256). 
+ +### Remove Password + + + + ```bash + curl -X POST "http://localhost:8080/api/v1/security/remove-password" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@protected_file.pdf" \ + -F "password=CurrentPassword123" \ + > unlocked_file.pdf + ``` + + + ```bash + curl -X POST "http://localhost:8080/api/v1/security/remove-password" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@protected_file.pdf" ^ + -F "password=CurrentPassword123" ^ + > unlocked_file.pdf + ``` + + + +--- -## Example CURL Commands +## PDF-to-CSV and PDF-to-XLSX Conversion + +These endpoints extract tabular data from PDF files: ```bash - curl -X POST "http://localhost:8080/add-watermark" \ - -H "Content-Type: multipart/form-data" \ - -F "fileInput=@/Users/username/Downloads/sample-1_cropped.pdf" \ - -F "watermarkType=text" \ - -F "watermarkText=YOUR_WATERMARK_TEXT" \ - -F "alphabet=roman" \ - -F "fontSize=30" \ - -F "rotation=0" \ - -F "opacity=0.5" \ - -F "widthSpacer=50" \ - -F "heightSpacer=50" \ - > "/Users/username/Downloads/output.pdf" + # PDF to CSV + curl -X POST "http://localhost:8080/api/v1/convert/pdf/csv" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@table_document.pdf" \ + -F "pageNumbers=all" \ + > extracted_tables.csv + + # PDF to Excel + curl -X POST "http://localhost:8080/api/v1/convert/pdf/xlsx" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@table_document.pdf" \ + > extracted_tables.xlsx ``` ```bash - curl -X POST "http://localhost:8080/add-watermark" ^ - -H "Content-Type: multipart/form-data" ^ - -F "fileInput=@C:\Users\systo\Downloads\sample-1_cropped.pdf" ^ - -F "watermarkType=text" ^ - -F "watermarkText=YOUR_WATERMARK_TEXT" ^ - -F "alphabet=roman" ^ - -F "fontSize=30" ^ - -F "rotation=0" ^ - -F "opacity=0.5" ^ - -F "widthSpacer=50" ^ - -F "heightSpacer=50" ^ - > "C:\Users\systo\Downloads\output.pdf" - ``` - - \ No newline at end of file + REM PDF to CSV + curl -X POST "http://localhost:8080/api/v1/convert/pdf/csv" ^ + -H "X-API-KEY: your-api-key" ^ + -F 
"fileInput=@table_document.pdf" ^ + -F "pageNumbers=all" ^ + > extracted_tables.csv + + REM PDF to Excel + curl -X POST "http://localhost:8080/api/v1/convert/pdf/xlsx" ^ + -H "X-API-KEY: your-api-key" ^ + -F "fileInput=@table_document.pdf" ^ + > extracted_tables.xlsx + ``` + + + +:::danger Empty or 0-byte Output? +These endpoints extract text-based tables from PDFs. The text in your PDF must be selectable (not scanned images). If you get empty or 0-byte output, the PDF likely contains image-based content. Run OCR on the document first using the `/api/v1/misc/ocr-pdf` endpoint to make the text extractable, then retry the conversion. See [OCR Configuration](./Configuration/OCR.md) for language pack setup. +::: + +--- + +## Response Handling + +Always check the HTTP status code before processing the response body: + +```bash +response=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/api/v1/general/merge-pdfs" \ + -H "X-API-KEY: your-api-key" \ + -F "fileInput=@file1.pdf" \ + -F "fileInput=@file2.pdf") + +http_code=$(echo "$response" | tail -1) +if [ "$http_code" -ne 200 ]; then + echo "Error: HTTP $http_code" + echo "$response" | head -n -1 +fi +``` + +--- + +## Rate Limits and Timeouts + +The default async request timeout is **20 minutes** (1,200,000 ms). For very large files or complex operations, this can be adjusted: + +```bash +SYSTEM_CONNECTIONTIMEOUTMILLISECONDS=1800000 # 30 minutes +``` + +Process-specific timeouts (LibreOffice, Tesseract, etc.) are configured separately — see the [Process Limits](./Configuration/Process-Limits.md) documentation. 
+ +--- + +## Health and Monitoring Endpoints + +| Endpoint | Purpose | +|---|---| +| `/api/v1/info/status` | Application status (used by Docker health checks) | +| `/api/v1/info/health` | Detailed health information | +| `/actuator/health` | Spring Boot Actuator health endpoint | +| `/actuator/prometheus` | Prometheus-compatible metrics export | + +--- + +## Related + +- [Process Limits](./Configuration/Process-Limits.md) — Configure timeouts and concurrency for external tools +- [Production Deployment Guide](./Server-Admin-Onboarding.md) — Sizing and scaling recommendations +- [Diagnostics](./Configuration/Diagnostics.md) — Troubleshooting and reporting issues diff --git a/docs/Configuration/Configuration.md b/docs/Configuration/Configuration.md index aeb94f11..a852f558 100644 --- a/docs/Configuration/Configuration.md +++ b/docs/Configuration/Configuration.md @@ -231,6 +231,25 @@ For advanced features and specific use cases, see these detailed guides: --- +### Performance & Scaling + +**[Process Limits](./Process-Limits.md)** +- Session limits and timeouts for external tools + +**[LibreOffice Parallel Processing](./LibreOffice-Parallel-Processing.md)** +- Configure multiple LibreOffice instances for faster document conversion +- Local UNO server pool and remote UNO server endpoints + +--- + +### Diagnostics & Support + +**[Diagnostics & Reporting Issues](./Diagnostics.md)** +- Built-in diagnostics tool for Docker containers +- How to report issues via GitHub, Discord, and email + +--- + ### Other Configuration **[Folder Scanning](./FolderScanning.md)** @@ -303,3 +322,4 @@ If missing: - **Production Deployment:** See [Production Deployment Guide](../Server-Admin-Onboarding.md) - **API Usage:** See [API Documentation](../API.md) - **Tool Reference:** See [Functionality](../Functionality/Functionality.md) +- **Troubleshooting:** See [Diagnostics & Reporting Issues](./Diagnostics.md) diff --git a/docs/Configuration/Diagnostics.md b/docs/Configuration/Diagnostics.md 
new file mode 100644 index 00000000..03c5d7ef --- /dev/null +++ b/docs/Configuration/Diagnostics.md @@ -0,0 +1,173 @@ +--- +sidebar_position: 21 +id: Diagnostics +title: Diagnostics & Reporting Issues +description: Use the built-in diagnostics tool and learn how to report issues effectively +tags: + - Diagnostics + - Troubleshooting + - Support +--- + +# Diagnostics & Reporting Issues + +Stirling PDF includes a built-in diagnostics tool inside Docker containers that collects logs, configuration, system information, and application metrics into a single archive. This is the fastest way to gather the information needed when troubleshooting or reporting issues. + +--- + +## Running the Diagnostics Tool + +Open an interactive shell inside the running container and invoke the tool: + +```bash +docker exec -it <container-name> diag +``` + +The following aliases all work identically: `diag`, `debug`, `diagnostic`, `diagnostics`, `stirling-diagnostics`. + +:::caution Interactive Terminal Required +The diagnostics tool requires an interactive terminal (`-it` flag). It will not run in non-interactive or headless sessions. +::: + +--- + +## Collection Modes + +When you run the tool, you'll be prompted to choose a collection mode. + +### Auto Mode (Recommended) + +Select option **1** when prompted. Auto mode collects: + +- Application logs from the last 24 hours +- Configuration files from `/configs` +- System information (OS, CPU, memory, disk, Java version) +- Application metrics and health endpoints + +This is sufficient for most issue reports. 
+ +### Custom Mode + +Select option **2** for granular control over what gets collected: + +| Prompt | Default | What It Collects | +|---|---|---| +| Output directory | `/configs` | Where to save the archive | +| Days of logs | 1 | How many days of logs to include | +| Include /configs | Yes | Configuration files | +| Include /customFiles | No | Custom files (excluding PDFs and images) | +| Include /pipeline | No | Pipeline working files (excluding PDFs) | +| Include /tmp/stirling-pdf | No | Temporary processing files | +| Include system information | Yes | OS, CPU, RAM, disk, Java/Python versions | +| Include environment variables | No | Full environment dump | +| Fetch metrics endpoints | Yes | Application status, health, and load data | +| Include UI data endpoints | No | Sign, pipeline, and OCR endpoint data | +| Redact sensitive information | Yes | Apply redaction filters (see below) | + +### Redaction Options + +When redaction is enabled, you can selectively mask: + +- **Secrets/tokens/passwords** - Redacts Authorization headers, API keys, passwords, and similar credentials +- **URL hosts/domains** - Masks hostnames in URLs +- **Email addresses** - Replaces email addresses with `[REDACTED_EMAIL]` +- **Host/Domain/Server fields** - Masks values in host-related configuration fields + +:::caution +Always enable redaction if you plan to share the diagnostics bundle publicly (for example, in a GitHub issue). However, redaction is not perfect and may miss some sensitive values - always review the output manually before sharing publicly. You can disable redaction for private support channels if full detail is needed. +::: + +--- + +## What Gets Collected + +The diagnostics bundle is a `.tar.gz` archive saved to the output directory (default: `/configs`). 
It contains: + +``` +stirling-diagnostics-YYYYMMDD-HHMMSS.tar.gz +├── summary.txt # Collection metadata and settings +├── bundle/ +│ ├── logs/ # Application log files +│ ├── configs/ # Configuration files (settings.yml, etc.) +│ ├── system/ # System information +│ │ ├── uname.txt # Kernel version +│ │ ├── os-release # OS distribution info +│ │ ├── meminfo.txt # Memory details +│ │ ├── cpuinfo.txt # CPU details +│ │ ├── df.txt # Disk usage +│ │ ├── free.txt # Memory summary +│ │ ├── ps.txt # Running processes +│ │ ├── java-version.txt # Java runtime version +│ │ └── python-version.txt # Python version +│ ├── metrics/ # Application metrics +│ │ ├── api/v1/info/status.json +│ │ ├── api/v1/info/uptime.json +│ │ ├── api/v1/info/health.json +│ │ ├── api/v1/info/requests.json +│ │ ├── api/v1/info/load.json +│ │ ├── actuator/health.json +│ │ └── actuator/prometheus.txt +│ ├── env/ # Environment variables (if requested) +│ └── tree/ # Directory listings +│ ├── logs.txt +│ ├── configs.txt +│ ├── customFiles.txt +│ ├── pipeline.txt +│ ├── tessdata.txt # Installed OCR language packs +│ └── tessdata-mount.txt +``` + +PDFs, images, and compressed archives are always excluded from collection. + +### Retrieving the Bundle + +After the tool finishes, copy the archive out of the container: + +```bash +docker cp <container-name>:/configs/stirling-diagnostics-*.tar.gz ./ +``` + +--- + +## AOT Diagnostics + +If you are running with AOT (Ahead-of-Time) compilation enabled (`STIRLING_AOT_ENABLE=true`), an additional diagnostics tool is available: + +```bash +docker exec -it <container-name> aot-diag +``` + +This tool diagnoses AOT cache generation failures, particularly on ARM64/aarch64 platforms. It checks cache integrity, JVM compatibility, and can run smoke tests. + +Aliases: `aot-diag`, `aot-diagnostics` + +--- + +## How to Report Issues + +When you encounter a problem with Stirling PDF, choose the right channel depending on the nature of your issue. 
+ +### GitHub Issues - Bug Reports & Feature Requests + +For reproducible bugs and feature requests, open an issue at: +**https://github.com/Stirling-Tools/Stirling-PDF/issues** + +The repository includes issue templates for bug reports and feature requests that will guide you through providing the right information. + +When submitting a bug report, include as much detail as possible: the diagnostics bundle (run `diag` in your container first), steps to reproduce the issue, expected vs. actual behavior, your deployment method (Docker, bare metal, Kubernetes), Stirling PDF version (visible in the UI footer or in `summary.txt` from the diagnostics bundle), and any commands, API requests, or actions you were performing when the issue occurred. The more context you provide, the faster it can be resolved. + +### Discord Community - Questions & Discussion + +For quick questions, troubleshooting help, and community discussion: +**https://discord.gg/HYmhKj45pU** + +Discord is the best place for configuration help, setup questions, sharing workarounds with other users, general discussion about features and usage, and getting faster informal feedback before filing a formal issue. It's also great for following up on GitHub issues and having deeper conversations with the community. + +### Email Support + +For enterprise customers and licensing inquiries: +**support@stirlingpdf.com** + +For security vulnerabilities: +**security@stirlingpdf.com** or use the [GitHub Security Advisory](https://github.com/Stirling-Tools/Stirling-PDF/security) process. 
+ diff --git a/docs/Configuration/LibreOffice-Parallel-Processing.md b/docs/Configuration/LibreOffice-Parallel-Processing.md new file mode 100644 index 00000000..653e282a --- /dev/null +++ b/docs/Configuration/LibreOffice-Parallel-Processing.md @@ -0,0 +1,191 @@ +--- +sidebar_position: 22 +id: LibreOffice-Parallel-Processing +title: LibreOffice Parallel Processing +description: Configure multiple LibreOffice instances for parallel document conversion +tags: + - LibreOffice + - Performance + - Scaling + - UNO Server +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# LibreOffice Parallel Processing + +Stirling PDF uses LibreOffice for converting office documents (DOCX, XLSX, PPTX, etc.) to PDF and other formats. LibreOffice processes each conversion in a single thread, meaning one conversion uses one CPU core at 100% regardless of how many cores are available. To process multiple conversions at the same time, you need to run multiple LibreOffice instances. + +--- + +## Local UNO Server Pool + +By default, Stirling PDF manages a local pool of UNO (Universal Network Objects) server instances. The number of instances is controlled by the `libreOfficeSessionLimit` setting. + + + + ```yaml + processExecutor: + autoUnoServer: true + sessionLimit: + libreOfficeSessionLimit: 4 # Run 4 LibreOffice instances + ``` + + + ```bash + PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT=4 + ``` + + + ```yaml + services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 4 + ``` + + + +Each additional instance consumes approximately **50 MB of RAM when idle** and significantly more during active conversion. A reasonable starting point is one instance per 2 CPU cores, adjusted based on available RAM and expected workload. + +:::info +The default `libreOfficeSessionLimit` is `1`, meaning only one conversion runs at a time. 
If you see conversions queuing up or running slowly, increasing this is the first thing to try. +::: + +--- + +## Remote UNO Server Endpoints + +For larger deployments or when you want to isolate LibreOffice from the main application, you can run UNO servers as separate containers and configure Stirling PDF to connect to them remotely. + +Set `autoUnoServer` to `false` and define your remote endpoints: + + + + ```yaml + processExecutor: + autoUnoServer: false + unoServerEndpoints: + - host: "unoserver1" + port: 2003 + hostLocation: "remote" + protocol: "http" + - host: "unoserver2" + port: 2003 + hostLocation: "remote" + protocol: "http" + - host: "unoserver3" + port: 2003 + hostLocation: "remote" + protocol: "http" + ``` + + + ```bash + PROCESS_EXECUTOR_AUTO_UNO_SERVER=false + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_HOST=unoserver1 + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_PORT=2003 + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_HOST_LOCATION=remote + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_PROTOCOL=http + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_HOST=unoserver2 + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_PORT=2003 + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_HOST_LOCATION=remote + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_PROTOCOL=http + ``` + + + ```yaml + services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + PROCESS_EXECUTOR_AUTO_UNO_SERVER: "false" + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_HOST: "unoserver1" + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_PORT: "2003" + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_0_HOST_LOCATION: "remote" + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_HOST: "unoserver2" + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_PORT: "2003" + PROCESS_EXECUTOR_UNO_SERVER_ENDPOINTS_1_HOST_LOCATION: "remote" + ports: + - "8080:8080" + + unoserver1: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf-unoserver:latest + ports: + - "2003:2003" + + unoserver2: + image: 
docker.stirlingpdf.com/stirlingtools/stirling-pdf-unoserver:latest + ports: + - "2004:2003" + ``` + + + +To add more endpoints, add additional entries to the `unoServerEndpoints` list in settings.yml, or for environment variables, increment the index number (e.g. `_0_` for the first, `_1_` for the second, `_2_` for the third, and so on). + +### Endpoint Configuration + +The `host` field accepts a Docker service name (e.g. `unoserver1`), a DNS hostname (e.g. `uno.internal.example.com`), or an IP address (e.g. `192.168.1.50`). The default is `127.0.0.1`. + +The `hostLocation` setting controls how files are transferred between Stirling PDF and the UNO server: + +| Value | When to Use | How It Works | +|---|---|---| +| `auto` | Default, detects automatically | Checks if the host is local or remote | +| `local` | UNO server is on the same machine | Files are passed via filesystem paths (fastest) | +| `remote` | UNO server is a separate container or machine | Files are transferred over HTTP | + +:::caution +Use `remote` when running UNO servers in separate Docker containers, even if the containers are on the same host machine. The containers don't share a filesystem, so `local` will not work. +::: + +--- + +## Running UNO Servers Without Docker + +If you are running Stirling PDF without Docker (bare metal or systemd), you can start additional UNO server instances manually using the `unoserver` Python package: + +```bash +# Install unoserver (included in Docker images) +pip install unoserver + +# Start instances on different ports +unoserver --port 2003 & +unoserver --port 2004 & +unoserver --port 2005 & +``` + +Then configure Stirling PDF to connect to these instances at `127.0.0.1` on the respective ports with `hostLocation: "local"`. + +--- + +## Timeout Configuration + +LibreOffice conversion has a default timeout of **30 minutes**. 
For very large or complex documents, you may need to increase this: + + + + ```yaml + processExecutor: + timeoutMinutes: + libreOfficetimeoutMinutes: 60 + ``` + + + ```bash + PROCESS_EXECUTOR_TIMEOUT_MINUTES_LIBRE_OFFICETIMEOUT_MINUTES=60 + ``` + + + +If conversions are consistently timing out, this usually indicates the system is under-resourced rather than needing a longer timeout. Check CPU and memory usage first. + +--- + +## Related + +- [Process Limits](./Process-Limits.md) — Configure session limits and timeouts for all external tools +- [Production Deployment Guide](../Server-Admin-Onboarding.md) — Sizing recommendations for different workloads +- [Diagnostics](./Diagnostics.md) — Collect system and application diagnostics for troubleshooting diff --git a/docs/Configuration/Process-Limits.md b/docs/Configuration/Process-Limits.md index 066ea1be..911be42b 100644 --- a/docs/Configuration/Process-Limits.md +++ b/docs/Configuration/Process-Limits.md @@ -41,6 +41,8 @@ Controls how many concurrent instances of each process are allowed. Extra reques **Increase** limits on a beefy server with concurrent users. **Decrease** them on low-RAM servers - LibreOffice in particular is memory-hungry. +For LibreOffice specifically, you can also scale by running multiple remote UNO server instances — see [LibreOffice Parallel Processing](./LibreOffice-Parallel-Processing.md) for details. + :::info Be mindful of memory and CPU usage when raising session limits. Each concurrent process consumes resources, and setting limits too high can starve the host or cause out-of-memory issues possibly killing the instance. Start with the defaults and increase gradually while monitoring your server. ::: diff --git a/docs/FAQ.md b/docs/FAQ.md index 0633c4f3..ac966f41 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -21,14 +21,17 @@ All feedback and suggestions are appreciated. 
It is best to submit these via a G You can also reach out in discord but without a ticket to track it the request can often get lost! ### Q5: I found a bug in Stirling PDF. Where can I report it? -Please report any bugs or issues you encounter through our GitHub Issues page. Be sure to include as much detail as possible so we can diagnose and resolve the issue quickly. +Please report any bugs or issues you encounter through our [GitHub Issues page](https://github.com/Stirling-Tools/Stirling-PDF/issues). Be sure to include as much detail as possible so we can diagnose and resolve the issue quickly. If you're running Docker, use the built-in [diagnostics tool](./Configuration/Diagnostics.md) to collect logs, configuration, and system information into a shareable archive. -### Q6: My Stirling PDF Using high RAM at idle, How can I optimize memory usage? +### Q6: My Stirling PDF is using high RAM at idle. How can I optimize memory usage? Stirling PDF's memory usage can be optimized in several ways: -Disable additional features: Set DISABLE_ADDITIONAL_FEATURES=true to reduce RAM consumption. This can significantly lower memory usage, especially during idle periods. +- **Disable additional features:** Set `DISABLE_ADDITIONAL_FEATURES=true` to reduce RAM consumption. This can significantly lower memory usage, especially during idle periods. +- **Use the Ultra Lite version:** Pull the `latest-ultra-lite` tag from Docker Hub or GitHub, which is specifically designed for lower-end hardware. +- **Tune the JVM heap:** Set `JAVA_TOOL_OPTIONS="-Xms256m -Xmx1g"` to limit the maximum heap size. By default, the JVM may allocate up to 25% of container memory as heap. +- **Reduce LibreOffice instances:** Each idle LibreOffice UNO server instance uses approximately 50 MB. The default session limit is 1. See [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) for details. 
-Use the Ultra Lite version: Pull the latest-ultra-lite tag from Docker Hub or GitHub, which is specifically designed for lower-end hardware. +For detailed sizing recommendations, see [Production Deployment Guide — Performance Optimization](./Server-Admin-Onboarding.md#step-9-performance-optimization--sizing). ### Q7: I'm experiencing connection errors when pulling from docker.stirling.com diff --git a/docs/Server-Admin-Onboarding.md b/docs/Server-Admin-Onboarding.md index 7c40d816..0c3c25e8 100644 --- a/docs/Server-Admin-Onboarding.md +++ b/docs/Server-Admin-Onboarding.md @@ -1317,116 +1317,114 @@ find backups/ -name "stirling-data-*.tar.gz" -mtime +30 -delete --- -## Step 9: Performance Optimization +## Step 9: Performance Optimization & Sizing -Optimize Stirling-PDF for your workload and server capacity. +Understanding how Stirling PDF uses resources is essential for sizing your deployment correctly. PDF processing is memory-intensive — a single large PDF can expand to many times its file size in memory during processing. -### 9.1: Resource Allocation +### 9.1: How Stirling PDF Uses Memory + +Stirling PDF loads PDFs into memory using a tiered strategy based on file size: + +| File Size | Strategy | Memory Impact | +|---|---|---| +| Up to 10 MB | Loaded entirely into JVM heap as byte array | Fast, but consumes heap proportional to file size | +| 10 MB to 50 MB | Mixed mode — 10 MB budget in heap, remainder file-backed | Moderate heap usage with disk spillover | +| Over 50 MB | Fully file-backed (scratch space on disk) | Minimal heap, but requires adequate temp disk space | + +The application also monitors heap pressure. If free heap drops below **30% of total heap** or below **256 MB absolute**, all operations are forced into file-backed mode regardless of file size. + +The maximum number of simultaneous PDF operations is bounded by a semaphore based on your CPU count: `max(4, available CPU cores)`. 
Each concurrent operation may hold a document in memory, so peak memory usage scales with both file sizes and concurrency. + +:::caution Memory-Intensive Operations +A 50 MB PDF with complex vector graphics, embedded fonts, and many pages can expand to 200–500 MB in memory during processing. Operations that render pages (such as PDF-to-image conversion) and OCR are particularly memory-intensive. Plan for several times the maximum expected file size in available heap per concurrent operation. +::: + +### 9.2: Resource Recommendations - + **Recommended specifications:** -- **CPU:** 2 cores -- **RAM:** 4GB -- **Disk:** 20GB -- **Concurrent operations:** 2-4 +- **CPU:** 2 cores (4+ recommended) +- **RAM:** 4 GB total, 2 GB JVM heap +- **Disk:** 10 GB free temp space +- **Expected files:** Under 20 MB -**Docker resource limits:** +**Docker Compose:** ```yaml -deploy: - resources: - limits: - memory: 4G - cpus: '2.0' - reservations: - memory: 2G - cpus: '1.0' -``` - -**Settings:** -```yaml -system: - maxFileSize: 500 # 500MB max file - connectionTimeoutMinutes: 5 - maxConcurrentOperations: 2 +services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + JAVA_TOOL_OPTIONS: "-Xms512m -Xmx2g" + deploy: + resources: + limits: + memory: 4G + cpus: '2.0' ``` - + **Recommended specifications:** -- **CPU:** 4 cores -- **RAM:** 8GB -- **Disk:** 50GB -- **Concurrent operations:** 4-8 - -**Docker resource limits:** -```yaml -deploy: - resources: - limits: - memory: 8G - cpus: '4.0' - reservations: - memory: 4G - cpus: '2.0' -``` +- **CPU:** 4–8 cores +- **RAM:** 8–16 GB total, 4–8 GB JVM heap +- **Disk:** 50 GB temp space (SSD recommended) +- **Expected files:** Up to 100 MB -**Settings:** +**Docker Compose:** ```yaml -system: - maxFileSize: 1000 # 1GB max file - connectionTimeoutMinutes: 10 - maxConcurrentOperations: 4 +services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + 
JAVA_TOOL_OPTIONS: "-Xms1g -Xmx4g" + PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 2 + deploy: + resources: + limits: + memory: 8G + cpus: '4.0' ``` **Consider:** -- Load balancer for multiple instances -- Database on separate server +- Increase LibreOffice session limit for faster document conversions — see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) +- External PostgreSQL database for reliability - + **Recommended specifications:** - **CPU:** 8+ cores -- **RAM:** 16GB+ -- **Disk:** 100GB+ (SSD recommended) -- **Concurrent operations:** 8+ +- **RAM:** 16–32 GB total, 8–16 GB JVM heap +- **Disk:** 100+ GB temp space, SSD strongly recommended +- **Expected files:** Up to 500 MB, OCR and conversion workloads -**Docker resource limits:** +**Docker Compose:** ```yaml -deploy: - resources: - limits: - memory: 16G - cpus: '8.0' - reservations: - memory: 8G - cpus: '4.0' -``` - -**Settings:** -```yaml -system: - maxFileSize: 2000 # 2GB max file - connectionTimeoutMinutes: 15 - maxConcurrentOperations: 8 +services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + JAVA_TOOL_OPTIONS: "-Xms2g -Xmx8g" + PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 4 + PROCESS_EXECUTOR_SESSION_LIMIT_TESSERACT_SESSION_LIMIT: 2 + deploy: + resources: + limits: + memory: 16G + cpus: '8.0' ``` -**Architecture:** -- Multiple instances for processing -- Load balancer with session affinity -- Dedicated database server -- Redis for session storage +**Architecture considerations:** +- Multiple instances behind a load balancer with session affinity +- Remote UNO servers for LibreOffice scaling — see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) +- External PostgreSQL database (enterprise feature) +- Shared `/configs` volume across instances for consistent settings :::tip Server/Enterprise Recommended -For large organizations, **Server or Enterprise 
plans** provide: -- Advanced load balancing -- High availability configuration -- Database clustering -- Performance monitoring -- Dedicated support +For large organizations, **Server or Enterprise plans** provide SSO, external database support, advanced monitoring, and dedicated support. [Learn more](#step-10-paid-plans-serverenterprise) ::: @@ -1434,7 +1432,44 @@ For large organizations, **Server or Enterprise plans** provide: -### 9.2: Storage Management +### 9.3: JVM Tuning + +The application runs on Java 21+ with virtual threads enabled. The JVM does not ship with fixed heap settings — it uses the JVM's automatic ergonomics, which typically sets max heap to 25% of available container memory. For production, always explicitly set the heap: + +```bash +JAVA_TOOL_OPTIONS="-Xms512m -Xmx4g" +``` + +| Setting | Meaning | Recommendation | +|---|---|---| +| `-Xms` | Initial heap size | Set to 25–50% of `-Xmx` to reduce GC churn at startup | +| `-Xmx` | Maximum heap size | Set based on your workload (see sizing table above) | + +:::caution Container Memory Limits +If running in Docker or Kubernetes with memory limits, set the container limit to **at least 1.5x the JVM max heap** to account for JVM metaspace, LibreOffice processes (~50 MB each), Tesseract, Python processes, and OS overhead. For example, if you set `-Xmx4g`, set your container memory limit to at least 6 GB. +::: + +### 9.4: Reverse Proxy Configuration + +If you're using a reverse proxy, ensure your upload size limits and timeouts are set appropriately. + +**NGINX** defaults to a 1 MB `client_max_body_size`, which will block most PDF uploads and return `.htm` error pages instead. You **must** increase this: + +```nginx +server { + client_max_body_size 2000M; # Match your Stirling PDF file upload limit + proxy_read_timeout 600s; # Allow time for large file processing + proxy_send_timeout 600s; +} +``` + +See also [FAQ](./FAQ.md) for more common issues related to reverse proxy configuration. 
+ +### 9.5: Storage & Temp File Management + +Stirling PDF stores temporary processing files in a configurable temp directory (default: the system temp directory under `stirling-pdf/`). Automatic cleanup runs every **30 minutes** and removes files older than **24 hours**. + +For high-throughput deployments, ensure your temp directory is on fast storage (SSD) with sufficient space. Monitor disk usage — if cleanup cannot keep pace with file processing, the temp directory can grow large. **Monitor disk usage:** ```bash @@ -1445,30 +1480,33 @@ docker system df du -sh ./stirling-data/* ``` -**Cleanup strategies:** +### 9.6: Job Queue Behavior -```yaml -system: - # Automatic cleanup settings - tempFileCleanup: true - tempFileMaxAge: 24 # hours +Under high load, Stirling PDF queues incoming requests with these defaults: - # Log rotation - logRetentionDays: 30 - maxLogSize: 100 # MB -``` +| Parameter | Default | Notes | +|---|---|---| +| Base queue capacity | 10 | Maximum queued jobs before rejection | +| Minimum queue capacity | 2 | Floor during resource pressure | +| Queue check interval | 1 second | How often queued jobs are re-evaluated | +| Maximum wait time | 10 minutes | After which queued jobs are rejected | -**Manual cleanup:** -```bash -# Clean Docker system -docker system prune -a --volumes +When CPU or memory exceeds critical thresholds (CPU 90%, heap 90%), the queue capacity is dynamically reduced toward the minimum to shed load. If users are experiencing rejected requests during peak usage, consider scaling horizontally with multiple instances. 
-# Clean old logs -find ./stirling-data/logs -name "*.log" -mtime +30 -delete +### 9.7: Resource-Intensive Operations -# Clean temporary files (if not auto-cleaned) -find ./stirling-data/temp -type f -mtime +1 -delete -``` +Some operations require significantly more resources than others: + +| Operation | CPU Impact | Memory Impact | Notes | +|---|---|---|---| +| Merge / Split | Low | Proportional to total file sizes | Lightweight file operations | +| OCR (Tesseract) | Very High | High | CPU-bound image analysis | +| File Conversion (LibreOffice) | High | High | Single-threaded per instance | +| PDF-to-Image | Moderate | Very High | Page rendering expands memory significantly | +| PDF/A Conversion | Moderate | High | Font embedding and color profiles | +| Compression | Moderate | High | Rewriting internal PDF structures | + +For configuration of per-tool concurrency limits and timeouts, see [Process Limits](./Configuration/Process-Limits.md). --- @@ -1517,6 +1555,7 @@ Congratulations! You've successfully deployed and configured Stirling-PDF for yo - [OCR Configuration](./Configuration/OCR.md) - Add more languages - [Pipeline Automation](./Configuration/Pipeline.md) - Automate workflows - [API Integration](./API.md) - Integrate with other systems + - [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) - Scale document conversions 3. **🔒 Harden security** - [Fail2Ban Setup](./Configuration/Fail2Ban.md) - Prevent brute force @@ -1565,9 +1604,10 @@ Congratulations! You've successfully deployed and configured Stirling-PDF for yo **Solutions:** 1. Check resource limits: `docker stats stirling-pdf` -2. Increase memory: Update `deploy.resources.limits.memory` -3. Reduce concurrent operations: Lower `maxConcurrentOperations` -4. Check disk I/O: Use SSD for better performance +2. Increase JVM heap: Set `JAVA_TOOL_OPTIONS="-Xms1g -Xmx4g"` (see [Step 9](#step-9-performance-optimization--sizing)) +3. 
Increase LibreOffice instances if document conversions are slow — see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) +4. Check disk I/O: Use SSD for temp file storage +5. Run the built-in [diagnostics tool](./Configuration/Diagnostics.md) and check application logs ### HTTPS/Certificate Issues @@ -1591,13 +1631,15 @@ Congratulations! You've successfully deployed and configured Stirling-PDF for yo ### Need More Help? +Run the built-in [diagnostics tool](./Configuration/Diagnostics.md) inside your Docker container to collect logs, configuration, and system information into a shareable archive. + **For Community Support:** -- Join Discord: https://discord.gg/Cn8pWhQRxZ +- Join Discord: https://discord.gg/HYmhKj45pU - Search GitHub Issues: https://github.com/Stirling-Tools/Stirling-PDF/issues **For Priority Support:** - Upgrade to Server or Enterprise plan -- Email: support@stirlingtools.com +- Email: support@stirlingpdf.com - Get dedicated support team --- From 759f53431f05bcf170726de5906074731f20ed31 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:18:21 +0100 Subject: [PATCH 2/4] move performance --- docs/API.md | 14 +- docs/Configuration/Configuration.md | 3 + .../Configuration/Performance-Optimization.md | 209 ++++++++++++++++++ docs/FAQ.md | 5 +- docs/Server-Admin-Onboarding.md | 187 +--------------- 5 files changed, 228 insertions(+), 190 deletions(-) create mode 100644 docs/Configuration/Performance-Optimization.md diff --git a/docs/API.md b/docs/API.md index 51bb6bb7..a632e6f3 100644 --- a/docs/API.md +++ b/docs/API.md @@ -24,10 +24,10 @@ Every Stirling PDF instance hosts its own interactive API documentation that exa Navigate to your instance's Swagger UI to browse, test, and experiment with all endpoints directly in your browser: ``` -http:///swagger-ui/index.html +http:///swagger-ui.html ``` -The path `/swagger-ui.html` also works as a redirect. 
For example: `http://localhost:8080/swagger-ui/index.html` +For example: `http://localhost:8080/swagger-ui.html` The Swagger UI lets you fill in parameters and execute requests against your running instance, which is the fastest way to learn how each endpoint works. You can also reach it from the Settings menu (gear icon in the top-right corner). @@ -308,7 +308,7 @@ The add-password endpoint uses **prevent** flags (not allow flags). To restrict | `preventExtractForAccessibility` | Extracting content for accessibility (screen readers) | | `preventFillInForm` | Filling in form fields | -Supported key lengths: `128` (AES-128) and `256` (AES-256). +Supported key lengths: `40`, `128`, and `256`. ### Remove Password @@ -407,7 +407,7 @@ The default async request timeout is **20 minutes** (1,200,000 ms). For very lar SYSTEM_CONNECTIONTIMEOUTMILLISECONDS=1800000 # 30 minutes ``` -Process-specific timeouts (LibreOffice, Tesseract, etc.) are configured separately — see the [Process Limits](./Configuration/Process-Limits.md) documentation. +Process-specific timeouts (LibreOffice, Tesseract, etc.) are configured separately - see the [Process Limits](./Configuration/Process-Limits.md) documentation. --- @@ -424,6 +424,6 @@ Process-specific timeouts (LibreOffice, Tesseract, etc.) 
are configured separate ## Related -- [Process Limits](./Configuration/Process-Limits.md) — Configure timeouts and concurrency for external tools -- [Production Deployment Guide](./Server-Admin-Onboarding.md) — Sizing and scaling recommendations -- [Diagnostics](./Configuration/Diagnostics.md) — Troubleshooting and reporting issues +- [Process Limits](./Configuration/Process-Limits.md) - Configure timeouts and concurrency for external tools +- [Production Deployment Guide](./Server-Admin-Onboarding.md) - Sizing and scaling recommendations +- [Diagnostics](./Configuration/Diagnostics.md) - Troubleshooting and reporting issues diff --git a/docs/Configuration/Configuration.md b/docs/Configuration/Configuration.md index a852f558..3312123d 100644 --- a/docs/Configuration/Configuration.md +++ b/docs/Configuration/Configuration.md @@ -233,6 +233,9 @@ For advanced features and specific use cases, see these detailed guides: ### Performance & Scaling +**[Performance Optimization & Sizing](./Performance-Optimization.md)** +- Resource sizing, JVM tuning, memory model, and scaling guidance + **[Process Limits](./Process-Limits.md)** - Session limits and timeouts for external tools diff --git a/docs/Configuration/Performance-Optimization.md b/docs/Configuration/Performance-Optimization.md new file mode 100644 index 00000000..59c02d24 --- /dev/null +++ b/docs/Configuration/Performance-Optimization.md @@ -0,0 +1,209 @@ +--- +sidebar_position: 24 +id: Performance-Optimization +title: Performance Optimization & Sizing +description: Resource sizing, JVM tuning, memory model, and scaling guidance for Stirling PDF +tags: + - Performance + - Sizing + - Memory + - Scaling +--- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Performance Optimization & Sizing + +Understanding how Stirling PDF uses resources is essential for sizing your deployment correctly. 
PDF processing is memory-intensive - a single large PDF can expand to many times its file size in memory during processing. + +--- + +## How Stirling PDF Uses Memory + +Stirling PDF loads PDFs into memory using a tiered strategy based on file size: + +| File Size | Strategy | Memory Impact | +|---|---|---| +| Up to 10 MB | Loaded entirely into JVM heap as byte array | Fast, but consumes heap proportional to file size | +| 10 MB to 50 MB | Mixed mode - 10 MB budget in heap, remainder file-backed | Moderate heap usage with disk spillover | +| Over 50 MB | Fully file-backed (scratch space on disk) | Minimal heap, but requires adequate temp disk space | + +The application also monitors heap pressure. If free heap drops below **30% of total heap** or below **256 MB absolute**, all operations are forced into file-backed mode regardless of file size. + +The maximum number of simultaneous PDF operations is bounded by a semaphore based on your CPU count: `max(4, available CPU cores)`. Each concurrent operation may hold a document in memory, so peak memory usage scales with both file sizes and concurrency. + +:::caution Memory-Intensive Operations +A 50 MB PDF with complex vector graphics, embedded fonts, and many pages can expand to 200-500 MB in memory during processing. Operations that render pages (such as PDF-to-image conversion) and OCR are particularly memory-intensive. Plan for several times the maximum expected file size in available heap per concurrent operation. 
+::: + +--- + +## Resource Recommendations + + + + +**Recommended specifications:** +- **CPU:** 2 cores (4+ recommended) +- **RAM:** 4 GB total, 2 GB JVM heap +- **Disk:** 10 GB free temp space +- **Expected files:** Under 20 MB + +**Docker Compose:** +```yaml +services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + JAVA_TOOL_OPTIONS: "-Xms512m -Xmx2g" + deploy: + resources: + limits: + memory: 4G + cpus: '2.0' +``` + + + + +**Recommended specifications:** +- **CPU:** 4-8 cores +- **RAM:** 8-16 GB total, 4-8 GB JVM heap +- **Disk:** 50 GB temp space (SSD recommended) +- **Expected files:** Up to 100 MB + +**Docker Compose:** +```yaml +services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + JAVA_TOOL_OPTIONS: "-Xms1g -Xmx4g" + PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 2 + deploy: + resources: + limits: + memory: 8G + cpus: '4.0' +``` + +**Consider:** +- Increase LibreOffice session limit for faster document conversions - see [LibreOffice Parallel Processing](./LibreOffice-Parallel-Processing.md) +- External PostgreSQL database for reliability + + + + +**Recommended specifications:** +- **CPU:** 8+ cores +- **RAM:** 16-32 GB total, 8-16 GB JVM heap +- **Disk:** 100+ GB temp space, SSD strongly recommended +- **Expected files:** Up to 500 MB, OCR and conversion workloads + +**Docker Compose:** +```yaml +services: + stirling-pdf: + image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest + environment: + JAVA_TOOL_OPTIONS: "-Xms2g -Xmx8g" + PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 4 + PROCESS_EXECUTOR_SESSION_LIMIT_TESSERACT_SESSION_LIMIT: 2 + deploy: + resources: + limits: + memory: 16G + cpus: '8.0' +``` + +**Architecture considerations:** +- Multiple instances behind a load balancer with session affinity +- Remote UNO servers for LibreOffice scaling - see [LibreOffice Parallel 
Processing](./LibreOffice-Parallel-Processing.md) +- External PostgreSQL database (enterprise feature) +- Shared `/configs` volume across instances for consistent settings + +:::tip Server/Enterprise Recommended +For large organizations, **Server or Enterprise plans** provide SSO, external database support, advanced monitoring, and dedicated support. + +[Learn more](../Server-Admin-Onboarding.md#step-10-paid-plans-serverenterprise) +::: + + + + +--- + +## JVM Tuning + +The application runs on Java 21+ with virtual threads enabled. Stirling PDF does not ship with fixed heap settings - it relies on the JVM's automatic ergonomics, which typically set the max heap to 25% of available container memory. For production, always explicitly set the heap: + +```bash +JAVA_TOOL_OPTIONS="-Xms512m -Xmx4g" +``` + +| Setting | Meaning | Recommendation | +|---|---|---| +| `-Xms` | Initial heap size | Set to 25-50% of `-Xmx` to reduce GC churn at startup | +| `-Xmx` | Maximum heap size | Set based on your workload (see Resource Recommendations above) | + +:::caution Container Memory Limits +If running in Docker or Kubernetes with memory limits, set the container limit to **at least 1.5x the JVM max heap** to account for JVM metaspace, LibreOffice processes (~50 MB each), Tesseract, Python processes, and OS overhead. For example, if you set `-Xmx4g`, set your container memory limit to at least 6 GB. +::: + +--- + +## Storage & Temp File Management + +Stirling PDF stores temporary processing files in a configurable temp directory (default: the system temp directory under `stirling-pdf/`). Automatic cleanup runs every **30 minutes** and removes files older than **24 hours**. + +For high-throughput deployments, ensure your temp directory is on fast storage (SSD) with sufficient space. Monitor disk usage - if cleanup cannot keep pace with file processing, the temp directory can grow large. 
+ +**Monitor disk usage:** +```bash +# Check Docker disk usage +docker system df + +# Check Stirling-PDF data usage +du -sh ./stirling-data/* +``` + +--- + +## Job Queue Behavior + +Under high load, Stirling PDF queues incoming requests with these defaults: + +| Parameter | Default | Notes | +|---|---|---| +| Base queue capacity | 10 | Maximum queued jobs before rejection | +| Minimum queue capacity | 2 | Floor during resource pressure | +| Queue check interval | 1 second | How often queued jobs are re-evaluated | +| Maximum wait time | 10 minutes | After which queued jobs are rejected | + +When CPU or memory exceeds critical thresholds (CPU 90%, heap 90%), the queue capacity is dynamically reduced toward the minimum to shed load. If users are experiencing rejected requests during peak usage, consider scaling horizontally with multiple instances. + +--- + +## Resource-Intensive Operations + +Some operations require significantly more resources than others: + +| Operation | CPU Impact | Memory Impact | Notes | +|---|---|---|---| +| Merge / Split | Low | Proportional to total file sizes | Lightweight file operations | +| OCR (Tesseract) | Very High | High | CPU-bound image analysis | +| File Conversion (LibreOffice) | High | High | Single-threaded per instance | +| PDF-to-Image | Moderate | Very High | Page rendering expands memory significantly | +| PDF/A Conversion | Moderate | High | Font embedding and color profiles | +| Compression | Moderate | High | Rewriting internal PDF structures | + +For configuration of per-tool concurrency limits and timeouts, see [Process Limits](./Process-Limits.md). 
+ +--- + +## Related + +- [Process Limits](./Process-Limits.md) - Configure session limits and timeouts for all external tools +- [LibreOffice Parallel Processing](./LibreOffice-Parallel-Processing.md) - Scale document conversions with multiple instances +- [Production Deployment Guide](../Server-Admin-Onboarding.md) - Full production setup walkthrough +- [Diagnostics](./Diagnostics.md) - Collect system and application diagnostics for troubleshooting diff --git a/docs/FAQ.md b/docs/FAQ.md index ac966f41..d1493f8e 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -26,12 +26,11 @@ Please report any bugs or issues you encounter through our [GitHub Issues page]( ### Q6: My Stirling PDF is using high RAM at idle. How can I optimize memory usage? Stirling PDF's memory usage can be optimized in several ways: -- **Disable additional features:** Set `DISABLE_ADDITIONAL_FEATURES=true` to reduce RAM consumption. This can significantly lower memory usage, especially during idle periods. - **Use the Ultra Lite version:** Pull the `latest-ultra-lite` tag from Docker Hub or GitHub, which is specifically designed for lower-end hardware. -- **Tune the JVM heap:** Set `JAVA_TOOL_OPTIONS="-Xms256m -Xmx1g"` to limit the maximum heap size. By default, the JVM may allocate up to 25% of container memory as heap. +- **Tune the JVM heap:** Set `JAVA_TOOL_OPTIONS="-Xms512m -Xmx2g"` to limit the maximum heap size. By default, the JVM may allocate up to 25% of container memory as heap. - **Reduce LibreOffice instances:** Each idle LibreOffice UNO server instance uses approximately 50 MB. The default session limit is 1. See [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) for details. -For detailed sizing recommendations, see [Production Deployment Guide — Performance Optimization](./Server-Admin-Onboarding.md#step-9-performance-optimization--sizing). 
+For detailed sizing recommendations, see the [Performance Optimization](./Configuration/Performance-Optimization.md) guide. ### Q7: I'm experiencing connection errors when pulling from docker.stirling.com diff --git a/docs/Server-Admin-Onboarding.md b/docs/Server-Admin-Onboarding.md index 0c3c25e8..89d31827 100644 --- a/docs/Server-Admin-Onboarding.md +++ b/docs/Server-Admin-Onboarding.md @@ -1317,197 +1317,24 @@ find backups/ -name "stirling-data-*.tar.gz" -mtime +30 -delete --- -## Step 9: Performance Optimization & Sizing +## Step 9: Performance Optimization -Understanding how Stirling PDF uses resources is essential for sizing your deployment correctly. PDF processing is memory-intensive — a single large PDF can expand to many times its file size in memory during processing. +For detailed resource sizing, JVM tuning, memory model documentation, job queue behavior, and scaling guidance, see the dedicated [Performance Optimization & Sizing](./Configuration/Performance-Optimization.md) guide. -### 9.1: How Stirling PDF Uses Memory +**Quick reference for Docker Compose resource limits:** -Stirling PDF loads PDFs into memory using a tiered strategy based on file size: - -| File Size | Strategy | Memory Impact | -|---|---|---| -| Up to 10 MB | Loaded entirely into JVM heap as byte array | Fast, but consumes heap proportional to file size | -| 10 MB to 50 MB | Mixed mode — 10 MB budget in heap, remainder file-backed | Moderate heap usage with disk spillover | -| Over 50 MB | Fully file-backed (scratch space on disk) | Minimal heap, but requires adequate temp disk space | - -The application also monitors heap pressure. If free heap drops below **30% of total heap** or below **256 MB absolute**, all operations are forced into file-backed mode regardless of file size. - -The maximum number of simultaneous PDF operations is bounded by a semaphore based on your CPU count: `max(4, available CPU cores)`. 
Each concurrent operation may hold a document in memory, so peak memory usage scales with both file sizes and concurrency. - -:::caution Memory-Intensive Operations -A 50 MB PDF with complex vector graphics, embedded fonts, and many pages can expand to 200–500 MB in memory during processing. Operations that render pages (such as PDF-to-image conversion) and OCR are particularly memory-intensive. Plan for several times the maximum expected file size in available heap per concurrent operation. -::: - -### 9.2: Resource Recommendations - - - - -**Recommended specifications:** -- **CPU:** 2 cores (4+ recommended) -- **RAM:** 4 GB total, 2 GB JVM heap -- **Disk:** 10 GB free temp space -- **Expected files:** Under 20 MB - -**Docker Compose:** -```yaml -services: - stirling-pdf: - image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest - environment: - JAVA_TOOL_OPTIONS: "-Xms512m -Xmx2g" - deploy: - resources: - limits: - memory: 4G - cpus: '2.0' -``` - - - - -**Recommended specifications:** -- **CPU:** 4–8 cores -- **RAM:** 8–16 GB total, 4–8 GB JVM heap -- **Disk:** 50 GB temp space (SSD recommended) -- **Expected files:** Up to 100 MB - -**Docker Compose:** ```yaml services: stirling-pdf: - image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest environment: - JAVA_TOOL_OPTIONS: "-Xms1g -Xmx4g" - PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 2 + JAVA_TOOL_OPTIONS: "-Xms512m -Xmx4g" # Always set explicitly for production deploy: resources: limits: - memory: 8G + memory: 8G # At least 1.5x your -Xmx value cpus: '4.0' ``` -**Consider:** -- Increase LibreOffice session limit for faster document conversions — see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) -- External PostgreSQL database for reliability - - - - -**Recommended specifications:** -- **CPU:** 8+ cores -- **RAM:** 16–32 GB total, 8–16 GB JVM heap -- **Disk:** 100+ GB temp space, SSD strongly recommended -- **Expected files:** Up to 500 MB, 
OCR and conversion workloads - -**Docker Compose:** -```yaml -services: - stirling-pdf: - image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest - environment: - JAVA_TOOL_OPTIONS: "-Xms2g -Xmx8g" - PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 4 - PROCESS_EXECUTOR_SESSION_LIMIT_TESSERACT_SESSION_LIMIT: 2 - deploy: - resources: - limits: - memory: 16G - cpus: '8.0' -``` - -**Architecture considerations:** -- Multiple instances behind a load balancer with session affinity -- Remote UNO servers for LibreOffice scaling — see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) -- External PostgreSQL database (enterprise feature) -- Shared `/configs` volume across instances for consistent settings - -:::tip Server/Enterprise Recommended -For large organizations, **Server or Enterprise plans** provide SSO, external database support, advanced monitoring, and dedicated support. - -[Learn more](#step-10-paid-plans-serverenterprise) -::: - - - - -### 9.3: JVM Tuning - -The application runs on Java 21+ with virtual threads enabled. The JVM does not ship with fixed heap settings — it uses the JVM's automatic ergonomics, which typically sets max heap to 25% of available container memory. For production, always explicitly set the heap: - -```bash -JAVA_TOOL_OPTIONS="-Xms512m -Xmx4g" -``` - -| Setting | Meaning | Recommendation | -|---|---|---| -| `-Xms` | Initial heap size | Set to 25–50% of `-Xmx` to reduce GC churn at startup | -| `-Xmx` | Maximum heap size | Set based on your workload (see sizing table above) | - -:::caution Container Memory Limits -If running in Docker or Kubernetes with memory limits, set the container limit to **at least 1.5x the JVM max heap** to account for JVM metaspace, LibreOffice processes (~50 MB each), Tesseract, Python processes, and OS overhead. For example, if you set `-Xmx4g`, set your container memory limit to at least 6 GB. 
-::: - -### 9.4: Reverse Proxy Configuration - -If you're using a reverse proxy, ensure your upload size limits and timeouts are set appropriately. - -**NGINX** defaults to a 1 MB `client_max_body_size`, which will block most PDF uploads and return `.htm` error pages instead. You **must** increase this: - -```nginx -server { - client_max_body_size 2000M; # Match your Stirling PDF file upload limit - proxy_read_timeout 600s; # Allow time for large file processing - proxy_send_timeout 600s; -} -``` - -See also [FAQ](./FAQ.md) for more common issues related to reverse proxy configuration. - -### 9.5: Storage & Temp File Management - -Stirling PDF stores temporary processing files in a configurable temp directory (default: the system temp directory under `stirling-pdf/`). Automatic cleanup runs every **30 minutes** and removes files older than **24 hours**. - -For high-throughput deployments, ensure your temp directory is on fast storage (SSD) with sufficient space. Monitor disk usage — if cleanup cannot keep pace with file processing, the temp directory can grow large. - -**Monitor disk usage:** -```bash -# Check Docker disk usage -docker system df - -# Check Stirling-PDF data usage -du -sh ./stirling-data/* -``` - -### 9.6: Job Queue Behavior - -Under high load, Stirling PDF queues incoming requests with these defaults: - -| Parameter | Default | Notes | -|---|---|---| -| Base queue capacity | 10 | Maximum queued jobs before rejection | -| Minimum queue capacity | 2 | Floor during resource pressure | -| Queue check interval | 1 second | How often queued jobs are re-evaluated | -| Maximum wait time | 10 minutes | After which queued jobs are rejected | - -When CPU or memory exceeds critical thresholds (CPU 90%, heap 90%), the queue capacity is dynamically reduced toward the minimum to shed load. If users are experiencing rejected requests during peak usage, consider scaling horizontally with multiple instances. 
- -### 9.7: Resource-Intensive Operations - -Some operations require significantly more resources than others: - -| Operation | CPU Impact | Memory Impact | Notes | -|---|---|---|---| -| Merge / Split | Low | Proportional to total file sizes | Lightweight file operations | -| OCR (Tesseract) | Very High | High | CPU-bound image analysis | -| File Conversion (LibreOffice) | High | High | Single-threaded per instance | -| PDF-to-Image | Moderate | Very High | Page rendering expands memory significantly | -| PDF/A Conversion | Moderate | High | Font embedding and color profiles | -| Compression | Moderate | High | Rewriting internal PDF structures | - -For configuration of per-tool concurrency limits and timeouts, see [Process Limits](./Configuration/Process-Limits.md). - --- ## Step 10: Paid Plans (Server/Enterprise) @@ -1604,8 +1431,8 @@ Congratulations! You've successfully deployed and configured Stirling-PDF for yo **Solutions:** 1. Check resource limits: `docker stats stirling-pdf` -2. Increase JVM heap: Set `JAVA_TOOL_OPTIONS="-Xms1g -Xmx4g"` (see [Step 9](#step-9-performance-optimization--sizing)) -3. Increase LibreOffice instances if document conversions are slow — see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) +2. Increase JVM heap - see [Performance Optimization](./Configuration/Performance-Optimization.md) +3. Increase LibreOffice instances if document conversions are slow - see [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) 4. Check disk I/O: Use SSD for temp file storage 5. 
Run the built-in [diagnostics tool](./Configuration/Diagnostics.md) and check application logs From b5a36ecc14a15632c16a595ac2a55d1aae1f3aba Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:43:10 +0100 Subject: [PATCH 3/4] reset api docs --- docs/API.md | 446 ++++++++-------------------------------------------- 1 file changed, 63 insertions(+), 383 deletions(-) diff --git a/docs/API.md b/docs/API.md index a632e6f3..50f4a97b 100644 --- a/docs/API.md +++ b/docs/API.md @@ -2,7 +2,7 @@ sidebar_position: 7 id: API title: API -description: REST API documentation, authentication, and usage examples for Stirling PDF +description: Overview of API offering in S-PDF tags: - API --- @@ -11,419 +11,99 @@ import TabItem from '@theme/TabItem'; # Stirling PDF API -Stirling PDF provides a REST API for all PDF operations available in the web UI. The API uses multipart form data for file uploads and returns processed files directly in the response body. +Stirling PDF exposes a simple API for easy integration with external scripts. You can access the API documentation in two ways: ---- - -## API Documentation - -Every Stirling PDF instance hosts its own interactive API documentation that exactly matches the installed version. +1. Local Swagger UI at `/swagger-ui.html` on your Stirling PDF instance +2. Online [Swagger Documentation](https://app.swaggerhub.com/apis-docs/Frooodle/Stirling-PDF/) -### Local Swagger UI (Recommended) +You can also access the documentation through the settings menu (gear icon in the top-right corner). -Navigate to your instance's Swagger UI to browse, test, and experiment with all endpoints directly in your browser: - -``` -http:///swagger-ui.html -``` +## Accessing API Documentation -For example: `http://localhost:8080/swagger-ui.html` +### Local Swagger UI +Your Stirling PDF instance includes built-in API documentation: +1. Navigate to `http://your-instance:port/swagger-ui.html` +2. 
Or append `/swagger-ui.html` to your Stirling PDF URL +3. This provides an interactive documentation interface where you can: + - View all available endpoints + - Test API calls directly + - See request/response schemas + - View authentication requirements -The Swagger UI lets you fill in parameters and execute requests against your running instance, which is the fastest way to learn how each endpoint works. You can also reach it from the Settings menu (gear icon in the top-right corner). - -### OpenAPI Specification (Machine-Readable) - -The raw OpenAPI 3.0 JSON specification is available at: - -``` -http://your-instance:port/v1/api-docs -``` - -You can import this into tools like Postman or Insomnia, or use it to generate client libraries in any language. - -:::caution Always Use Your Local Swagger UI -Always reference the Swagger UI on your own instance rather than external API documentation links. The endpoints and parameters may differ between versions, and your local Swagger UI is always accurate for your installed version. -::: - ---- +### Settings Menu Access +1. Click the gear icon (⚙️) in the top-right corner +2. Look for the "API Documentation" or "API" link +3. This will take you to the local Swagger UI ## API Authentication -When security is enabled, all API requests require authentication via the `X-API-KEY` header. +When security is enabled, all API requests require authentication. There are two ways to handle API authentication: ### User-Specific API Keys - -1. Log into Stirling PDF -2. Go to Account Settings (via the gear icon) -3. Find your API key in the account details +1. 
Obtain your API key: + - Log into Stirling PDF + - Go to Account Settings (via the gear icon) + - Find your API key in the account details ### Global API Key - -You can set a custom global API key via environment variable: - +You can set a custom global API key using the environment variable: ```bash SECURITY_CUSTOMGLOBALAPIKEY=your-custom-api-key ``` - This allows you to set a single API key that works regardless of user authentication. -### Using the API Key - -Include the API key in every request using the `X-API-KEY` header: - -```bash -curl -H "X-API-KEY: your-api-key-here" ... -``` - -If login/security is not enabled, the API endpoints are accessible without authentication. - ---- - -## Basic Request Pattern +2. Include the API key in all requests: + ```http + X-API-KEY: your-api-key-here + ``` -All PDF processing endpoints follow the same pattern: - -1. Send a `POST` request with `Content-Type: multipart/form-data` -2. Attach the PDF file as `fileInput` -3. Include operation-specific parameters as form fields -4. Receive the processed PDF (or other output format) in the response body - -Successful responses return the processed file directly with appropriate content headers (`Content-Type: application/pdf` and `Content-Disposition`). Error responses return JSON with details about what went wrong. - ---- +3. Example authenticated request: + ```bash + curl -X POST "http://localhost:8080/add-watermark" \ + -H "X-API-KEY: your-api-key-here" \ + -H "Content-Type: multipart/form-data" \ + ... + ``` ## API Limitations -Not all Stirling PDF features are available through the API. Some operations (such as "view-pdf" or "visually sign") run exclusively on the front end and are only available through the Web UI. If you find that some API endpoints appear to be missing, this is likely the reason. - -Stirling PDF also provides health and statistics endpoints for integration with monitoring and dashboard applications. 
- ---- - -## Example API Requests - -### Merge Multiple PDFs - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/general/merge-pdfs" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@file1.pdf" \ - -F "fileInput=@file2.pdf" \ - -F "sortType=orderProvided" \ - > merged_output.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/general/merge-pdfs" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@file1.pdf" ^ - -F "fileInput=@file2.pdf" ^ - -F "sortType=orderProvided" ^ - > merged_output.pdf - ``` - - - -### Split a PDF by Pages - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/general/split-pages" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@document.pdf" \ - -F "pageNumbers=1,3,5-10" \ - > split_output.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/general/split-pages" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@document.pdf" ^ - -F "pageNumbers=1,3,5-10" ^ - > split_output.pdf - ``` - - - -### Convert Office Document to PDF - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/convert/file/pdf" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@document.docx" \ - > converted.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/convert/file/pdf" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@document.docx" ^ - > converted.pdf - ``` - - - -### Add Watermark - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/security/add-watermark" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@document.pdf" \ - -F "watermarkType=text" \ - -F "watermarkText=CONFIDENTIAL" \ - -F "fontSize=30" \ - -F "rotation=45" \ - -F "opacity=0.5" \ - > watermarked.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/security/add-watermark" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@document.pdf" ^ - -F "watermarkType=text" ^ - -F "watermarkText=CONFIDENTIAL" ^ - -F "fontSize=30" ^ - -F "rotation=45" ^ - -F "opacity=0.5" ^ - > watermarked.pdf - ``` - - - 
-### OCR a Scanned PDF - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/misc/ocr-pdf" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@scanned_document.pdf" \ - -F "languages=eng" \ - -F "ocrType=force-ocr" \ - > searchable_document.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/misc/ocr-pdf" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@scanned_document.pdf" ^ - -F "languages=eng" ^ - -F "ocrType=force-ocr" ^ - > searchable_document.pdf - ``` - - - -### Compress a PDF - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/general/optimize-pdf" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@large_document.pdf" \ - -F "optimizeLevel=2" \ - > compressed_document.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/general/optimize-pdf" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@large_document.pdf" ^ - -F "optimizeLevel=2" ^ - > compressed_document.pdf - ``` - - - ---- +Stirling PDF's feature set is not entirely confined to the backend, hence not all functionalities are accessible via the API. Certain operations, such as the "view-pdf" or "visually sign", are executed exclusively on the front-end, and as such, they are only available through the Web-UI. If you encounter a situation where some API endpoints appear to be absent, it is likely attributable to these front-end exclusive features. -## Password Protection & Permissions +Stirling PDF also has statistics and health endpoints to integrate with monitoring/dashboard applications. -The add-password endpoint uses **prevent** flags (not allow flags). To restrict specific actions, set the corresponding `prevent*` parameter to `true`. 
+## Example CURL Commands ```bash - curl -X POST "http://localhost:8080/api/v1/security/add-password" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@document.pdf" \ - -F "ownerPassword=OwnerPass123" \ - -F "keyLength=256" \ - -F "preventPrinting=false" \ - -F "preventModify=true" \ - -F "preventAssembly=true" \ - -F "preventExtractContent=true" \ - -F "preventExtractForAccessibility=false" \ - -F "preventFillInForm=false" \ - -F "preventModifyAnnotations=true" \ - -F "preventPrintingFaithful=false" \ - > protected_file.pdf + curl -X POST "http://localhost:8080/add-watermark" \ + -H "Content-Type: multipart/form-data" \ + -F "fileInput=@/Users/username/Downloads/sample-1_cropped.pdf" \ + -F "watermarkType=text" \ + -F "watermarkText=YOUR_WATERMARK_TEXT" \ + -F "alphabet=roman" \ + -F "fontSize=30" \ + -F "rotation=0" \ + -F "opacity=0.5" \ + -F "widthSpacer=50" \ + -F "heightSpacer=50" \ + > "/Users/username/Downloads/output.pdf" ``` ```bash - curl -X POST "http://localhost:8080/api/v1/security/add-password" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@document.pdf" ^ - -F "ownerPassword=OwnerPass123" ^ - -F "keyLength=256" ^ - -F "preventPrinting=false" ^ - -F "preventModify=true" ^ - -F "preventAssembly=true" ^ - -F "preventExtractContent=true" ^ - -F "preventExtractForAccessibility=false" ^ - -F "preventFillInForm=false" ^ - -F "preventModifyAnnotations=true" ^ - -F "preventPrintingFaithful=false" ^ - > protected_file.pdf + curl -X POST "http://localhost:8080/add-watermark" ^ + -H "Content-Type: multipart/form-data" ^ + -F "fileInput=@C:\Users\systo\Downloads\sample-1_cropped.pdf" ^ + -F "watermarkType=text" ^ + -F "watermarkText=YOUR_WATERMARK_TEXT" ^ + -F "alphabet=roman" ^ + -F "fontSize=30" ^ + -F "rotation=0" ^ + -F "opacity=0.5" ^ + -F "widthSpacer=50" ^ + -F "heightSpacer=50" ^ + > "C:\Users\systo\Downloads\output.pdf" ``` - -### Permission Flags Reference - -| Flag | What it prevents when `true` | -|---|---| -| `preventPrinting` | Standard 
quality printing | -| `preventPrintingFaithful` | High fidelity printing | -| `preventModify` | Modifying document content | -| `preventModifyAnnotations` | Modifying annotations and comments | -| `preventAssembly` | Assembling the document (merge, rearrange pages) | -| `preventExtractContent` | Copying text and graphics | -| `preventExtractForAccessibility` | Extracting content for accessibility (screen readers) | -| `preventFillInForm` | Filling in form fields | - -Supported key lengths: `40`, `128`, and `256`. - -### Remove Password - - - - ```bash - curl -X POST "http://localhost:8080/api/v1/security/remove-password" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@protected_file.pdf" \ - -F "password=CurrentPassword123" \ - > unlocked_file.pdf - ``` - - - ```bash - curl -X POST "http://localhost:8080/api/v1/security/remove-password" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@protected_file.pdf" ^ - -F "password=CurrentPassword123" ^ - > unlocked_file.pdf - ``` - - - ---- - -## PDF-to-CSV and PDF-to-XLSX Conversion - -These endpoints extract tabular data from PDF files: - - - - ```bash - # PDF to CSV - curl -X POST "http://localhost:8080/api/v1/convert/pdf/csv" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@table_document.pdf" \ - -F "pageNumbers=all" \ - > extracted_tables.csv - - # PDF to Excel - curl -X POST "http://localhost:8080/api/v1/convert/pdf/xlsx" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@table_document.pdf" \ - > extracted_tables.xlsx - ``` - - - ```bash - REM PDF to CSV - curl -X POST "http://localhost:8080/api/v1/convert/pdf/csv" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@table_document.pdf" ^ - -F "pageNumbers=all" ^ - > extracted_tables.csv - - REM PDF to Excel - curl -X POST "http://localhost:8080/api/v1/convert/pdf/xlsx" ^ - -H "X-API-KEY: your-api-key" ^ - -F "fileInput=@table_document.pdf" ^ - > extracted_tables.xlsx - ``` - - - -:::danger Empty or 0-byte Output? 
-These endpoints extract text-based tables from PDFs. The text in your PDF must be selectable (not scanned images). If you get empty or 0-byte output, the PDF likely contains image-based content. Run OCR on the document first using the `/api/v1/misc/ocr-pdf` endpoint to make the text extractable, then retry the conversion. See [OCR Configuration](./Configuration/OCR.md) for language pack setup. -::: - ---- - -## Response Handling - -Always check the HTTP status code before processing the response body: - -```bash -response=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/api/v1/general/merge-pdfs" \ - -H "X-API-KEY: your-api-key" \ - -F "fileInput=@file1.pdf" \ - -F "fileInput=@file2.pdf") - -http_code=$(echo "$response" | tail -1) -if [ "$http_code" -ne 200 ]; then - echo "Error: HTTP $http_code" - echo "$response" | head -n -1 -fi -``` - ---- - -## Rate Limits and Timeouts - -The default async request timeout is **20 minutes** (1,200,000 ms). For very large files or complex operations, this can be adjusted: - -```bash -SYSTEM_CONNECTIONTIMEOUTMILLISECONDS=1800000 # 30 minutes -``` - -Process-specific timeouts (LibreOffice, Tesseract, etc.) are configured separately - see the [Process Limits](./Configuration/Process-Limits.md) documentation. 
- ---- - -## Health and Monitoring Endpoints - -| Endpoint | Purpose | -|---|---| -| `/api/v1/info/status` | Application status (used by Docker health checks) | -| `/api/v1/info/health` | Detailed health information | -| `/actuator/health` | Spring Boot Actuator health endpoint | -| `/actuator/prometheus` | Prometheus-compatible metrics export | - ---- - -## Related - -- [Process Limits](./Configuration/Process-Limits.md) - Configure timeouts and concurrency for external tools -- [Production Deployment Guide](./Server-Admin-Onboarding.md) - Sizing and scaling recommendations -- [Diagnostics](./Configuration/Diagnostics.md) - Troubleshooting and reporting issues From d9f1bc3c856de3a4dca8ae21aae397e04e2f6e77 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:55:42 +0100 Subject: [PATCH 4/4] simplify performance and FAQ for admins --- .../Configuration/Performance-Optimization.md | 94 +++++-------------- docs/FAQ.md | 2 +- docs/Server-Admin-Onboarding.md | 16 +--- 3 files changed, 26 insertions(+), 86 deletions(-) diff --git a/docs/Configuration/Performance-Optimization.md b/docs/Configuration/Performance-Optimization.md index 59c02d24..d3f5c473 100644 --- a/docs/Configuration/Performance-Optimization.md +++ b/docs/Configuration/Performance-Optimization.md @@ -2,11 +2,10 @@ sidebar_position: 24 id: Performance-Optimization title: Performance Optimization & Sizing -description: Resource sizing, JVM tuning, memory model, and scaling guidance for Stirling PDF +description: Resource sizing and scaling guidance for Stirling PDF deployments tags: - Performance - Sizing - - Memory - Scaling --- import Tabs from '@theme/Tabs'; @@ -14,7 +13,7 @@ import TabItem from '@theme/TabItem'; # Performance Optimization & Sizing -Understanding how Stirling PDF uses resources is essential for sizing your deployment correctly. 
PDF processing is memory-intensive - a single large PDF can expand to many times its file size in memory during processing. +PDF processing is memory-intensive - a single large PDF can expand to many times its file size in memory during processing. This guide helps you size your deployment correctly. --- @@ -24,16 +23,14 @@ Stirling PDF loads PDFs into memory using a tiered strategy based on file size: | File Size | Strategy | Memory Impact | |---|---|---| -| Up to 10 MB | Loaded entirely into JVM heap as byte array | Fast, but consumes heap proportional to file size | -| 10 MB to 50 MB | Mixed mode - 10 MB budget in heap, remainder file-backed | Moderate heap usage with disk spillover | -| Over 50 MB | Fully file-backed (scratch space on disk) | Minimal heap, but requires adequate temp disk space | +| Up to 10 MB | Loaded entirely into memory | Fast, but consumes memory proportional to file size | +| 10 MB to 50 MB | Partially in memory, remainder stored on disk | Moderate memory usage with disk spillover | +| Over 50 MB | Fully stored on disk during processing | Minimal memory usage, but requires adequate disk space | -The application also monitors heap pressure. If free heap drops below **30% of total heap** or below **256 MB absolute**, all operations are forced into file-backed mode regardless of file size. - -The maximum number of simultaneous PDF operations is bounded by a semaphore based on your CPU count: `max(4, available CPU cores)`. Each concurrent operation may hold a document in memory, so peak memory usage scales with both file sizes and concurrency. +The application also monitors memory pressure. If available memory drops too low, all operations are forced into disk-backed mode regardless of file size. :::caution Memory-Intensive Operations -A 50 MB PDF with complex vector graphics, embedded fonts, and many pages can expand to 200-500 MB in memory during processing. 
Operations that render pages (such as PDF-to-image conversion) and OCR are particularly memory-intensive. Plan for several times the maximum expected file size in available heap per concurrent operation. +A 50 MB PDF with complex vector graphics, embedded fonts, and many pages can expand to 200-500 MB in memory during processing. Operations that render pages (such as PDF-to-image conversion) and OCR are particularly memory-intensive. Plan for several times the maximum expected file size per concurrent operation. ::: --- @@ -45,17 +42,14 @@ A 50 MB PDF with complex vector graphics, embedded fonts, and many pages can exp **Recommended specifications:** - **CPU:** 2 cores (4+ recommended) -- **RAM:** 4 GB total, 2 GB JVM heap -- **Disk:** 10 GB free temp space -- **Expected files:** Under 20 MB +- **RAM:** 4 GB +- **Disk:** 10 GB free space **Docker Compose:** ```yaml services: stirling-pdf: image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest - environment: - JAVA_TOOL_OPTIONS: "-Xms512m -Xmx2g" deploy: resources: limits: @@ -68,9 +62,8 @@ services: **Recommended specifications:** - **CPU:** 4-8 cores -- **RAM:** 8-16 GB total, 4-8 GB JVM heap -- **Disk:** 50 GB temp space (SSD recommended) -- **Expected files:** Up to 100 MB +- **RAM:** 8-16 GB +- **Disk:** 50 GB (SSD recommended) **Docker Compose:** ```yaml @@ -78,7 +71,6 @@ services: stirling-pdf: image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest environment: - JAVA_TOOL_OPTIONS: "-Xms1g -Xmx4g" PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 2 deploy: resources: @@ -92,13 +84,12 @@ services: - External PostgreSQL database for reliability - + **Recommended specifications:** - **CPU:** 8+ cores -- **RAM:** 16-32 GB total, 8-16 GB JVM heap -- **Disk:** 100+ GB temp space, SSD strongly recommended -- **Expected files:** Up to 500 MB, OCR and conversion workloads +- **RAM:** 16-32 GB +- **Disk:** 100+ GB, SSD strongly recommended **Docker Compose:** ```yaml @@ -106,7 +97,6 @@ 
services: stirling-pdf: image: docker.stirlingpdf.com/stirlingtools/stirling-pdf:latest environment: - JAVA_TOOL_OPTIONS: "-Xms2g -Xmx8g" PROCESS_EXECUTOR_SESSION_LIMIT_LIBRE_OFFICE_SESSION_LIMIT: 4 PROCESS_EXECUTOR_SESSION_LIMIT_TESSERACT_SESSION_LIMIT: 2 deploy: @@ -133,60 +123,24 @@ For large organizations, **Server or Enterprise plans** provide SSO, external da --- -## JVM Tuning - -The application runs on Java 21+ with virtual threads enabled. The JVM does not ship with fixed heap settings - it uses the JVM's automatic ergonomics, which typically sets max heap to 25% of available container memory. For production, always explicitly set the heap: - -```bash -JAVA_TOOL_OPTIONS="-Xms512m -Xmx4g" -``` - -| Setting | Meaning | Recommendation | -|---|---|---| -| `-Xms` | Initial heap size | Set to 25-50% of `-Xmx` to reduce GC churn at startup | -| `-Xmx` | Maximum heap size | Set based on your workload (see sizing table above) | - -:::caution Container Memory Limits -If running in Docker or Kubernetes with memory limits, set the container limit to **at least 1.5x the JVM max heap** to account for JVM metaspace, LibreOffice processes (~50 MB each), Tesseract, Python processes, and OS overhead. For example, if you set `-Xmx4g`, set your container memory limit to at least 6 GB. -::: - ---- - -## Storage & Temp File Management +## Fine Tuning -Stirling PDF stores temporary processing files in a configurable temp directory (default: the system temp directory under `stirling-pdf/`). Automatic cleanup runs every **30 minutes** and removes files older than **24 hours**. +For most deployments, Stirling PDF's defaults work well and no manual tuning is needed. If you are experiencing performance issues with large files or high concurrency, you can adjust the memory allocated to the application using the `JAVA_TOOL_OPTIONS` environment variable: -For high-throughput deployments, ensure your temp directory is on fast storage (SSD) with sufficient space. 
Monitor disk usage - if cleanup cannot keep pace with file processing, the temp directory can grow large. - -**Monitor disk usage:** -```bash -# Check Docker disk usage -docker system df - -# Check Stirling-PDF data usage -du -sh ./stirling-data/* +```yaml +services: + stirling-pdf: + environment: + JAVA_TOOL_OPTIONS: "-Xms512m -Xmx4g" ``` ---- - -## Job Queue Behavior - -Under high load, Stirling PDF queues incoming requests with these defaults: - -| Parameter | Default | Notes | -|---|---|---| -| Base queue capacity | 10 | Maximum queued jobs before rejection | -| Minimum queue capacity | 2 | Floor during resource pressure | -| Queue check interval | 1 second | How often queued jobs are re-evaluated | -| Maximum wait time | 10 minutes | After which queued jobs are rejected | - -When CPU or memory exceeds critical thresholds (CPU 90%, heap 90%), the queue capacity is dynamically reduced toward the minimum to shed load. If users are experiencing rejected requests during peak usage, consider scaling horizontally with multiple instances. +`-Xms` sets the initial memory allocation and `-Xmx` sets the maximum. If running in Docker or Kubernetes with memory limits, set the container limit to **at least 1.5x the `-Xmx` value** to leave room for background processes like LibreOffice and Tesseract. --- ## Resource-Intensive Operations -Some operations require significantly more resources than others: +Some operations require significantly more resources than others. If your organization primarily uses specific tools, you should size your deployment based on the most resource-heavy operations your users will perform. 
| Operation | CPU Impact | Memory Impact | Notes | |---|---|---|---| @@ -197,7 +151,7 @@ Some operations require significantly more resources than others: | PDF/A Conversion | Moderate | High | Font embedding and color profiles | | Compression | Moderate | High | Rewriting internal PDF structures | -For configuration of per-tool concurrency limits and timeouts, see [Process Limits](./Process-Limits.md). +For example, if your team primarily uses OCR and document conversion, you will need significantly more resources than a team that mainly merges and splits PDFs. Adjust your [Process Limits](./Process-Limits.md) and resource allocation accordingly. --- diff --git a/docs/FAQ.md b/docs/FAQ.md index d1493f8e..eebff0e8 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -27,7 +27,7 @@ Please report any bugs or issues you encounter through our [GitHub Issues page]( Stirling PDF's memory usage can be optimized in several ways: - **Use the Ultra Lite version:** Pull the `latest-ultra-lite` tag from Docker Hub or GitHub, which is specifically designed for lower-end hardware. -- **Tune the JVM heap:** Set `JAVA_TOOL_OPTIONS="-Xms512m -Xmx2g"` to limit the maximum heap size. By default, the JVM may allocate up to 25% of container memory as heap. +- **Tune memory allocation:** See the [Fine Tuning](./Configuration/Performance-Optimization.md#fine-tuning) section of the Performance Optimization guide for how to adjust memory limits. - **Reduce LibreOffice instances:** Each idle LibreOffice UNO server instance uses approximately 50 MB. The default session limit is 1. See [LibreOffice Parallel Processing](./Configuration/LibreOffice-Parallel-Processing.md) for details. For detailed sizing recommendations, see the [Performance Optimization](./Configuration/Performance-Optimization.md) guide. 
diff --git a/docs/Server-Admin-Onboarding.md b/docs/Server-Admin-Onboarding.md index 89d31827..f986c22c 100644 --- a/docs/Server-Admin-Onboarding.md +++ b/docs/Server-Admin-Onboarding.md @@ -1319,21 +1319,7 @@ find backups/ -name "stirling-data-*.tar.gz" -mtime +30 -delete ## Step 9: Performance Optimization -For detailed resource sizing, JVM tuning, memory model documentation, job queue behavior, and scaling guidance, see the dedicated [Performance Optimization & Sizing](./Configuration/Performance-Optimization.md) guide. - -**Quick reference for Docker Compose resource limits:** - -```yaml -services: - stirling-pdf: - environment: - JAVA_TOOL_OPTIONS: "-Xms512m -Xmx4g" # Always set explicitly for production - deploy: - resources: - limits: - memory: 8G # At least 1.5x your -Xmx value - cpus: '4.0' -``` +For resource sizing recommendations, scaling guidance, and fine tuning, see the dedicated [Performance Optimization & Sizing](./Configuration/Performance-Optimization.md) guide. ---