diff --git a/docling_core/utils/file.py b/docling_core/utils/file.py
index ca8dba39..c858bda1 100644
--- a/docling_core/utils/file.py
+++ b/docling_core/utils/file.py
@@ -1,6 +1,7 @@
 """File-related utilities."""
 
 import ipaddress
+import logging
 import re
 import tempfile
 from io import BytesIO
@@ -14,10 +15,44 @@
 
 from docling_core.types.doc.utils import relative_path
 from docling_core.types.io import DocumentStream
+from docling_core.utils.settings import settings
+
+_logger = logging.getLogger(__name__)
 
 _MAX_REDIRECTS = 5
 
 
+def _ip_in_allowlist(
+    ip: "ipaddress.IPv4Address | ipaddress.IPv6Address", allowlist: list[str]
+) -> bool:
+    """Return whether an IP address matches any allowlist entry.
+
+    Args:
+        ip: Address to look up (IPv4 or IPv6).
+        allowlist: Single addresses or CIDR networks, as strings. Blank
+            entries are ignored; malformed ones are logged and skipped.
+
+    Returns:
+        True if ``ip`` lies within at least one valid entry.
+    """
+    for raw_entry in allowlist:
+        entry = raw_entry.strip()
+        if not entry:
+            continue
+        try:
+            network = ipaddress.ip_network(entry)
+        except ValueError:
+            # Strict parsing: networks with host bits set are rejected too.
+            _logger.warning(
+                "Skipping malformed entry in DOCLINGCORE_ALLOWED_PRIVATE_IPS: %r",
+                entry,
+            )
+            continue
+        if ip in network:
+            return True
+    return False
+
+
 def _is_safe_url(url: str) -> bool:
     """Return whether a URL resolves to a globally routable address."""
     try:
@@ -37,6 +72,9 @@ def _is_safe_url(url: str) -> bool:
         ip = ipaddress.ip_address(ip_str)
     except (socket.gaierror, socket.herror):
         return False
+    # Explicitly allowlisted addresses bypass the global-routability check.
+    if settings.allowed_private_ips and _ip_in_allowlist(ip, settings.allowed_private_ips):
+        return True
 
     return ip.is_global and not (
         ip.is_private
diff --git a/docling_core/utils/settings.py b/docling_core/utils/settings.py
index c54fd356..9dc89d3f 100644
--- a/docling_core/utils/settings.py
+++ b/docling_core/utils/settings.py
@@ -6,6 +6,8 @@ class CoreSettings(BaseSettings):
 
     allow_image_file_uri: bool = False
     max_image_decoded_size: int = 20 * 1024 * 1024  # 20MB
+    # Addresses or CIDR networks that URL safety checks accept even when not globally routable.
+    allowed_private_ips: list[str] = []
 
 
 settings = CoreSettings()
diff --git a/test/test_utils.py b/test/test_utils.py
index 3df2eef7..1513de2e 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -192,7 +192,62 @@ def test_is_safe_url_rejects_private_networks():
     assert _is_safe_url("http://8.8.8.8/file")
     assert _is_safe_url("https://example.com/file")
     assert _is_safe_url("https://github.com/github/file")
+
+
+def test_ip_in_allowlist():
+    """Test matching of exact, CIDR, malformed, and IPv6 allowlist entries."""
+    import ipaddress
+    from docling_core.utils.file import _ip_in_allowlist
+
+    assert _ip_in_allowlist(ipaddress.ip_address("127.0.0.1"), ["127.0.0.1"])
+    assert not _ip_in_allowlist(ipaddress.ip_address("127.0.0.1"), ["127.0.0.2"])
+
+    assert _ip_in_allowlist(ipaddress.ip_address("10.20.30.40"), ["10.0.0.0/8"])
+    assert not _ip_in_allowlist(ipaddress.ip_address("10.21.0.1"), ["10.20.0.0/16"])
+
+    assert _ip_in_allowlist(ipaddress.ip_address("192.168.1.100"), ["192.168.1.0/24"])
+    # A network with host bits set is malformed under strict parsing, so it is skipped.
+    assert not _ip_in_allowlist(ipaddress.ip_address("192.168.1.100"), ["192.168.1.5/24"])
+
+    assert _ip_in_allowlist(ipaddress.ip_address("127.0.0.1"), ["garbage", "127.0.0.1"])
+    assert not _ip_in_allowlist(ipaddress.ip_address("127.0.0.1"), ["garbage", "300.300.300.300"])
+
+    # IPv6 entries are supported; addresses never match an entry of the other family.
+    assert _ip_in_allowlist(ipaddress.ip_address("::1"), ["::1/128"])
+    assert not _ip_in_allowlist(ipaddress.ip_address("::1"), ["127.0.0.1"])
+
+
+def test_allowed_private_ips(monkeypatch):
+    """Test that allowlisted private addresses pass the URL safety check."""
+    import pytest
+    from docling_core.utils.file import resolve_source_to_stream
+    from docling_core.utils.settings import settings
+    from requests import Response
+
+    def mock_get(*args, **kwargs):
+        r = Response()
+        r.status_code = 200
+        r._content = b"ok"
+        return r
+
+    monkeypatch.setattr("requests.Session.get", mock_get)
+
+    monkeypatch.setattr(settings, "allowed_private_ips", [])
+    with pytest.raises(ValueError, match="URL is not allowed"):
+        resolve_source_to_stream("http://10.0.0.1/file")
+
+    monkeypatch.setattr(settings, "allowed_private_ips", [""])
+    with pytest.raises(ValueError, match="URL is not allowed"):
+        resolve_source_to_stream("http://10.0.0.1/file")
+
+    monkeypatch.setattr(settings, "allowed_private_ips", ["127.0.0.1"])
+    assert resolve_source_to_stream("http://127.0.0.1/file").stream.read() == b"ok"
+
+    monkeypatch.setattr(settings, "allowed_private_ips", ["192.168.1.0/24"])
+    assert resolve_source_to_stream("http://192.168.1.42/doc").stream.read() == b"ok"
+    with pytest.raises(ValueError, match="URL is not allowed"):
+        resolve_source_to_stream("http://192.168.2.1/doc")
 
 
 def test_resolve_remote_filename_sanitizes_content_disposition(monkeypatch):
     """Test filename normalization from Content-Disposition."""