Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docling_core/utils/file.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""File-related utilities."""

import ipaddress
import logging
import re
import tempfile
from io import BytesIO
Expand All @@ -14,10 +15,27 @@

from docling_core.types.doc.utils import relative_path
from docling_core.types.io import DocumentStream
from docling_core.utils.settings import settings

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# NOTE(review): presumably the maximum number of HTTP redirects followed when
# fetching a remote source; the usage is not visible in this hunk, so confirm.
_MAX_REDIRECTS = 5


def _ip_in_allowlist(ip: ipaddress.IPv4Address, allowlist: list[str]) -> bool:
for entry in allowlist:
entry = entry.strip()
if not entry:
continue
try:
network = ipaddress.ip_network(entry)
if ip in network:
return True
except ValueError:
_logger.warning(f"Skipping malformed entry in DOCLINGCORE_ALLOWED_PRIVATE_IPS: {entry!r}")
return False


def _is_safe_url(url: str) -> bool:
"""Return whether a URL resolves to a globally routable address."""
try:
Expand All @@ -37,6 +55,8 @@ def _is_safe_url(url: str) -> bool:
ip = ipaddress.ip_address(ip_str)
except (socket.gaierror, socket.herror):
return False
if settings.allowed_private_ips and _ip_in_allowlist(ip, settings.allowed_private_ips):
return True

return ip.is_global and not (
ip.is_private
Expand Down
1 change: 1 addition & 0 deletions docling_core/utils/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class CoreSettings(BaseSettings):

allow_image_file_uri: bool = False
max_image_decoded_size: int = 20 * 1024 * 1024 # 20MB
allowed_private_ips: list[str] = []


# Module-level CoreSettings instance, created once at import time; other
# modules import this object rather than constructing their own.
settings = CoreSettings()
47 changes: 47 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,54 @@ def test_is_safe_url_rejects_private_networks():
assert _is_safe_url("http://8.8.8.8/file")
assert _is_safe_url("https://example.com/file")
assert _is_safe_url("https://github.com/github/file")

def test_ip_in_allowlist():
    """Check exact-address, CIDR and malformed-entry handling of ``_ip_in_allowlist``."""
    from ipaddress import ip_address

    from docling_core.utils.file import _ip_in_allowlist

    # Each case is (address, allowlist, expected result).
    cases = [
        # Single-address entries match only that exact address.
        ("127.0.0.1", ["127.0.0.1"], True),
        ("127.0.0.1", ["127.0.0.2"], False),
        # CIDR blocks match every address inside the block and nothing outside.
        ("10.20.30.40", ["10.0.0.0/8"], True),
        ("10.21.0.1", ["10.20.0.0/16"], False),
        ("192.168.1.100", ["192.168.1.0/24"], True),
        # A CIDR entry with host bits set never matches.
        ("192.168.1.100", ["192.168.1.5/24"], False),
        # Malformed entries are skipped without hiding later valid ones.
        ("127.0.0.1", ["garbage", "127.0.0.1"], True),
        ("127.0.0.1", ["garbage", "300.300.300.300"], False),
    ]

    for address, allowlist, expected in cases:
        matched = _ip_in_allowlist(ip_address(address), allowlist)
        assert bool(matched) == expected, (address, allowlist)



def test_allowed_private_ips(monkeypatch):
    """Check that ``settings.allowed_private_ips`` gates access to private URLs.

    ``requests.Session.get`` is replaced with a stub returning a 200 response
    with body ``b"ok"``, so no network traffic is made.
    """
    import pytest
    from docling_core.utils.file import resolve_source_to_stream
    from docling_core.utils.settings import settings
    from requests import Response

    def fake_get(*args, **kwargs):
        # Minimal successful response carrying a known payload.
        response = Response()
        response.status_code = 200
        response._content = b"ok"
        return response

    def set_allowlist(entries):
        monkeypatch.setattr(settings, "allowed_private_ips", entries)

    def assert_rejected(url):
        with pytest.raises(ValueError, match="URL is not allowed"):
            resolve_source_to_stream(url)

    monkeypatch.setattr("requests.Session.get", fake_get)

    # With an empty allowlist the private address is refused.
    set_allowlist([])
    assert_rejected("http://10.0.0.1/file")

    # A blank entry allows nothing either.
    set_allowlist([""])
    assert_rejected("http://10.0.0.1/file")

    # An exact-address entry lets that address through.
    set_allowlist(["127.0.0.1"])
    assert resolve_source_to_stream("http://127.0.0.1/file").stream.read() == b"ok"

    # A CIDR entry admits addresses inside the block and rejects those outside.
    set_allowlist(["192.168.1.0/24"])
    assert resolve_source_to_stream("http://192.168.1.42/doc").stream.read() == b"ok"
    assert_rejected("http://192.168.2.1/doc")

def test_resolve_remote_filename_sanitizes_content_disposition(monkeypatch):
"""Test filename normalization from Content-Disposition."""
Expand Down
Loading