Skip to content

Commit 10d7802

Browse files
jerann and claude committed
fix(models): honour declared charset in RFC 5987 filename* parsing
The Content-Disposition filename* branch captured the charset in its regex but discarded it, so unquote() always decoded with UTF-8. RFC 5987 also permits ISO-8859-1, so an ISO-8859-1 percent-encoded filename decoded to mojibake despite the helper claiming RFC 5987 support.

- Decode the extended value with its declared charset; fall back to UTF-8 for an unknown or empty charset label (LookupError) instead of raising.
- Strip surrounding quotes off non-conformant quoted filename* values.
- Add tests for ISO-8859-1, an unknown charset, and a quoted filename*.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent def9b92 commit 10d7802

2 files changed

Lines changed: 16 additions & 2 deletions

File tree

stackone_ai/models.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,12 +76,20 @@ def _filename_from_content_disposition(value: str | None) -> str | None:
7676
7777
Handles both the plain ``filename="example.pdf"`` form and the RFC 5987 extended
7878
``filename*=UTF-8''example%20file.pdf`` form (which takes precedence when present).
79+
The extended form is percent-decoded using its declared charset (RFC 5987 permits
80+
both ``UTF-8`` and ``ISO-8859-1``); an unknown or empty charset falls back to UTF-8.
7981
"""
8082
if not value:
8183
return None
82-
extended = re.search(r"filename\*\s*=\s*[^']*'[^']*'([^;]+)", value, re.IGNORECASE)
84+
extended = re.search(r"filename\*\s*=\s*([^']*)'[^']*'([^;]+)", value, re.IGNORECASE)
8385
if extended:
84-
return unquote(extended.group(1).strip())
86+
charset = extended.group(1).strip() or "utf-8"
87+
encoded = extended.group(2).strip().strip('"')
88+
try:
89+
return unquote(encoded, encoding=charset, errors="replace") or None
90+
except LookupError:
91+
# Unrecognised charset label - decode as UTF-8 rather than failing.
92+
return unquote(encoded, encoding="utf-8", errors="replace") or None
8593
quoted = re.search(r'filename\s*=\s*"([^"]*)"', value, re.IGNORECASE)
8694
if quoted:
8795
return quoted.group(1).strip() or None

tests/test_tool_calling.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -504,6 +504,12 @@ def test_is_json_content_type(self, content_type, expected):
504504
('inline; filename="my report.docx"', "my report.docx"),
505505
# RFC 5987 extended form is percent-decoded and takes precedence.
506506
("attachment; filename=\"fallback.txt\"; filename*=UTF-8''na%C3%AFve.txt", "naïve.txt"),
507+
# Non-UTF-8 charset is honoured: 0xA3 is "£" in ISO-8859-1, not UTF-8.
508+
("attachment; filename*=ISO-8859-1'en'%A3%20rates.txt", "£ rates.txt"),
509+
# Unknown charset label falls back to UTF-8 instead of raising.
510+
("attachment; filename*=bogus-charset''%C2%A3.txt", "£.txt"),
511+
# Non-conformant quoted extended value: surrounding quotes are stripped.
512+
("attachment; filename*=\"UTF-8''na%C3%AFve.txt\"", "naïve.txt"),
507513
("attachment", None),
508514
(None, None),
509515
("", None),

0 commit comments

Comments
 (0)