fix(models): honour declared charset in RFC 5987 filename* parsing

jerann · claude · jerann · commit 10d780275f61 · 2026-06-03T14:34:31.000+02:00
The Content-Disposition filename* branch matched the charset in its
regex but never captured it, so unquote() always decoded as UTF-8.
RFC 5987 also permits ISO-8859-1, so an ISO-8859-1 percent-encoded
filename decoded to mojibake despite the helper claiming RFC 5987
support.

- Decode the extended value with its declared charset; fall back to
  UTF-8 for an unknown or empty charset label (LookupError) instead of
  raising.
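- Strip surrounding quotes off non-conformant quoted filename* values.
  Note: the opening quote is consumed into the charset label, so such
  values are always decoded through the UTF-8 fallback, whatever
  charset they declare.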
- Add tests for ISO-8859-1, an unknown charset, and a quoted filename*.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/stackone_ai/models.py b/stackone_ai/models.py
@@ -76,12 +76,20 @@ def _filename_from_content_disposition(value: str | None) -> str | None:
 
     Handles both the plain ``filename="example.pdf"`` form and the RFC 5987 extended
     ``filename*=UTF-8''example%20file.pdf`` form (which takes precedence when present).
+    The extended form is percent-decoded using its declared charset (RFC 5987 permits
+    both ``UTF-8`` and ``ISO-8859-1``); an unknown or empty charset falls back to UTF-8.
     """
     if not value:
         return None
-    extended = re.search(r"filename\*\s*=\s*[^']*'[^']*'([^;]+)", value, re.IGNORECASE)
+    extended = re.search(r"filename\*\s*=\s*([^']*)'[^']*'([^;]+)", value, re.IGNORECASE)
     if extended:
-        return unquote(extended.group(1).strip())
+        charset = extended.group(1).strip() or "utf-8"
+        encoded = extended.group(2).strip().strip('"')
+        try:
+            return unquote(encoded, encoding=charset, errors="replace") or None
+        except LookupError:
+            # Unrecognised charset label - decode as UTF-8 rather than failing.
+            return unquote(encoded, encoding="utf-8", errors="replace") or None
     quoted = re.search(r'filename\s*=\s*"([^"]*)"', value, re.IGNORECASE)
     if quoted:
         return quoted.group(1).strip() or None
diff --git a/tests/test_tool_calling.py b/tests/test_tool_calling.py
@@ -504,6 +504,12 @@ def test_is_json_content_type(self, content_type, expected):
             ('inline; filename="my report.docx"', "my report.docx"),
             # RFC 5987 extended form is percent-decoded and takes precedence.
             ("attachment; filename=\"fallback.txt\"; filename*=UTF-8''na%C3%AFve.txt", "naïve.txt"),
+            # Non-UTF-8 charset is honoured: 0xA3 is "£" in ISO-8859-1, not UTF-8.
+            ("attachment; filename*=ISO-8859-1'en'%A3%20rates.txt", "£ rates.txt"),
+            # Unknown charset label falls back to UTF-8 instead of raising.
+            ("attachment; filename*=bogus-charset''%C2%A3.txt", "£.txt"),
+            # Non-conformant quoted extended value: surrounding quotes are stripped.
+            ("attachment; filename*=\"UTF-8''na%C3%AFve.txt\"", "naïve.txt"),
             ("attachment", None),
             (None, None),
             ("", None),