From ab525d49f070bed3c6b9f331e0be2a8ad3ac719c Mon Sep 17 00:00:00 2001
From: Taksh
Date: Wed, 8 Apr 2026 08:52:35 +0530
Subject: [PATCH] Shorten cache filenames to fit eCryptfs 143-byte NAME_MAX
 limit

url_to_filename() was appending the full trailing URL path component
(e.g. tfidf_vectors_sparse.npz) to the hash-based filename, producing
names up to 154 characters. This exceeds the 143-byte NAME_MAX on
eCryptfs-encrypted filesystems, causing OSError: File name too long.

Now only the file extension is preserved (e.g. .npz), keeping the
worst-case filename (including .json sidecar) under 143 bytes.
_find_existing_cache_file() matches both old and new filename formats
for backward compatibility.

Fixes #539

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scispacy/file_cache.py   | 37 +++++++++++++++++++++++++++++++++++--
 tests/test_file_cache.py | 20 ++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/scispacy/file_cache.py b/scispacy/file_cache.py
index 9ff99180..ed309db8 100644
--- a/scispacy/file_cache.py
+++ b/scispacy/file_cache.py
@@ -55,9 +55,15 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
     Convert `url` into a hashed filename in a repeatable way.
     If `etag` is specified, append its hash to the url's, delimited
     by a period.
-    """
+
+    Only the file extension from the original URL is preserved (not the
+    full trailing path component) to keep filenames short enough for
+    filesystems with a 143-byte NAME_MAX (e.g. eCryptfs).
+    See: https://github.com/allenai/scispacy/issues/539
+    """
     last_part = url.split("/")[-1]
+    _, ext = os.path.splitext(last_part)
+
     url_bytes = url.encode("utf-8")
     url_hash = sha256(url_bytes)
     filename = url_hash.hexdigest()
@@ -67,7 +73,7 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
         etag_hash = sha256(etag_bytes)
         filename += "." + etag_hash.hexdigest()
 
-    filename += "." + last_part
+    filename += ext
 
     return filename
 
@@ -106,6 +112,33 @@ def http_get(url: str, temp_file: IO) -> None:
     pbar.close()
 
 
+def _find_existing_cache_file(url: str, cache_dir: str) -> Optional[str]:
+    """
+    Check if a cached file already exists for the given URL.
+    Since the filename includes the etag (which we may not have without a
+    network call), we look for any file matching the URL hash prefix.
+
+    Supports both old-format filenames (<url_hash>.<etag_hash>.<last_part>)
+    and new-format filenames (<url_hash>.<etag_hash><ext>).
+    """
+    url_bytes = url.encode("utf-8")
+    url_hash = sha256(url_bytes).hexdigest()
+    last_part = url.split("/")[-1]
+    _, ext = os.path.splitext(last_part)
+
+    for filename in os.listdir(cache_dir):
+        if filename.endswith(".json") or not os.path.isfile(
+            os.path.join(cache_dir, filename)
+        ):
+            continue
+        if not filename.startswith(url_hash):
+            continue
+        if filename.endswith("." + last_part) or filename.endswith(ext):
+            return os.path.join(cache_dir, filename)
+    return None
+
+
+
 def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
     """
     Given a URL, look for the corresponding dataset in the local cache.
diff --git a/tests/test_file_cache.py b/tests/test_file_cache.py
index 9ca4a714..b577782f 100644
--- a/tests/test_file_cache.py
+++ b/tests/test_file_cache.py
@@ -59,3 +59,23 @@ def test_url_to_filename_with_etags_eliminates_quotes(self):
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag == "mytag"
+
+    def test_url_to_filename_length_under_ecryptfs_limit(self):
+        """Filenames (including .json sidecar) must stay under 143 bytes for eCryptfs.
+        See: https://github.com/allenai/scispacy/issues/539
+        """
+        # These are the actual URLs used by scispacy linkers
+        urls = [
+            'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin',
+            'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib',
+            'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz',
+            'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json',
+        ]
+        # Simulate a realistic 64-char hex ETag
+        long_etag = '"d41d8cd98f00b204e9800998ecf8427ed41d8cd98f00b204e9800998ecf8427e"'
+        for url in urls:
+            filename = url_to_filename(url, etag=long_etag)
+            meta_filename = filename + ".json"
+            assert len(meta_filename) < 143, (
+                f"Metadata filename too long for eCryptfs ({len(meta_filename)} >= 143): {meta_filename}"
+            )