From bf0e32c86bedd82adf09474173d0d9748b5b39c7 Mon Sep 17 00:00:00 2001
From: Ashut0sh-mishra
Date: Wed, 15 Apr 2026 07:07:12 +0000
Subject: [PATCH 1/2] fix: handle file names exceeding OS limit in cache (#539)

Long entity names or cache keys could exceed the 255-character
filesystem limit causing OSError. Changed url_to_filename() to only
preserve the file extension instead of the full trailing URL component,
keeping filenames under 143 bytes (eCryptfs limit). Added
backward-compat lookup for old-style cache entries.

Fixes #539

Co-authored-by: nik464
---
 scispacy/file_cache.py   | 27 ++++++++++++++++++++++++---
 tests/test_file_cache.py | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/scispacy/file_cache.py b/scispacy/file_cache.py
index 9ff99180..a5edb1b5 100644
--- a/scispacy/file_cache.py
+++ b/scispacy/file_cache.py
@@ -56,8 +56,6 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
     If `etag` is specified, append its hash to the url's, delimited
     by a period.
     """
-
-    last_part = url.split("/")[-1]
     url_bytes = url.encode("utf-8")
     url_hash = sha256(url_bytes)
     filename = url_hash.hexdigest()
@@ -67,7 +65,12 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
         etag_hash = sha256(etag_bytes)
         filename += "." + etag_hash.hexdigest()
 
-    filename += "." + last_part
+    # Only keep the file extension to stay within filesystem NAME_MAX
+    # limits (e.g. 143 bytes on eCryptfs).
+    _, ext = os.path.splitext(url.split("/")[-1])
+    if ext:
+        filename += ext
+
     return filename
 
 
@@ -106,6 +109,19 @@ def http_get(url: str, temp_file: IO) -> None:
     pbar.close()
 
 
+def _find_legacy_cache_path(
+    url: str, etag: Optional[str], cache_dir: str
+) -> Optional[str]:
+    """Check for a cached file using the old naming scheme (full trailing URL component)."""
+    last_part = url.split("/")[-1]
+    filename = sha256(url.encode("utf-8")).hexdigest()
+    if etag:
+        filename += "." + sha256(etag.encode("utf-8")).hexdigest()
+    filename += "." + last_part
+    path = os.path.join(cache_dir, filename)
+    return path if os.path.exists(path) else None
+
+
 def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
     """
     Given a URL, look for the corresponding dataset in the local cache.
@@ -131,6 +147,11 @@ def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
     cache_path = os.path.join(cache_dir, filename)
 
     if not os.path.exists(cache_path):
+        # Check for files cached under the old naming scheme, which appended
+        # the full trailing URL component instead of just the extension.
+        legacy_path = _find_legacy_cache_path(url, etag, cache_dir)
+        if legacy_path is not None:
+            return legacy_path
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
         with tempfile.NamedTemporaryFile() as temp_file:  # type: IO
diff --git a/tests/test_file_cache.py b/tests/test_file_cache.py
index 9ca4a714..5b214a74 100644
--- a/tests/test_file_cache.py
+++ b/tests/test_file_cache.py
@@ -59,3 +59,38 @@ def test_url_to_filename_with_etags_eliminates_quotes(self):
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag == "mytag"
+
+    def test_url_to_filename_stays_within_name_max(self):
+        # eCryptfs limits filenames to 143 bytes; make sure we stay under that
+        # even with a long URL and etag.
+        long_url = "https://s3-us-west-2.amazonaws.com/bucket/" + "a" * 300 + "/file.npz"
+        long_etag = "x" * 300
+        filename = url_to_filename(long_url, etag=long_etag)
+        assert len(filename) <= 143
+        assert filename.endswith(".npz")
+        # also without etag
+        filename_no_etag = url_to_filename(long_url)
+        assert len(filename_no_etag) <= 143
+
+    def test_url_to_filename_no_extension(self):
+        # URLs without a file extension should still produce a valid filename
+        filename = url_to_filename("https://example.com/data/somefile")
+        assert len(filename) == 64  # just the sha256 hex digest
+        assert "." not in filename
+
+    def test_legacy_cache_files_still_found(self):
+        from scispacy.file_cache import _find_legacy_cache_path
+        from hashlib import sha256
+
+        url = "https://example.com/data/model.bin"
+        etag = "some-etag"
+        # Create a file with the old naming scheme
+        last_part = url.split("/")[-1]
+        old_filename = sha256(url.encode("utf-8")).hexdigest()
+        old_filename += "." + sha256(etag.encode("utf-8")).hexdigest()
+        old_filename += "." + last_part
+        old_path = os.path.join(self.TEST_DIR, old_filename)
+        pathlib.Path(old_path).touch()
+
+        found = _find_legacy_cache_path(url, etag, self.TEST_DIR)
+        assert found == old_path

From 6425711d0e2678ce0376a51e3b621b139cbd594e Mon Sep 17 00:00:00 2001
From: Ashut0sh-mishra
Date: Wed, 15 Apr 2026 10:55:28 +0000
Subject: [PATCH 2/2] feat: add entity merging across multiple NER models (#388)

Adds scispacy/entity_merging.py with:
- merge_overlapping_spans(): keeps longest non-overlapping spans from a
  flat list using spacy.util.filter_spans
- merge_entities(): runs text through multiple spaCy models, collects
  all recognized entities, optionally adds abbreviation long forms, and
  returns a single Doc with merged ents

Also adds tests/test_entity_merging.py covering overlap resolution,
deduplication, and multi-model span merging.

Fixes #388

Co-authored-by: nik464
---
 scispacy/entity_merging.py   | 144 +++++++++++++++++++++++++++++++++++
 tests/test_entity_merging.py |  79 +++++++++++++++++++
 2 files changed, 223 insertions(+)
 create mode 100644 scispacy/entity_merging.py
 create mode 100644 tests/test_entity_merging.py

diff --git a/scispacy/entity_merging.py b/scispacy/entity_merging.py
new file mode 100644
index 00000000..17e70677
--- /dev/null
+++ b/scispacy/entity_merging.py
@@ -0,0 +1,144 @@
+"""
+Merge entities recognized by different NER models,
+optionally incorporating abbreviation long forms as entities.
+
+Usage
+-----
+
+.. code-block:: python
+
+    import spacy
+    from scispacy.entity_merging import merge_entities
+
+    text = "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease."
+    doc = merge_entities(
+        text,
+        model_names=["en_core_sci_sm", "en_core_sci_lg"],
+        use_abbreviations=True,
+    )
+    print(doc.ents)
+
+Or as a function you call on an already-processed list of docs:
+
+.. code-block:: python
+
+    from scispacy.entity_merging import merge_overlapping_spans
+
+    merged = merge_overlapping_spans(all_spans, doc)
+"""
+
+from typing import List, Optional
+
+import spacy
+from spacy.language import Language
+from spacy.tokens import Doc, Span
+from spacy.util import filter_spans
+
+
+def merge_overlapping_spans(spans: List[Span], doc: Doc) -> List[Span]:
+    """
+    Given a flat list of (possibly overlapping) spans that all reference
+    the same Doc, return a filtered list keeping the longest non-overlapping
+    spans. Ties are broken by whichever span appears first.
+    """
+    if not spans:
+        return []
+    # filter_spans keeps the longest span when there's overlap
+    return filter_spans(spans)
+
+
+def _collect_entity_spans(
+    text: str,
+    model_names: List[str],
+    use_abbreviations: bool,
+) -> tuple:
+    """
+    Run *text* through each model in *model_names*, collect every entity
+    span, and (optionally) add spans for abbreviation long forms.
+
+    Returns (base_doc, all_spans) where *base_doc* is the Doc produced by
+    the first model and *all_spans* are Span objects that all reference
+    *base_doc*.
+    """
+    if not model_names:
+        raise ValueError("model_names must contain at least one model")
+
+    pipelines = [spacy.load(name) for name in model_names]
+    base_nlp = pipelines[0]
+    base_doc = base_nlp(text)
+
+    all_spans: List[Span] = list(base_doc.ents)
+
+    # Entities from the remaining models need to be projected onto base_doc
+    # because spaCy Spans are tied to a specific Doc object.
+    for nlp in pipelines[1:]:
+        other_doc = nlp(text)
+        for ent in other_doc.ents:
+            try:
+                span = base_doc.char_span(ent.start_char, ent.end_char, label=ent.label_)
+            except Exception:
+                continue
+            if span is not None:
+                all_spans.append(span)
+
+    if use_abbreviations:
+        all_spans = _add_abbreviation_spans(base_nlp, base_doc, all_spans)
+
+    return base_doc, all_spans
+
+
+def _add_abbreviation_spans(
+    nlp: Language, doc: Doc, spans: List[Span]
+) -> List[Span]:
+    """
+    If AbbreviationDetector is in the pipeline, use detected long forms
+    to create additional entity spans.
+    """
+    try:
+        nlp.get_pipe("abbreviation_detector")
+    except KeyError:
+        # no abbreviation detector in this pipeline — add one temporarily
+        from scispacy.abbreviation import AbbreviationDetector  # noqa: F811
+
+        nlp.add_pipe("abbreviation_detector")
+        doc = nlp(doc.text)
+
+    for abrv in doc._.abbreviations:
+        long_form = abrv._.long_form
+        if long_form is None:
+            continue
+        # long_form is already a Span on our doc
+        if isinstance(long_form, Span):
+            spans.append(long_form)
+
+    return spans
+
+
+def merge_entities(
+    text: str,
+    model_names: List[str],
+    use_abbreviations: bool = True,
+) -> Doc:
+    """
+    Run *text* through multiple spaCy NER models, collect all recognized
+    entities, optionally add abbreviation long forms, and return a single
+    Doc whose ``.ents`` contains the longest non-overlapping entity spans.
+
+    Parameters
+    ----------
+    text : str
+        The text to process.
+    model_names : list of str
+        Names of spaCy models to use (e.g. ``["en_core_sci_sm", "en_core_sci_lg"]``).
+    use_abbreviations : bool, optional (default True)
+        Whether to incorporate abbreviation long forms as candidate entities.
+
+    Returns
+    -------
+    Doc
+        A spaCy Doc with merged entities set as ``doc.ents``.
+    """
+    base_doc, all_spans = _collect_entity_spans(text, model_names, use_abbreviations)
+    merged = merge_overlapping_spans(all_spans, base_doc)
+    base_doc.ents = merged
+    return base_doc
diff --git a/tests/test_entity_merging.py b/tests/test_entity_merging.py
new file mode 100644
index 00000000..a3c0ee83
--- /dev/null
+++ b/tests/test_entity_merging.py
@@ -0,0 +1,79 @@
+import unittest
+
+import spacy
+from spacy.tokens import Doc, Span
+
+from scispacy.entity_merging import merge_overlapping_spans
+
+
+class TestMergeOverlappingSpans(unittest.TestCase):
+    def setUp(self):
+        self.nlp = spacy.blank("en")
+
+    def _make_doc(self, text):
+        return self.nlp(text)
+
+    def test_no_spans_returns_empty(self):
+        doc = self._make_doc("hello world")
+        result = merge_overlapping_spans([], doc)
+        assert result == []
+
+    def test_non_overlapping_spans_kept(self):
+        doc = self._make_doc("Spinal atrophy and motor neuron disease are conditions")
+        span_a = doc.char_span(0, 14, label="ENTITY")  # "Spinal atrophy"
+        span_b = doc.char_span(19, 39, label="ENTITY")  # "motor neuron disease"
+        assert span_a is not None
+        assert span_b is not None
+        result = merge_overlapping_spans([span_a, span_b], doc)
+        assert len(result) == 2
+
+    def test_overlapping_spans_keep_longest(self):
+        doc = self._make_doc("Spinal and bulbar muscular atrophy is a disease")
+        short = doc.char_span(0, 6, label="ENTITY")  # "Spinal"
+        long = doc.char_span(0, 34, label="ENTITY")  # "Spinal and bulbar muscular atrophy"
+        assert short is not None
+        assert long is not None
+        result = merge_overlapping_spans([short, long], doc)
+        assert len(result) == 1
+        assert result[0].text == "Spinal and bulbar muscular atrophy"
+
+    def test_partial_overlap_keeps_longest(self):
+        doc = self._make_doc("bulbar muscular atrophy is studied")
+        span_a = doc.char_span(0, 23, label="ENTITY")  # "bulbar muscular atrophy"
+        span_b = doc.char_span(7, 23, label="ENTITY")  # "muscular atrophy"
+        assert span_a is not None
+        assert span_b is not None
+        result = merge_overlapping_spans([span_a, span_b], doc)
+        assert len(result) == 1
+        assert result[0].text == "bulbar muscular atrophy"
+
+    def test_duplicate_spans_deduplicated(self):
+        doc = self._make_doc("motor neuron disease is common")
+        span_a = doc.char_span(0, 20, label="ENTITY")
+        span_b = doc.char_span(0, 20, label="ENTITY")
+        assert span_a is not None
+        assert span_b is not None
+        result = merge_overlapping_spans([span_a, span_b], doc)
+        assert len(result) == 1
+
+    def test_many_overlapping_spans(self):
+        # Simulates entities from multiple models with different granularity
+        doc = self._make_doc("Spinal and bulbar muscular atrophy caused by androgen receptor")
+        spans = []
+        # model A: fragments
+        spans.append(doc.char_span(0, 6, label="ENTITY"))  # "Spinal"
+        spans.append(doc.char_span(11, 34, label="ENTITY"))  # "bulbar muscular atrophy"
+        # model B: full phrase
+        spans.append(doc.char_span(0, 34, label="ENTITY"))  # "Spinal and bulbar muscular atrophy"
+        # model C: second entity
+        spans.append(doc.char_span(45, 62, label="ENTITY"))  # "androgen receptor"
+        # filter out any None from char_span misalignment
+        spans = [s for s in spans if s is not None]
+
+        result = merge_overlapping_spans(spans, doc)
+        texts = {s.text for s in result}
+        assert "Spinal and bulbar muscular atrophy" in texts
+        assert "androgen receptor" in texts
+        # fragments should be gone
+        assert "Spinal" not in texts
+        assert "bulbar muscular atrophy" not in texts