From cd59fe54352efab4e3c2e16fd74affd0b0fe2458 Mon Sep 17 00:00:00 2001
From: Christos Koutras <koutras21@gmail.com>
Date: Mon, 4 May 2026 09:29:07 -0400
Subject: [PATCH 01/13] add embeddings support for jaccard distance, replace
 jaccard with generic tversky index, add more one-to-one filtering methods

---
 README.md                                     |   4 +-
 docs/api.md                                   |  41 ++-
 docs/changelog.md                             |   2 +-
 docs/example.md                               |   2 +-
 docs/faq.md                                   |   2 +-
 docs/metrics.md                               |   2 +-
 docs/results.md                               |  10 +-
 examples/valentine_example_mixed.py           |   2 +-
 examples/valentine_example_pandas.py          |   2 +-
 examples/valentine_example_polars.py          |   2 +-
 experiments/bench.py                          |  43 ++-
 pyproject.toml                                |   3 +
 tests/test_coverage_gaps.py                   |   8 +-
 tests/test_distribution_based_benchmark.py    |   5 +-
 tests/test_docs_smoke.py                      |   4 +-
 tests/test_matcher_results.py                 |  13 +-
 .../algorithms/jaccard_distance/__init__.py   |   3 +
 .../jaccard_distance/jaccard_distance.py      | 284 +++++++++++++++++-
 valentine/algorithms/matcher_results.py       | 173 +++++++++--
 valentine/metrics/base_metric.py              |  14 +-
 valentine/metrics/metric_helpers.py           |  20 ++
 valentine/metrics/metrics.py                  |  64 +++-
 22 files changed, 622 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index 62fcc2e..d33087d 100644
--- a/README.md
+++ b/README.md
@@ -137,9 +137,9 @@ for pair, score in matches.items():
 ```python
 top_n_matches = matches.take_top_n(5)
 top_n_percent_matches = matches.take_top_percent(25)
-one_to_one_matches = matches.one_to_one()
+one_to_one_matches = matches.one_to_one_hungarian()
 high_confidence = matches.filter(min_score=0.7)
-one_to_one_strict = matches.one_to_one(threshold=0.5)
+one_to_one_strict = matches.one_to_one_hungarian(threshold=0.5)
 ```
 
 ### Match details (Coma)
diff --git a/docs/api.md b/docs/api.md
index a7e3b26..b34b189 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -141,7 +141,8 @@ class MatcherResults(Mapping[ColumnPair, float]):
 Immutable `Mapping` returned by [`valentine_match`](#valentine_match).
 Entries are sorted from highest to lowest similarity score on
 construction. Because the mapping is immutable, derived views (such as
-the cached result of [`one_to_one`](#one_to_one)) cannot be silently
+the cached result of [`one_to_one_hungarian`](#one_to_one_hungarian))
+cannot be silently
 invalidated.
 
 ### Mapping protocol
@@ -186,15 +187,17 @@ All transformations return a **new** `MatcherResults` instance; the
 original is left untouched. Sub-matcher details are carried over to the
 filtered subset.
 
-#### `one_to_one`
+#### `one_to_one_hungarian`
 
 ```python
-def one_to_one(threshold: float | None = None) -> MatcherResults
+def one_to_one_hungarian(threshold: float | None = None) -> MatcherResults
 ```
 
-Greedy bipartite filter: starting from the highest-scoring pair, assign
-each source and each target column **at most one** partner. Pairs below
-`threshold` are discarded.
+Default 1:1 selector. Globally optimal bipartite filter via Hungarian
+assignment (`scipy.optimize.linear_sum_assignment`): each source and
+each target column appears in **at most one** returned pair, with the
+assignment chosen to maximise total similarity. Pairs below `threshold`
+are discarded.
 
 - `threshold=None` (default) uses the median of unique similarity scores
   as the cutoff, and the result is cached.
@@ -202,6 +205,30 @@ each source and each target column **at most one** partner. Pairs below
 - When the input has fewer than two distinct score values, all entries
   are returned unchanged.
 
+#### `one_to_one_greedy`
+
+```python
+def one_to_one_greedy(threshold: float | None = None) -> MatcherResults
+```
+
+Greedy bipartite filter, kept for backwards compatibility. Starting
+from the highest-scoring pair, greedily assigns each source and each
+target column at most one partner. Same threshold semantics as
+`one_to_one_hungarian`. Greedy can lock in a locally-best pair that
+blocks a better global assignment, so prefer the Hungarian variant
+unless you need the legacy behaviour.
+
+#### `one_to_one_mutual_top`
+
+```python
+def one_to_one_mutual_top(n: int = 1) -> MatcherResults
+```
+
+Mutual top-`n` filter: keeps pair `(s, t)` only if `t` is in `s`'s
+top-`n` targets AND `s` is in `t`'s top-`n` sources. With `n=1` this
+is the classic mutual nearest-neighbour filter — high-precision, drops
+one-sided affinities. Typically more selective than `one_to_one_hungarian`.
+
 #### `filter`
 
 ```python
@@ -615,7 +642,7 @@ Precision(one_to_one: bool = True)
 ```
 
 `TP / (TP + FP)`. When `one_to_one=True` (default), applies
-`MatcherResults.one_to_one()` before counting.
+`MatcherResults.one_to_one_hungarian()` before counting.
 
 #### `Recall`
 
diff --git a/docs/changelog.md b/docs/changelog.md
index 543ad93..23e3fe4 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -84,7 +84,7 @@ coming from 0.5.x or earlier, the changes below will affect your code.
   `valentine_match` / `valentine_match_batch` pair.
 - **Immutable [`MatcherResults`](api.md#matcherresults).** The result
   object is now a `Mapping`, not a `dict` subclass. Derived views
-  (e.g. [`one_to_one()`](api.md#one_to_one)) are cached and cannot be
+  (e.g. [`one_to_one_hungarian()`](api.md#one_to_one_hungarian)) are cached and cannot be
   silently invalidated.
 - [`Coma`](api.md#coma) is now a pure-Python implementation of
   COMA 3.0 — no JVM dependency. Constructor signature updated to
diff --git a/docs/example.md b/docs/example.md
index d614442..af23f8f 100644
--- a/docs/example.md
+++ b/docs/example.md
@@ -60,7 +60,7 @@ def main():
 
-    # 4. Reduce to one-to-one matches (greedy, highest-first).
+    # 4. Reduce to one-to-one matches (optimal Hungarian assignment).
     print("\nGetting the one-to-one matches:")
-    pp.pprint(matches.one_to_one())
+    pp.pprint(matches.one_to_one_hungarian())
 
     # 5. If you have a ground truth, compute evaluation metrics.
     ground_truth = [
diff --git a/docs/faq.md b/docs/faq.md
index e8ff040..114afdd 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -95,7 +95,7 @@ provides three reduction helpers:
 ```python
 matches.take_top_n(10)              # absolute top 10
 matches.take_top_percent(5)         # top 5%
-matches.one_to_one()                # bidirectional best matches
+matches.one_to_one_hungarian()      # globally optimal one-to-one assignment
 ```
 
 All three return a new `MatcherResults` — the original is immutable.
diff --git a/docs/metrics.md b/docs/metrics.md
index 2acc695..7d061c3 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -71,7 +71,7 @@ from valentine.metrics import (
 
 `Precision`, `Recall`, `F1Score` and `PrecisionTopNPercent` all accept a
 `one_to_one: bool` flag that applies
-[`MatcherResults.one_to_one()`](api.md#one_to_one) before counting.
+[`MatcherResults.one_to_one_hungarian()`](api.md#one_to_one_hungarian) before counting.
 `PrecisionTopNPercent` additionally takes `n: int` for the cutoff, and
 `RecallAtSizeofGroundTruth` defaults to `one_to_one=False`. See the
 [API reference](api.md#built-in-metrics) for full defaults.
diff --git a/docs/results.md b/docs/results.md
index 1fe5d6f..21e9362 100644
--- a/docs/results.md
+++ b/docs/results.md
@@ -10,7 +10,7 @@ mapping** of [`ColumnPair`](api.md#columnpair) keys to similarity
 scores, sorted from highest score to lowest. It behaves like a `dict`
 for lookup and iteration, but cannot be mutated (preventing accidental
 invalidation of cached derived views such as
-[`one_to_one()`](api.md#one_to_one)).
+[`one_to_one_hungarian()`](api.md#one_to_one_hungarian)).
 
 For the authoritative method signatures, see the API reference for
 [`MatcherResults`](api.md#matcherresults) and
@@ -82,24 +82,24 @@ strong = matches.filter(min_score=0.7)
 
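-# Reduce to one-to-one matches (greedy, highest-first). Threshold defaults
-# to the median score of the current results.
+# Reduce to one-to-one matches (optimal Hungarian assignment). Threshold
+# defaults to the median score of the current results.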
-one_to_one = matches.one_to_one()
+one_to_one = matches.one_to_one_hungarian()
 
 # Override the threshold to be stricter
-strict = matches.one_to_one(threshold=0.8)
+strict = matches.one_to_one_hungarian(threshold=0.8)
 ```
 
 Each method is documented in full in the API reference:
 [`take_top_n`](api.md#take_top_n),
 [`take_top_percent`](api.md#take_top_percent),
 [`filter`](api.md#filter), and
-[`one_to_one`](api.md#one_to_one).
+[`one_to_one_hungarian`](api.md#one_to_one_hungarian).
 
 Every transformation returns a **new**
 [`MatcherResults`](api.md#matcherresults) instance, so you can chain
 them:
 
 ```python
-best_strict_pairs = matches.filter(min_score=0.5).one_to_one(threshold=0.7)
+best_strict_pairs = matches.filter(min_score=0.5).one_to_one_hungarian(threshold=0.7)
 ```
 
 !!! tip "Details propagation"
diff --git a/examples/valentine_example_mixed.py b/examples/valentine_example_mixed.py
index 8104a6a..9b24ff7 100644
--- a/examples/valentine_example_mixed.py
+++ b/examples/valentine_example_mixed.py
@@ -37,7 +37,7 @@ def main():
         print(f"  {pair.source_column:>20s} <-> {pair.target_column:<20s}  {score:.4f}")
 
     print("\nOne-to-one matches:")
-    for pair, score in matches.one_to_one().items():
+    for pair, score in matches.one_to_one_hungarian().items():
         print(f"  {pair.source_column:>20s} <-> {pair.target_column:<20s}  {score:.4f}")
 
     # Evaluate against ground truth
diff --git a/examples/valentine_example_pandas.py b/examples/valentine_example_pandas.py
index 6ecec70..c3d93af 100644
--- a/examples/valentine_example_pandas.py
+++ b/examples/valentine_example_pandas.py
@@ -35,7 +35,7 @@ def main():
             print(f"  {'':>20s}      [{breakdown}]")
 
     print("\nGetting the one-to-one matches:")
-    pp.pprint(matches.one_to_one())
+    pp.pprint(matches.one_to_one_hungarian())
 
     # If ground truth available valentine could calculate the metrics
     ground_truth = [
diff --git a/examples/valentine_example_polars.py b/examples/valentine_example_polars.py
index 5512140..0a4f668 100644
--- a/examples/valentine_example_polars.py
+++ b/examples/valentine_example_polars.py
@@ -35,7 +35,7 @@ def main():
             print(f"  {'':>20s}      [{breakdown}]")
 
     print("\nGetting the one-to-one matches:")
-    pp.pprint(matches.one_to_one())
+    pp.pprint(matches.one_to_one_hungarian())
 
     # If ground truth available valentine could calculate the metrics
     ground_truth = [
diff --git a/experiments/bench.py b/experiments/bench.py
index 3cf9fed..292ed64 100644
--- a/experiments/bench.py
+++ b/experiments/bench.py
@@ -51,6 +51,7 @@
     JaccardDistanceMatcher,
     SimilarityFlooding,
 )
+from valentine.algorithms.jaccard_distance import StringDistanceFunction
 from valentine.metrics import F1Score, MeanReciprocalRank, RecallAtSizeofGroundTruth
 
 try:
@@ -67,7 +68,7 @@
 
 
 def _matcher_builders() -> list[tuple[str, MatcherFactory]]:
-    return [
+    builders: list[tuple[str, MatcherFactory]] = [
         ("Coma", Coma),
         ("Coma_Inst", lambda: Coma(use_instances=True)),
         ("Cupid", Cupid),
@@ -75,6 +76,46 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]:
         ("JaccardDistanceMatcher", JaccardDistanceMatcher),
         ("SimilarityFlooding", SimilarityFlooding),
     ]
+    # Only include the embedding variant when sentence-transformers is
+    # actually importable; otherwise the bench would crash on import.
+    try:
+        import sentence_transformers 
+
+        builders.append(
+            (
+                "JaccardDistanceMatcher_emb",
+                # embedding_device=None lets sentence-transformers / torch
+                # auto-pick: cuda > mps > cpu. So the bench transparently
+                # uses GPU on CUDA boxes and MPS on Apple Silicon without
+                # any config; CPU-only machines fall back automatically.
+                #
+                # embedding_batch_size is left unset, so the encode call
+                # uses sentence-transformers' library default (32). This
+                # is the out-of-the-box operating point. To trade memory
+                # for speed on capable hardware, pass an explicit larger
+                # value (e.g. embedding_batch_size=128 or 256): on the
+                # NYU full suite that drops total wall time from ~15s to
+                # ~11s on MPS without affecting accuracy.
+                #
+                # tversky_alpha=tversky_beta=1.0 reduces to Jaccard (the
+                # default; matches prior behaviour). Set both to 0.5 for
+                # Dice, or one to 0 to recover set containment — natural
+                # for subset/superset workloads (dataset discovery), but
+                # on the NYU bench it regressed mean F1 by ~12pp because
+                # asymmetric scoring inflates similarity for size-
+                # asymmetric pairs. match_weighting defaults to Binary
+                # (count-based intersection); switch to Margin to weight
+                # each value by its top1-vs-top2 confidence gap.
+                lambda: JaccardDistanceMatcher(
+                    distance_fun=StringDistanceFunction.Embedding,
+                    threshold_dist=0.7,
+                    embedding_device=None,
+                ),
+            )
+        )
+    except ImportError:
+        pass
+    return builders
 
 
 # ---------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
index db959c4..9adbe58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,9 @@ dev = [
 polars = [
     "polars>=1.0,<2.0",
 ]
+embeddings = [
+    "sentence-transformers>=2.0,<6.0",
+]
 docs = [
     # Pinned intentionally: Zensical is pre-1.0 and moves fast. Bump
     # deliberately rather than relying on a range, so a bad release
diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py
index 6f7117d..6fd4de4 100644
--- a/tests/test_coverage_gaps.py
+++ b/tests/test_coverage_gaps.py
@@ -87,13 +87,13 @@ def test_details_empty_when_none(self):
         assert bare.details == {}
         assert bare.get_details(next(iter(bare))) is None
 
-    def test_one_to_one_with_explicit_threshold(self):
-        result = self.results.one_to_one(threshold=0.7)
+    def test_one_to_one_greedy_with_explicit_threshold(self):
+        result = self.results.one_to_one_greedy(threshold=0.7)
         # Only entries >= 0.7 survive the explicit threshold path
         assert all(score >= 0.7 for score in result.values())
         assert len(result) == 3
 
-    def test_one_to_one_identical_scores(self):
+    def test_one_to_one_hungarian_identical_scores(self):
         # Less than two distinct values -> early return branch
         flat = MatcherResults(
             {
@@ -101,7 +101,7 @@ def test_one_to_one_identical_scores(self):
                 ColumnPair("s", "b", "t", "b"): 0.5,
             }
         )
-        assert len(flat.one_to_one()) == len(flat)
+        assert len(flat.one_to_one_hungarian()) == len(flat)
 
     def test_filter(self):
         result = self.results.filter(min_score=0.75)
diff --git a/tests/test_distribution_based_benchmark.py b/tests/test_distribution_based_benchmark.py
index afd175c..7076ad3 100644
--- a/tests/test_distribution_based_benchmark.py
+++ b/tests/test_distribution_based_benchmark.py
@@ -65,8 +65,9 @@ def test_synthetic_numeric_accuracy(self):
         metrics = matches.get_metrics(ground_truth, metrics={Precision(), Recall(), F1Score()})
 
         # Baseline: P=1.0, R=0.75, F1=0.857
-        # The algorithm correctly finds all 4 pairs (raw Recall=1.0), but one_to_one()
-        # post-processing may filter the weakest match below the median threshold.
+        # The algorithm correctly finds all 4 pairs (raw Recall=1.0), but the
+        # one_to_one_hungarian() post-processing may filter the weakest match
+        # below the median threshold.
         assert metrics["Precision"] >= 1.0, f"Precision dropped to {metrics['Precision']}"
         assert metrics["Recall"] >= 0.75, f"Recall dropped to {metrics['Recall']}"
         assert metrics["F1Score"] >= 0.85, f"F1Score dropped to {metrics['F1Score']}"
diff --git a/tests/test_docs_smoke.py b/tests/test_docs_smoke.py
index a22b933..0122bb3 100644
--- a/tests/test_docs_smoke.py
+++ b/tests/test_docs_smoke.py
@@ -130,7 +130,9 @@ def test_data_sources_exports():
 def test_matcher_results_documented_methods():
     """Every MatcherResults method referenced in the docs must exist."""
     for name in (
-        "one_to_one",
+        "one_to_one_hungarian",
+        "one_to_one_greedy",
+        "one_to_one_mutual_top",
         "filter",
         "take_top_n",
         "take_top_percent",
diff --git a/tests/test_matcher_results.py b/tests/test_matcher_results.py
index 674fc16..f58e46e 100644
--- a/tests/test_matcher_results.py
+++ b/tests/test_matcher_results.py
@@ -37,7 +37,7 @@ def test_get_metrics(self):
         metrics_specific = self.matches.get_metrics(self.ground_truth, metrics={Precision()})
         assert "Precision" in metrics_specific
 
-    def test_one_to_one(self):
+    def test_one_to_one_greedy(self):
         m = self.matches
         n = len(m)
         assert n > 0
@@ -56,8 +56,8 @@ def test_one_to_one(self):
 
         assert len(m) == 2 * n
 
-        m_one_to_one = m.one_to_one()
-        # one_to_one should remove duplicates, returning fewer entries
+        m_one_to_one = m.one_to_one_greedy()
+        # 1:1 should remove duplicates, returning fewer entries
         assert len(m_one_to_one) <= n
         assert len(m_one_to_one) < len(m)
 
@@ -65,16 +65,17 @@ def test_one_to_one(self):
         for pair in m_one_to_one:
             assert not pair.target_column.endswith("foo")
 
-        # Cache resets on new instance
+        # Cache resets on new instance — Hungarian is the cached default,
+        # not greedy. Verify the default-path cache lifecycle here.
         m_entry = MatcherResults(dict(m))
-        assert m_entry._cached_one_to_one is None
+        assert m_entry._cached_hungarian is None
 
         # Add a new entry with distinct columns
         ext2 = dict(m_entry)
         ext2[ColumnPair("extra_src", "BLA", "extra_tgt", "BLA")] = 0.7214057
         m_entry = MatcherResults(ext2)
 
-        m_entry_one_to_one = m_entry.one_to_one()
+        m_entry_one_to_one = m_entry.one_to_one_greedy()
         assert m_one_to_one != m_entry_one_to_one
 
         # All remaining values should be above the median
diff --git a/valentine/algorithms/jaccard_distance/__init__.py b/valentine/algorithms/jaccard_distance/__init__.py
index 2100b92..fc0d5c4 100644
--- a/valentine/algorithms/jaccard_distance/__init__.py
+++ b/valentine/algorithms/jaccard_distance/__init__.py
@@ -8,3 +8,6 @@ class StringDistanceFunction(Enum):
     JaroWinkler = auto()
     Hamming = auto()
     Exact = auto()
+    # Sentence-transformer embedding cosine similarity. Requires the
+    # ``sentence-transformers`` extra (``pip install valentine[embeddings]``).
+    Embedding = auto()
diff --git a/valentine/algorithms/jaccard_distance/jaccard_distance.py b/valentine/algorithms/jaccard_distance/jaccard_distance.py
index 890e0a8..09b5e3c 100644
--- a/valentine/algorithms/jaccard_distance/jaccard_distance.py
+++ b/valentine/algorithms/jaccard_distance/jaccard_distance.py
@@ -1,4 +1,5 @@
-from itertools import product
+from functools import lru_cache
+from itertools import combinations, product
 
 import numpy as np
 from rapidfuzz import process
@@ -27,6 +28,31 @@
 }
 
 
+@lru_cache(maxsize=4)
+def _load_sentence_transformer(model_name: str, device: str | None):
+    """Lazily load and cache a SentenceTransformer model on a device.
+
+    Importing inside the function keeps ``sentence-transformers`` an
+    optional dependency: the rest of this module — and every other
+    ``StringDistanceFunction`` value — works without it installed.
+
+    ``device`` is passed straight through to ``SentenceTransformer``.
+    ``None`` lets the library auto-pick (typically ``cuda`` if available,
+    else ``mps`` on Apple Silicon, else ``cpu``). Pass ``"cpu"``,
+    ``"cuda"``, ``"cuda:1"``, or ``"mps"`` to force a specific device.
+    The cache is keyed by ``(model_name, device)`` so switching devices
+    does not silently reuse a model loaded elsewhere.
+    """
+    try:
+        from sentence_transformers import SentenceTransformer
+    except ImportError as exc:  # pragma: no cover - depends on optional extra
+        raise ImportError(
+            "StringDistanceFunction.Embedding requires the 'sentence-transformers' "
+            "package. Install it with: pip install 'valentine[embeddings]'"
+        ) from exc
+    return SentenceTransformer(model_name, device=device)
+
+
 class JaccardDistanceMatcher(BaseMatcher):
     """Baseline instance-based matcher using Jaccard similarity.
 
@@ -40,21 +66,58 @@ class JaccardDistanceMatcher(BaseMatcher):
         Acceptance threshold above which two string values are considered
         equal under the chosen ``distance_fun``, in ``[0, 1]``
         (default: ``0.8``). Ignored when ``distance_fun`` is
-        :attr:`StringDistanceFunction.Exact`.
+        :attr:`StringDistanceFunction.Exact`. For
+        :attr:`StringDistanceFunction.Embedding`, the threshold is
+        applied to cosine similarity of sentence-transformer embeddings;
+        ~0.7 is a typical operating point.
     distance_fun : StringDistanceFunction, optional
         String similarity function. One of
         :attr:`StringDistanceFunction.Levenshtein` (default),
         :attr:`StringDistanceFunction.DamerauLevenshtein`,
         :attr:`StringDistanceFunction.Hamming`,
         :attr:`StringDistanceFunction.Jaro`,
-        :attr:`StringDistanceFunction.JaroWinkler`, or
-        :attr:`StringDistanceFunction.Exact`.
+        :attr:`StringDistanceFunction.JaroWinkler`,
+        :attr:`StringDistanceFunction.Exact`, or
+        :attr:`StringDistanceFunction.Embedding`.
     process_num : int, optional
         Number of worker threads passed to ``rapidfuzz.process.cdist``
         (must be ``>= 1``, default: ``1``). Earlier versions used a
         ``multiprocessing.Pool``; with rapidfuzz the inner kernel is
         already C++ and parallelises via OpenMP threads, so the pool is
         no longer needed.
+    embedding_model : str, optional
+        Name of the sentence-transformers model used when
+        ``distance_fun=StringDistanceFunction.Embedding`` (default:
+        ``"all-MiniLM-L6-v2"``, a ~23M-parameter / 384-dim model that runs
+        well on CPU). Ignored for non-embedding distances.
+    embedding_device : str or None, optional
+        Device to load the embedding model on. Passed straight through
+        to ``SentenceTransformer``. ``None`` (the default) lets the
+        library auto-detect — usually ``"cuda"`` if a GPU is present,
+        ``"mps"`` on Apple Silicon, otherwise ``"cpu"``. Pass
+        ``"cpu"``, ``"cuda"``, ``"cuda:1"``, or ``"mps"`` to force a
+        specific device. Ignored for non-embedding distances.
+    embedding_batch_size : int or None, optional
+        Batch size used for the global ``model.encode`` call. ``None``
+        (the default) does not pass the kwarg, letting
+        sentence-transformers use its own default (``32``). Pass an
+        explicit value (e.g. ``128`` or ``256``) to amortise per-call
+        overhead when encoding large vocabularies on capable hardware.
+        Ignored for non-embedding distances.
+    tversky_alpha : float, optional
+        Tversky penalty for unmatched values on the *reference* side
+        (default: ``1.0``). The pair-similarity reduction is
+        ``T(A, B; α, β) = |A∩B| / (|A∩B| + α·|A−B| + β·|B−A|)``,
+        symmetrised by computing both ``T(A, B)`` and ``T(B, A)`` and
+        taking the max so the matcher remains direction-agnostic. With
+        ``α = β = 1.0`` this reduces to Jaccard; with ``α = 1.0,
+        β = 0.0`` (or vice versa) it reduces to ``max(|∩|/|A|, |∩|/|B|)``,
+        i.e. set containment — the right choice when one column is
+        expected to be a subset of the other. Intermediate values trade
+        off between these extremes.
+    tversky_beta : float, optional
+        Tversky penalty for unmatched values on the *other* side
+        (default: ``1.0``). See ``tversky_alpha``.
     """
 
     def __init__(
@@ -62,10 +125,31 @@ def __init__(
         threshold_dist: float = 0.8,
         distance_fun: StringDistanceFunction = StringDistanceFunction.Levenshtein,
         process_num: int = 1,
+        embedding_model: str = "all-MiniLM-L6-v2",
+        embedding_device: str | None = None,
+        embedding_batch_size: int | None = None,
+        tversky_alpha: float = 1.0,
+        tversky_beta: float = 1.0,
     ):
         self.__threshold_dist = float(threshold_dist)
         self.__process_num = int(process_num)
         self.__distance_function = distance_fun
+        self.__embedding_model_name = str(embedding_model)
+        self.__embedding_device = embedding_device
+        if embedding_batch_size is not None and embedding_batch_size < 1:
+            raise ValueError(
+                f"embedding_batch_size must be >= 1 or None, got {embedding_batch_size}"
+            )
+        self.__embedding_batch_size = (
+            None if embedding_batch_size is None else int(embedding_batch_size)
+        )
+        if tversky_alpha < 0.0 or tversky_beta < 0.0:
+            raise ValueError(
+                f"tversky_alpha and tversky_beta must be >= 0, "
+                f"got alpha={tversky_alpha}, beta={tversky_beta}"
+            )
+        self.__tversky_alpha = float(tversky_alpha)
+        self.__tversky_beta = float(tversky_beta)
         if not 0.0 <= self.__threshold_dist <= 1.0:
             raise ValueError(
                 f"threshold_dist must be between 0.0 and 1.0, got {self.__threshold_dist}"
@@ -74,17 +158,105 @@ def __init__(
             raise ValueError(f"process_num must be >= 1, got {self.__process_num}")
 
     def get_matches(self, source_input: BaseTable, target_input: BaseTable) -> dict:
+        col_embeddings = self.__build_col_embeddings([source_input, target_input])
+        return self.__match_pair(source_input, target_input, col_embeddings)
+
+    def get_matches_batch(self, tables: list[BaseTable]) -> dict:
+        """Match all unique table pairs, sharing one global embedding pass.
+
+        For ``StringDistanceFunction.Embedding`` this means each unique
+        string across every column of every table is encoded exactly
+        once. With other distances the override is equivalent to the
+        default ``BaseMatcher.get_matches_batch``.
+        """
+        col_embeddings = self.__build_col_embeddings(tables)
+        matches: dict = {}
+        for t1, t2 in combinations(tables, 2):
+            matches.update(self.__match_pair(t1, t2, col_embeddings))
+        return matches
+
+    def __match_pair(
+        self,
+        source_input: BaseTable,
+        target_input: BaseTable,
+        col_embeddings: dict[tuple[str, str], tuple[list[str], np.ndarray]] | None,
+    ) -> dict:
         matches: dict = {}
         for combination in self.__get_column_combinations(
             source_input,
             target_input,
             self.__threshold_dist,
             self.__distance_function,
+            col_embeddings,
         ):
             matches.update(self.process_jaccard_distance(combination))
         # Remove the pairs with zero similarity
         return {k: v for k, v in matches.items() if v > 0.0}
 
+    def __build_col_embeddings(
+        self, tables: list[BaseTable]
+    ) -> dict[tuple[str, str], tuple[list[str], np.ndarray]] | None:
+        """Encode every column across every table with one batched call.
+
+        Returns a ``(table_name, column_name) -> (values, embeddings)``
+        map, or ``None`` when the chosen distance is not embedding-based.
+
+        Two layers of deduplication keep this cheap:
+
+        - **Per-column**: the column's value set is converted to a sorted
+          list of unique strings (deterministic for repeated runs).
+        - **Global vocabulary**: identical strings appearing in many
+          columns are encoded only once, then sliced back out.
+
+        The encode itself is a single ``model.encode`` call over the whole
+        vocabulary, which dominates the speedup over per-column encoding.
+        """
+        if self.__distance_function != StringDistanceFunction.Embedding:
+            return None
+
+        # Collect per-column unique values, deterministically ordered.
+        col_values: dict[tuple[str, str], list[str]] = {}
+        for table in tables:
+            for column in table.get_instances_columns():
+                key = (table.name, column.name)
+                if key in col_values:
+                    continue
+                col_values[key] = sorted({str(v) for v in column.data})
+
+        # Build a global vocabulary: each unique string is encoded once.
+        vocab: dict[str, int] = {}
+        for values in col_values.values():
+            for v in values:
+                if v not in vocab:
+                    vocab[v] = len(vocab)
+
+        if not vocab:
+            return {key: (values, np.zeros((0, 0), dtype=np.float32)) for key, values in col_values.items()}
+
+        model = _load_sentence_transformer(
+            self.__embedding_model_name, self.__embedding_device
+        )
+        encode_kwargs: dict = {
+            "normalize_embeddings": True,
+            "show_progress_bar": False,
+            "convert_to_numpy": True,
+        }
+        if self.__embedding_batch_size is not None:
+            encode_kwargs["batch_size"] = self.__embedding_batch_size
+        all_embeddings = model.encode(list(vocab.keys()), **encode_kwargs).astype(
+            np.float32
+        )
+
+        dim = all_embeddings.shape[1]
+        out: dict[tuple[str, str], tuple[list[str], np.ndarray]] = {}
+        for key, values in col_values.items():
+            if not values:
+                out[key] = (values, np.zeros((0, dim), dtype=np.float32))
+                continue
+            indices = [vocab[v] for v in values]
+            out[key] = (values, all_embeddings[indices])
+        return out
+
     def process_jaccard_distance(self, tup: tuple):
         (
             source_data,
@@ -95,8 +267,26 @@ def process_jaccard_distance(self, tup: tuple):
             source_table_name,
             source_column_name,
             distance_function,
+            embeddings,
         ) = tup
 
+        if distance_function == StringDistanceFunction.Embedding:
+            sim = self.__embedding_similarity(
+                embeddings,
+                source_table_name,
+                source_column_name,
+                target_table_name,
+                target_column_name,
+                threshold,
+            )
+            return Match(
+                target_table_name,
+                target_column_name,
+                source_table_name,
+                source_column_name,
+                sim,
+            ).to_dict
+
         set1 = {str(x) for x in source_data}
         set2 = {str(x) for x in target_data}
         # Iterate over the smaller set as queries: cdist scales with
@@ -105,9 +295,11 @@ def process_jaccard_distance(self, tup: tuple):
             set1, set2 = set2, set1
 
         if distance_function == StringDistanceFunction.Exact:
-            intersection_cnt = len(set1 & set2)
+            # Exact match is symmetric — both sides see the same intersection.
+            inter = len(set1 & set2)
+            a_match = b_match = float(inter)
         elif not set1 or not set2:
-            intersection_cnt = 0
+            a_match = b_match = 0.0
         else:
             scorer = _SCORER_MAP[distance_function]
             queries = list(set1)
@@ -119,14 +311,9 @@ def process_jaccard_distance(self, tup: tuple):
                 score_cutoff=threshold,
                 workers=self.__process_num,
             )
-            # Each query string in set1 contributes 1 to the intersection
-            # if at least one choice in set2 scores >= threshold. Scores
-            # below score_cutoff are returned as 0 by rapidfuzz, so the
-            # comparison is exact even when threshold == 0.
-            intersection_cnt = int(np.count_nonzero((scores >= threshold).any(axis=1)))
+            a_match, b_match = self.__directional_counts(scores, threshold)
 
-        union_cnt = len(set1) + len(set2) - intersection_cnt
-        sim = 0.0 if union_cnt == 0 else float(intersection_cnt) / union_cnt
+        sim = self.__aggregate(a_match, b_match, len(set1), len(set2))
 
         return Match(
             target_table_name,
@@ -136,12 +323,82 @@ def process_jaccard_distance(self, tup: tuple):
             sim,
         ).to_dict
 
+    def __embedding_similarity(
+        self,
+        embeddings: dict[tuple[str, str], tuple[list[str], np.ndarray]],
+        source_table_name: str,
+        source_column_name: str,
+        target_table_name: str,
+        target_column_name: str,
+        threshold: float,
+    ) -> float:
+        """Tversky-reduced set similarity using cosine on embeddings.
+
+        Two values are treated as "matched" when their cosine similarity
+        is ``>= threshold``. Both directional match counts come from the
+        same ``sims`` matrix and are reduced via Tversky.
+        """
+        src_values, src_emb = embeddings[(source_table_name, source_column_name)]
+        tgt_values, tgt_emb = embeddings[(target_table_name, target_column_name)]
+        if not src_values or not tgt_values:
+            return 0.0
+        # Iterate over the smaller side, matching the rapidfuzz branch.
+        if len(src_values) > len(tgt_values):
+            src_values, tgt_values = tgt_values, src_values
+            src_emb, tgt_emb = tgt_emb, src_emb
+        # Embeddings are L2-normalised at encode-time, so cosine = dot product.
+        sims = src_emb @ tgt_emb.T
+        a_match, b_match = self.__directional_counts(sims, threshold)
+        return self.__aggregate(a_match, b_match, len(src_values), len(tgt_values))
+
+    @staticmethod
+    def __directional_counts(
+        scores: np.ndarray, threshold: float
+    ) -> tuple[float, float]:
+        """Count rows / columns whose best entry is at least ``threshold``.
+
+        ``scores[i, j]`` is the similarity between A's i-th value and B's
+        j-th value. The first value is the count of A-side values with at
+        least one above-threshold partner; the second is the count on B's
+        side.
+        """
+        hits = scores >= threshold
+        return (
+            float(np.count_nonzero(hits.any(axis=1))),
+            float(np.count_nonzero(hits.any(axis=0))),
+        )
+
+    def __aggregate(
+        self, a_match: float, b_match: float, a_size: int, b_size: int
+    ) -> float:
+        """Reduce directional match counts to a similarity score via Tversky.
+
+        Uses the asymmetric Tversky index in both directions and returns
+        the larger of the two so the matcher stays direction-agnostic:
+
+            T(A, B; α, β) = a_match / (a_match + α·(|A|−a_match) + β·(|B|−b_match))
+
+        With α = β = 1 this is Jaccard; α = 1, β = 0 (or vice versa)
+        recovers ``max(|∩|/|A|, |∩|/|B|)`` containment.
+        """
+        if a_size == 0 or b_size == 0:
+            return 0.0
+        alpha, beta = self.__tversky_alpha, self.__tversky_beta
+        a_unmatched = max(a_size - a_match, 0.0)
+        b_unmatched = max(b_size - b_match, 0.0)
+        denom_ab = a_match + alpha * a_unmatched + beta * b_unmatched
+        denom_ba = b_match + alpha * b_unmatched + beta * a_unmatched
+        t_ab = 0.0 if denom_ab <= 0.0 else a_match / denom_ab
+        t_ba = 0.0 if denom_ba <= 0.0 else b_match / denom_ba
+        return float(max(t_ab, t_ba))
+
     @staticmethod
     def __get_column_combinations(
         source_table: BaseTable,
         target_table: BaseTable,
         threshold,
         distance_function: StringDistanceFunction,
+        col_embeddings: dict[tuple[str, str], tuple[list[str], np.ndarray]] | None,
     ):
         for source_column, target_column in product(
             source_table.get_instances_columns(), target_table.get_instances_columns()
@@ -155,4 +412,5 @@ def __get_column_combinations(
                 source_table.name,
                 source_column.name,
                 distance_function,
+                col_embeddings,
             )
diff --git a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py
index 928a3de..491a3ae 100644
--- a/valentine/algorithms/matcher_results.py
+++ b/valentine/algorithms/matcher_results.py
@@ -37,7 +37,10 @@ def __init__(
         sorted_matches = dict(sorted(matches.items(), key=lambda x: x[1], reverse=True))
         self._data: dict[ColumnPair, float] = sorted_matches
         self._details: dict[ColumnPair, dict[str, float]] = details or {}
-        self._cached_one_to_one: MatcherResults | None = None
+        # Cached default 1:1 selection (Hungarian, since it is the default
+        # filter used by Precision / Recall / F1Score). Greedy and mutual
+        # variants are niche and not cached.
+        self._cached_hungarian: MatcherResults | None = None
 
     # -- Mapping protocol --------------------------------------------------
 
@@ -87,34 +90,118 @@ def get_details(self, key: ColumnPair) -> dict[str, float] | None:
 
     # -- Transformations ---------------------------------------------------
 
-    def one_to_one(self, threshold: float | None = None) -> MatcherResults:
-        """Filter to one-to-one column matches.
+    def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults:
+        """Globally optimal 1:1 column matching via Hungarian assignment.
 
-        Starting from the highest-scoring pair, greedily assigns each source
-        and target column at most one match. Pairs below ``threshold`` are
-        discarded. When ``threshold`` is ``None`` (the default), the median
-        similarity score is used.
+        This is the **default** 1:1 selector — it is what
+        :class:`Precision` / :class:`Recall` / :class:`F1Score` call when
+        their ``one_to_one`` flag is set. Each source and target appears
+        in at most one returned pair, with the assignment chosen to
+        maximise **total** similarity over all valid one-to-one
+        assignments. Cost is O(n³) on column counts via
+        ``scipy.optimize.linear_sum_assignment`` — negligible for
+        typical schema sizes — and the total similarity it selects is
+        never lower than the greedy variant's (before thresholding).
 
         Parameters
         ----------
         threshold : float | None
-            Minimum similarity to keep. If None, uses the median score.
+            Minimum similarity to keep. If ``None``, uses the median
+            similarity score.
 
         Returns
         -------
         MatcherResults
-            A new instance with one-to-one matches only.
+            A new instance with the Hungarian-optimal one-to-one
+            assignment, post-thresholding.
         """
-        if threshold is None and self._cached_one_to_one is not None:
-            return self._cached_one_to_one
+        if threshold is None and self._cached_hungarian is not None:
+            return self._cached_hungarian
+        if not self._data:
+            result = MatcherResults({})
+            if threshold is None:
+                self._cached_hungarian = result
+            return result
 
-        set_match_values = set(self._data.values())
+        # Stable index of unique sources and targets.
+        sources: list[tuple[str, str]] = []
+        source_idx: dict[tuple[str, str], int] = {}
+        targets: list[tuple[str, str]] = []
+        target_idx: dict[tuple[str, str], int] = {}
+        for cp in self._data:
+            if cp.source not in source_idx:
+                source_idx[cp.source] = len(sources)
+                sources.append(cp.source)
+            if cp.target not in target_idx:
+                target_idx[cp.target] = len(targets)
+                targets.append(cp.target)
+
+        m, n = len(sources), len(targets)
+        sim = [[0.0] * n for _ in range(m)]
+        pair_lookup: dict[tuple, ColumnPair] = {}
+        for cp, score in self._data.items():
+            sim[source_idx[cp.source]][target_idx[cp.target]] = score
+            pair_lookup[(cp.source, cp.target)] = cp
+
+        # Hungarian minimises cost; we want max similarity.
+        from scipy.optimize import linear_sum_assignment
+
+        cost = [[-s for s in row] for row in sim]
+        row_ind, col_ind = linear_sum_assignment(cost)
 
+        set_match_values = set(self._data.values())
         if len(set_match_values) < 2:
             result = MatcherResults(dict(self._data), details=dict(self._details))
             if threshold is None:
-                self._cached_one_to_one = result
+                self._cached_hungarian = result
             return result
+        if threshold is None:
+            min_sim = sorted(set_match_values, reverse=True)[math.ceil(len(set_match_values) / 2)]
+        else:
+            min_sim = threshold
+
+        selected: dict[ColumnPair, float] = {}
+        for r, c in zip(row_ind, col_ind, strict=False):
+            cp = pair_lookup.get((sources[r], targets[c]))
+            if cp is None:
+                continue  # no actual pair at this (s, t)
+            score = self._data[cp]
+            if score >= min_sim:
+                selected[cp] = score
+
+        filtered_details = {k: v for k, v in self._details.items() if k in selected}
+        result = MatcherResults(selected, details=filtered_details)
+        if threshold is None:
+            self._cached_hungarian = result
+        return result
+
+    def one_to_one_greedy(self, threshold: float | None = None) -> MatcherResults:
+        """Greedy 1:1 column matching, kept for backwards compatibility.
+
+        Starting from the highest-scoring pair, greedily assigns each
+        source and target column at most one match. Pairs below
+        ``threshold`` are discarded. When ``threshold`` is ``None`` (the
+        default), the median similarity score is used.
+
+        Greedy can lock in a locally-best pair that blocks a better
+        globally-optimal assignment, so :meth:`one_to_one_hungarian` is
+        the recommended default; this method is exposed for
+        compatibility and for test pinning.
+
+        Parameters
+        ----------
+        threshold : float | None
+            Minimum similarity to keep. If ``None``, uses the median score.
+
+        Returns
+        -------
+        MatcherResults
+            A new instance with the greedy 1:1 assignment.
+        """
+        set_match_values = set(self._data.values())
+
+        if len(set_match_values) < 2:
+            return MatcherResults(dict(self._data), details=dict(self._details))
 
         matched: dict[tuple[str, str], bool] = {}
         for key in self._data:
@@ -137,10 +224,55 @@ def one_to_one(self, threshold: float | None = None) -> MatcherResults:
                     break
 
         filtered_details = {k: v for k, v in self._details.items() if k in matches1to1}
-        result = MatcherResults(matches1to1, details=filtered_details)
-        if threshold is None:
-            self._cached_one_to_one = result
-        return result
+        return MatcherResults(matches1to1, details=filtered_details)
+
+    def one_to_one_mutual_top(self, n: int = 1) -> MatcherResults:
+        """Keep pairs where each side ranks the other in its top *n*.
+
+        Pair ``(s, t)`` survives iff ``t`` is among ``s``'s ``n`` highest-
+        scoring targets AND ``s`` is among ``t``'s ``n`` highest-scoring
+        sources. With ``n=1`` this is the classic mutual nearest-
+        neighbour filter — high-precision, drops one-sided affinities.
+        Typically more conservative than :meth:`one_to_one_hungarian`: only
+        mutually-confirmed pairs survive, even at the cost of recall.
+
+        Parameters
+        ----------
+        n : int
+            Top-n cutoff per side (default 1 = mutual nearest neighbour).
+
+        Returns
+        -------
+        MatcherResults
+            A new instance with only the mutually-confirmed pairs.
+        """
+        if n < 1:
+            raise ValueError(f"n must be >= 1, got {n}")
+        if not self._data:
+            return MatcherResults({})
+
+        by_source: dict[tuple[str, str], list[tuple[float, tuple[str, str]]]] = {}
+        by_target: dict[tuple[str, str], list[tuple[float, tuple[str, str]]]] = {}
+        for cp, score in self._data.items():
+            by_source.setdefault(cp.source, []).append((score, cp.target))
+            by_target.setdefault(cp.target, []).append((score, cp.source))
+
+        src_top: dict[tuple[str, str], set] = {}
+        for s, lst in by_source.items():
+            lst.sort(reverse=True)
+            src_top[s] = {t for _, t in lst[:n]}
+        tgt_top: dict[tuple[str, str], set] = {}
+        for t, lst in by_target.items():
+            lst.sort(reverse=True)
+            tgt_top[t] = {s for _, s in lst[:n]}
+
+        selected: dict[ColumnPair, float] = {}
+        for cp, score in self._data.items():
+            if cp.target in src_top.get(cp.source, set()) and cp.source in tgt_top.get(cp.target, set()):
+                selected[cp] = score
+
+        filtered_details = {k: v for k, v in self._details.items() if k in selected}
+        return MatcherResults(selected, details=filtered_details)
 
     def filter(self, min_score: float) -> MatcherResults:
         """Filter matches by minimum similarity score.
@@ -223,6 +355,7 @@ def get_metrics(
         self,
         ground_truth: list[tuple[str, str]] | list[ColumnPair],
         metrics: set[Metric] = METRICS_CORE,
+        one_to_one_method: str = "hungarian",
     ) -> dict[str, Any]:
         """Compute evaluation metrics against a ground truth.
 
@@ -235,6 +368,10 @@ def get_metrics(
             comparison.
         metrics : set[Metric], optional
             Set of metric instances to compute (default: ``METRICS_CORE``).
+        one_to_one_method : {"greedy", "hungarian", "mutual_top"}
+            Selection algorithm passed to each metric's ``apply`` method
+            for use when the metric's ``one_to_one`` flag is ``True``
+            (default: ``"hungarian"``).
 
         Returns
         -------
@@ -243,7 +380,7 @@ def get_metrics(
         """
         res: dict[str, Any] = {}
         for metric in metrics:
-            res.update(metric.apply(self, ground_truth))
+            res.update(metric.apply(self, ground_truth, one_to_one_method=one_to_one_method))
         return res
 
     # -- Copies ------------------------------------------------------------
diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py
index 22caa54..25a5946 100644
--- a/valentine/metrics/base_metric.py
+++ b/valentine/metrics/base_metric.py
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 if TYPE_CHECKING:
     from ..algorithms.matcher_results import MatcherResults
@@ -12,6 +12,10 @@
 from dataclasses import dataclass
 from typing import Any, final
 
+# Valid choices for the 1:1 selection algorithm used by Precision /
+# Recall / F1Score / PrecisionTopNPercent when ``one_to_one=True``.
+OneToOneMethod = Literal["greedy", "hungarian", "mutual_top"]
+
 
 @dataclass(eq=True, frozen=True)
 class Metric(ABC):
@@ -25,6 +29,7 @@ def apply(
         self: Metric,
         matches: MatcherResults,
         ground_truth: list[tuple[str, str]] | list,
+        one_to_one_method: OneToOneMethod = "hungarian",
     ) -> dict[str, Any]:
         """Apply the metric to a ``MatcherResults`` instance, given ground truth.
 
@@ -37,6 +42,13 @@ def apply(
             ``[("src_col", "tgt_col"), ...]`` (table names ignored during
             comparison) or full :class:`~valentine.algorithms.ColumnPair`
             instances for table-aware comparison.
+        one_to_one_method : {"greedy", "hungarian", "mutual_top"}
+            Selection algorithm used when the metric's ``one_to_one``
+            field is ``True``. Defaults to ``"hungarian"`` (globally
+            optimal). ``"greedy"`` matches the legacy behaviour;
+            ``"mutual_top"`` keeps only mutually-confirmed pairs (top-1
+            on each side).  Ignored when ``one_to_one`` is ``False`` or
+            for metrics that do not apply 1:1 filtering.
         """
         pass
 
diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py
index 93a33d4..e7342c1 100644
--- a/valentine/metrics/metric_helpers.py
+++ b/valentine/metrics/metric_helpers.py
@@ -5,6 +5,26 @@
 if TYPE_CHECKING:
     from ..algorithms.match import ColumnPair
     from ..algorithms.matcher_results import MatcherResults
+    from .base_metric import OneToOneMethod
+
+
+def _apply_one_to_one(matches: MatcherResults, method: OneToOneMethod) -> MatcherResults:
+    """Dispatch ``matches`` through the requested 1:1 selection algorithm.
+
+    ``method`` is one of ``"greedy"``, ``"hungarian"``, or ``"mutual_top"``;
+    invalid values raise ``ValueError``. Mutual-top defaults to ``n=1``
+    (mutual nearest neighbour).
+    """
+    if method == "hungarian":
+        return matches.one_to_one_hungarian()
+    if method == "greedy":
+        return matches.one_to_one_greedy()
+    if method == "mutual_top":
+        return matches.one_to_one_mutual_top()
+    raise ValueError(
+        f"Unknown one_to_one_method: {method!r}; "
+        "expected 'greedy', 'hungarian', or 'mutual_top'"
+    )
 
 
 def _normalize_ground_truth(
diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py
index 33240e1..9d8f8e6 100644
--- a/valentine/metrics/metrics.py
+++ b/valentine/metrics/metrics.py
@@ -10,8 +10,14 @@
 from dataclasses import dataclass
 from typing import Any
 
-from .base_metric import Metric
-from .metric_helpers import _matches_as_tuples, _normalize_ground_truth, get_fp, get_tp_fn
+from .base_metric import Metric, OneToOneMethod
+from .metric_helpers import (
+    _apply_one_to_one,
+    _matches_as_tuples,
+    _normalize_ground_truth,
+    get_fp,
+    get_tp_fn,
+)
 
 # Public exports
 __all__ = [
@@ -45,9 +51,14 @@ class Precision(Metric):
 
     one_to_one: bool = True
 
-    def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]:
+    def apply(
+        self,
+        matches: Any,
+        ground_truth: GroundTruth,
+        one_to_one_method: OneToOneMethod = "hungarian",
+    ) -> dict[str, float]:
         if self.one_to_one:
-            matches = matches.one_to_one()
+            matches = _apply_one_to_one(matches, one_to_one_method)
 
         tp, _ = get_tp_fn(matches, ground_truth)
         fp = get_fp(matches, ground_truth)
@@ -67,9 +78,14 @@ class Recall(Metric):
 
     one_to_one: bool = True
 
-    def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]:
+    def apply(
+        self,
+        matches: Any,
+        ground_truth: GroundTruth,
+        one_to_one_method: OneToOneMethod = "hungarian",
+    ) -> dict[str, float]:
         if self.one_to_one:
-            matches = matches.one_to_one()
+            matches = _apply_one_to_one(matches, one_to_one_method)
 
         tp, fn = get_tp_fn(matches, ground_truth)
         recall = _safe_div(tp, tp + fn)
@@ -88,9 +104,14 @@ class F1Score(Metric):
 
     one_to_one: bool = True
 
-    def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]:
+    def apply(
+        self,
+        matches: Any,
+        ground_truth: GroundTruth,
+        one_to_one_method: OneToOneMethod = "hungarian",
+    ) -> dict[str, float]:
         if self.one_to_one:
-            matches = matches.one_to_one()
+            matches = _apply_one_to_one(matches, one_to_one_method)
 
         tp, fn = get_tp_fn(matches, ground_truth)
         fp = get_fp(matches, ground_truth)
@@ -120,9 +141,14 @@ def name(self) -> str:
         # Replace the 'N' in the base name with the chosen percent, e.g. "PrecisionTop70Percent".
         return super().name().replace("N", str(self.n))
 
-    def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]:
+    def apply(
+        self,
+        matches: Any,
+        ground_truth: GroundTruth,
+        one_to_one_method: OneToOneMethod = "hungarian",
+    ) -> dict[str, float]:
         if self.one_to_one:
-            matches = matches.one_to_one()
+            matches = _apply_one_to_one(matches, one_to_one_method)
 
         # Clamp N to a sensible range without mutating the dataclass.
         n_clamped = min(100, max(0, int(self.n)))
@@ -150,9 +176,14 @@ class RecallAtSizeofGroundTruth(Metric):
 
     one_to_one: bool = False
 
-    def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]:
+    def apply(
+        self,
+        matches: Any,
+        ground_truth: GroundTruth,
+        one_to_one_method: OneToOneMethod = "hungarian",
+    ) -> dict[str, float]:
         if self.one_to_one:
-            matches = matches.one_to_one()
+            matches = _apply_one_to_one(matches, one_to_one_method)
         n_matches = matches.take_top_n(len(ground_truth))
         tp, fn = get_tp_fn(n_matches, ground_truth)
         recall = _safe_div(tp, tp + fn)
@@ -176,9 +207,14 @@ class MeanReciprocalRank(Metric):
 
     one_to_one: bool = False
 
-    def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]:
+    def apply(
+        self,
+        matches: Any,
+        ground_truth: GroundTruth,
+        one_to_one_method: OneToOneMethod = "hungarian",
+    ) -> dict[str, float]:
         if self.one_to_one:
-            matches = matches.one_to_one()
+            matches = _apply_one_to_one(matches, one_to_one_method)
 
         gt_pairs, table_aware = _normalize_ground_truth(ground_truth)
         ranked = _matches_as_tuples(matches, table_aware)

From bad9fc843aa40c2c2150f59961775d0bbdf7ecbc Mon Sep 17 00:00:00 2001
From: Christos Koutras <koutras21@gmail.com>
Date: Mon, 4 May 2026 09:50:00 -0400
Subject: [PATCH 02/13] update readme

---
 README.md | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index d33087d..f472671 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,12 @@ To enable **Polars** support, install the optional extra:
 pip install valentine[polars]
 ```
 
+To enable the **sentence-transformer embedding** distance for `JaccardDistanceMatcher` (see below), install:
+
+```shell
+pip install valentine[embeddings]
+```
+
 
 ## Usage
 Valentine can be used to find matches among columns of a given pair of pandas or Polars DataFrames. You can even mix pandas and Polars frames in the same call — Valentine auto-detects the frame type.
@@ -89,17 +95,21 @@ In order to do so, the user can choose one of the following matching methods:
           *    **threshold1**(*float*) - The threshold for phase 1 of the method, default is 0.15.
           *    **threshold2**(*float*) - The threshold for phase 2 of the method, default is 0.15.
 
-4.   `JaccardDistanceMatcher(float: threshold_dist)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, optionally enhanced by a string similarity measure of choice.
+4.   `JaccardDistanceMatcher(...)` is a baseline method that scores column pairs by **Tversky** similarity over their value sets (Jaccard by default). Element equality between values can be decided by a configurable string distance function, including a sentence-transformer **embedding** option for semantic matching.
      *    **Parameters**:
-          *    **threshold_dist**(*float*) - Acceptance threshold for assessing two strings as equal, default is 0.8.
-
-          *    **distance_fun**(*StringDistanceFunction*) - String similarity function used to assess whether two strings are equal. The enumeration class type `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. Functions currently supported are:
-   		       * `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+          *    **threshold_dist**(*float*) - Acceptance threshold: two values are considered equal when their similarity under the chosen `distance_fun` is at or above it; default is 0.8. For embeddings, ~0.7 is a typical operating point.
+          *    **distance_fun**(*StringDistanceFunction*) - Per-value similarity function. The enumeration class `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. Functions currently supported are:
+   		       * `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) (default)
                * `StringDistanceFunction.DamerauLevenshtein`: [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
                * `StringDistanceFunction.Hamming`: [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
                * `StringDistanceFunction.Jaro`: [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
                * `StringDistanceFunction.JaroWinkler`: [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
-              * `StringDistanceFunction.Exact`: String equality `==`
+               * `StringDistanceFunction.Exact`: String equality `==`
+               * `StringDistanceFunction.Embedding`: cosine similarity on sentence-transformer embeddings (requires the `valentine[embeddings]` extra)
+          *    **tversky_alpha**(*float*) / **tversky_beta**(*float*) - Tversky penalty weights for unmatched values on each side (defaults `1.0`, `1.0`). Defaults give Jaccard; `0.5, 0.5` gives Sørensen-Dice; `1.0, 0.0` (or vice versa) gives set containment — useful when one column is expected to be a subset of the other.
+          *    **embedding_model**(*str*) - Sentence-transformers model name when `distance_fun=Embedding` (default `"all-MiniLM-L6-v2"`).
+          *    **embedding_device**(*str* or *None*) - Device override (`"cpu"`, `"cuda"`, `"mps"`). `None` (default) auto-picks: cuda → mps → cpu.
+          *    **embedding_batch_size**(*int* or *None*) - Encode batch size; `None` uses the sentence-transformers default (32). Larger values amortise per-call overhead on capable hardware.
 
 5.   `SimilarityFlooding(Policy: coeff_policy, Formula: formula, StringMatcher: string_matcher)` is the python implementation of the paper [Similarity Flooding: A Versatile Graph Matching Algorithmand its Application to Schema Matching](https://ieeexplore.ieee.org/document/994702)
      * **Parameters**:
@@ -137,11 +147,17 @@ for pair, score in matches.items():
 ```python
 top_n_matches = matches.take_top_n(5)
 top_n_percent_matches = matches.take_top_percent(25)
-one_to_one_matches = matches.one_to_one_hungarian()
 high_confidence = matches.filter(min_score=0.7)
+
+# One-to-one selectors — three flavours, pick the one that fits your task:
+one_to_one_matches = matches.one_to_one_hungarian()           # globally optimal (default)
 one_to_one_strict = matches.one_to_one_hungarian(threshold=0.5)
+greedy_legacy = matches.one_to_one_greedy()                   # legacy greedy assignment
+mutual_only = matches.one_to_one_mutual_top(n=1)              # mutual nearest neighbour
 ```
 
+`one_to_one_hungarian` (Hungarian assignment via `scipy.optimize.linear_sum_assignment`) is the recommended default and is what `Precision` / `Recall` / `F1Score` apply when their `one_to_one` flag is set. `one_to_one_greedy` preserves the legacy greedy behaviour for backwards compatibility. `one_to_one_mutual_top(n)` keeps a pair only when each side ranks the other in its top-`n` — a high-precision filter that drops one-sided affinities.
+
 ### Match details (Coma)
 
 When using the Coma matcher, per-sub-matcher score breakdowns are available via `.details`:
@@ -175,6 +191,15 @@ metrics_custom = matches.get_metrics(ground_truth, metrics={F1Score(one_to_one=F
 metrics_predefined_set = matches.get_metrics(ground_truth, metrics=METRICS_PRECISION_INCREASING_N)
 ```
 
+The 1:1 selection algorithm used when a metric's `one_to_one` flag is `True` can be overridden per call (default `"hungarian"`):
+
+```python
+metrics_strict   = matches.get_metrics(ground_truth, metrics={F1Score()}, one_to_one_method="mutual_top")
+metrics_legacy   = matches.get_metrics(ground_truth, metrics={F1Score()}, one_to_one_method="greedy")
+```
+
+Valid values are `"hungarian"` (default), `"greedy"`, and `"mutual_top"`. Metrics whose `one_to_one` flag is `False` (e.g. `MeanReciprocalRank`, `RecallAtSizeofGroundTruth`) ignore the argument.
+
 
 ### Example
 The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about job candidates, and then 2) how to assess its effectiveness based on a given ground truth. More examples are available in the [`examples/`](https://github.com/delftdata/valentine/tree/master/examples) directory, including a [pandas example](https://github.com/delftdata/valentine/blob/master/examples/valentine_example_pandas.py), a [Polars example](https://github.com/delftdata/valentine/blob/master/examples/valentine_example_polars.py), and a [mixed pandas+Polars example](https://github.com/delftdata/valentine/blob/master/examples/valentine_example_mixed.py).

From e12b22d2d0e7b681d77f9cc0467d1e9704ecda60 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 12:02:40 +0200
Subject: [PATCH 03/13] apply linter rules

---
 experiments/bench.py                          |  7 ++---
 .../jaccard_distance/jaccard_distance.py      | 31 ++++++++-----------
 valentine/algorithms/matcher_results.py       | 14 +++++----
 valentine/metrics/metric_helpers.py           |  3 +-
 4 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/experiments/bench.py b/experiments/bench.py
index 292ed64..c409b42 100644
--- a/experiments/bench.py
+++ b/experiments/bench.py
@@ -26,6 +26,7 @@
 from __future__ import annotations
 
 import argparse
+import importlib.util
 import json
 import statistics
 import sys
@@ -78,9 +79,7 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]:
     ]
     # Only include the embedding variant when sentence-transformers is
     # actually importable; otherwise the bench would crash on import.
-    try:
-        import sentence_transformers 
-
+    if importlib.util.find_spec("sentence_transformers") is not None:
         builders.append(
             (
                 "JaccardDistanceMatcher_emb",
@@ -113,8 +112,6 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]:
                 ),
             )
         )
-    except ImportError:
-        pass
     return builders
 
 
diff --git a/valentine/algorithms/jaccard_distance/jaccard_distance.py b/valentine/algorithms/jaccard_distance/jaccard_distance.py
index 09b5e3c..d60dc3a 100644
--- a/valentine/algorithms/jaccard_distance/jaccard_distance.py
+++ b/valentine/algorithms/jaccard_distance/jaccard_distance.py
@@ -44,7 +44,7 @@ def _load_sentence_transformer(model_name: str, device: str | None):
     does not silently reuse a model loaded elsewhere.
     """
     try:
-        from sentence_transformers import SentenceTransformer
+        from sentence_transformers import SentenceTransformer  # noqa: PLC0415
     except ImportError as exc:  # pragma: no cover - depends on optional extra
         raise ImportError(
             "StringDistanceFunction.Embedding requires the 'sentence-transformers' "
@@ -107,10 +107,10 @@ class JaccardDistanceMatcher(BaseMatcher):
     tversky_alpha : float, optional
         Tversky penalty for unmatched values on the *reference* side
         (default: ``1.0``). The pair-similarity reduction is
-        ``T(A, B; α, β) = |A∩B| / (|A∩B| + α·|A−B| + β·|B−A|)``,
+        ``T(A, B; a, b) = |A∩B| / (|A∩B| + a·|A-B| + b·|B-A|)``,
         symmetrised by computing both ``T(A, B)`` and ``T(B, A)`` and
         taking the max so the matcher remains direction-agnostic. With
-        ``α = β = 1.0`` this reduces to Jaccard; with ``α = 1.0,
+        ``a = b = 1.0`` this reduces to Jaccard; with ``a = 1.0,
         β = 0.0`` (or vice versa) it reduces to ``max(|∩|/|A|, |∩|/|B|)``,
         i.e. set containment — the right choice when one column is
         expected to be a subset of the other. Intermediate values trade
@@ -231,11 +231,12 @@ def __build_col_embeddings(
                     vocab[v] = len(vocab)
 
         if not vocab:
-            return {key: (values, np.zeros((0, 0), dtype=np.float32)) for key, values in col_values.items()}
+            return {
+                key: (values, np.zeros((0, 0), dtype=np.float32))
+                for key, values in col_values.items()
+            }
 
-        model = _load_sentence_transformer(
-            self.__embedding_model_name, self.__embedding_device
-        )
+        model = _load_sentence_transformer(self.__embedding_model_name, self.__embedding_device)
         encode_kwargs: dict = {
             "normalize_embeddings": True,
             "show_progress_bar": False,
@@ -243,9 +244,7 @@ def __build_col_embeddings(
         }
         if self.__embedding_batch_size is not None:
             encode_kwargs["batch_size"] = self.__embedding_batch_size
-        all_embeddings = model.encode(list(vocab.keys()), **encode_kwargs).astype(
-            np.float32
-        )
+        all_embeddings = model.encode(list(vocab.keys()), **encode_kwargs).astype(np.float32)
 
         dim = all_embeddings.shape[1]
         out: dict[tuple[str, str], tuple[list[str], np.ndarray]] = {}
@@ -352,9 +351,7 @@ def __embedding_similarity(
         return self.__aggregate(a_match, b_match, len(src_values), len(tgt_values))
 
     @staticmethod
-    def __directional_counts(
-        scores: np.ndarray, threshold: float
-    ) -> tuple[float, float]:
+    def __directional_counts(scores: np.ndarray, threshold: float) -> tuple[float, float]:
         """Count rows / columns whose best entry is at least ``threshold``.
 
         ``scores[i, j]`` is the similarity between A's i-th value and B's
@@ -368,17 +365,15 @@ def __directional_counts(
             float(np.count_nonzero(hits.any(axis=0))),
         )
 
-    def __aggregate(
-        self, a_match: float, b_match: float, a_size: int, b_size: int
-    ) -> float:
+    def __aggregate(self, a_match: float, b_match: float, a_size: int, b_size: int) -> float:
         """Reduce directional match counts to a similarity score via Tversky.
 
         Uses the asymmetric Tversky index in both directions and returns
         the larger of the two so the matcher stays direction-agnostic:
 
-            T(A, B; α, β) = a_match / (a_match + α·(|A|−a_match) + β·(|B|−b_match))
+            T(A, B; a, b) = a_match / (a_match + a·(|A|-a_match) + b·(|B|-b_match))
 
-        With α = β = 1 this is Jaccard; α = 1, β = 0 (or vice versa)
+        With a = b = 1 this is Jaccard; a = 1, b = 0 (or vice versa)
         recovers ``max(|∩|/|A|, |∩|/|B|)`` containment.
         """
         if a_size == 0 or b_size == 0:
diff --git a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py
index 491a3ae..9f6efaf 100644
--- a/valentine/algorithms/matcher_results.py
+++ b/valentine/algorithms/matcher_results.py
@@ -90,7 +90,7 @@ def get_details(self, key: ColumnPair) -> dict[str, float] | None:
 
     # -- Transformations ---------------------------------------------------
 
-    def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults:
+    def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults:  # noqa: PLR0912
         """Globally optimal 1:1 column matching via Hungarian assignment.
 
         This is the **default** 1:1 selector — it is what
@@ -144,7 +144,7 @@ def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults
             pair_lookup[(cp.source, cp.target)] = cp
 
         # Hungarian minimises cost; we want max similarity.
-        from scipy.optimize import linear_sum_assignment
+        from scipy.optimize import linear_sum_assignment  # noqa: PLC0415
 
         cost = [[-s for s in row] for row in sim]
         row_ind, col_ind = linear_sum_assignment(cost)
@@ -266,10 +266,12 @@ def one_to_one_mutual_top(self, n: int = 1) -> MatcherResults:
             lst.sort(reverse=True)
             tgt_top[t] = {s for _, s in lst[:n]}
 
-        selected: dict[ColumnPair, float] = {}
-        for cp, score in self._data.items():
-            if cp.target in src_top.get(cp.source, set()) and cp.source in tgt_top.get(cp.target, set()):
-                selected[cp] = score
+        selected: dict[ColumnPair, float] = {
+            cp: score
+            for cp, score in self._data.items()
+            if cp.target in src_top.get(cp.source, set())
+            and cp.source in tgt_top.get(cp.target, set())
+        }
 
         filtered_details = {k: v for k, v in self._details.items() if k in selected}
         return MatcherResults(selected, details=filtered_details)
diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py
index e7342c1..d47046a 100644
--- a/valentine/metrics/metric_helpers.py
+++ b/valentine/metrics/metric_helpers.py
@@ -22,8 +22,7 @@ def _apply_one_to_one(matches: MatcherResults, method: OneToOneMethod) -> Matche
     if method == "mutual_top":
         return matches.one_to_one_mutual_top()
     raise ValueError(
-        f"Unknown one_to_one_method: {method!r}; "
-        "expected 'greedy', 'hungarian', or 'mutual_top'"
+        f"Unknown one_to_one_method: {method!r}; expected 'greedy', 'hungarian', or 'mutual_top'"
     )
 
 

From 976436aceb51214437a386a73c24b9e2fae7461c Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 12:36:18 +0200
Subject: [PATCH 04/13] add more tests

---
 tests/test_coverage_gaps.py | 251 ++++++++++++++++++++++++++++++++++++
 1 file changed, 251 insertions(+)

diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py
index 6fd4de4..3b5d784 100644
--- a/tests/test_coverage_gaps.py
+++ b/tests/test_coverage_gaps.py
@@ -4,6 +4,8 @@
 behaviour-focused test files.
 """
 
+from unittest.mock import MagicMock, patch
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -23,6 +25,7 @@
 from valentine.algorithms.coma.similarity.tfidf import TfidfCorpus
 from valentine.algorithms.coma.similarity.tokens import tokenize_name, tokens_similarity
 from valentine.algorithms.cupid.linguistic_matching import _cached_synsets, get_synonyms
+from valentine.algorithms.jaccard_distance import StringDistanceFunction
 from valentine.algorithms.distribution_based.clustering_utils import (
     _COLUMN_STORE,
     _compute_ranks,
@@ -436,3 +439,251 @@ def test_ingestion_generator_skips_empty_columns(self, tmp_path):
         # Only the non-empty column survives.
         assert len(produced) == 1
         assert produced[0][0] == "full"
+
+
+# -- JaccardDistanceMatcher parameter validation ----------------------------
+
+
+class TestJaccardParameterValidation:
+    def test_embedding_batch_size_zero_raises(self):
+        with pytest.raises(ValueError, match="embedding_batch_size"):
+            JaccardDistanceMatcher(embedding_batch_size=0)
+
+    def test_embedding_batch_size_negative_raises(self):
+        with pytest.raises(ValueError, match="embedding_batch_size"):
+            JaccardDistanceMatcher(embedding_batch_size=-1)
+
+    def test_tversky_alpha_negative_raises(self):
+        with pytest.raises(ValueError, match="tversky"):
+            JaccardDistanceMatcher(tversky_alpha=-0.1)
+
+    def test_tversky_beta_negative_raises(self):
+        with pytest.raises(ValueError, match="tversky"):
+            JaccardDistanceMatcher(tversky_beta=-0.5)
+
+
+# -- JaccardDistanceMatcher embedding path ----------------------------------
+
+_EMB_PATCH = "valentine.algorithms.jaccard_distance.jaccard_distance._load_sentence_transformer"
+
+
+def _fake_encoder(dim: int = 4) -> MagicMock:
+    """Return a mock SentenceTransformer that yields deterministic L2-normalised embeddings."""
+    def encode(texts, **kwargs):
+        rng = np.random.default_rng(0)
+        emb = rng.random((len(texts), dim)).astype(np.float32)
+        norms = np.linalg.norm(emb, axis=1, keepdims=True)
+        return emb / np.where(norms == 0, 1.0, norms)
+
+    mock = MagicMock()
+    mock.encode.side_effect = encode
+    return mock
+
+
+class TestJaccardEmbeddingPath:
+    @patch(_EMB_PATCH)
+    def test_embedding_produces_matches(self, mock_load):
+        mock_load.return_value = _fake_encoder()
+        d1 = DataframeTable(pd.DataFrame({"col": ["alpha", "beta", "gamma"]}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"col": ["alpha", "delta", "epsilon"]}), name="B")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.0
+        )
+        assert len(matcher.get_matches(d1, d2)) > 0
+
+    @patch(_EMB_PATCH)
+    def test_embedding_encode_called_once_globally(self, mock_load):
+        mock = _fake_encoder()
+        mock_load.return_value = mock
+        d1 = DataframeTable(pd.DataFrame({"c1": ["a", "b"], "c2": ["c", "d"]}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"c1": ["e", "f"], "c2": ["g", "h"]}), name="B")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.0
+        )
+        matcher.get_matches(d1, d2)
+        assert mock.encode.call_count == 1
+        assert set(mock.encode.call_args[0][0]) == {"a", "b", "c", "d", "e", "f", "g", "h"}
+
+    @patch(_EMB_PATCH)
+    def test_embedding_batch_size_forwarded_to_encode(self, mock_load):
+        mock = _fake_encoder()
+        mock_load.return_value = mock
+        d1 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"col": ["z", "w"]}), name="B")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding,
+            threshold_dist=0.0,
+            embedding_batch_size=32,
+        )
+        matcher.get_matches(d1, d2)
+        assert mock.encode.call_args[1].get("batch_size") == 32
+
+    @patch(_EMB_PATCH)
+    def test_embedding_no_batch_size_not_forwarded(self, mock_load):
+        mock = _fake_encoder()
+        mock_load.return_value = mock
+        d1 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"col": ["z", "w"]}), name="B")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.0
+        )
+        matcher.get_matches(d1, d2)
+        assert "batch_size" not in mock.encode.call_args[1]
+
+    def test_all_empty_columns_skips_encode(self):
+        # An empty vocab makes the matcher return before _load_sentence_transformer
+        # is called, so this passes whether or not sentence_transformers is installed.
+        d1 = DataframeTable(pd.DataFrame({"col": pd.Series([], dtype="object")}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"col": pd.Series([], dtype="object")}), name="B")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.5
+        )
+        results = matcher.get_matches(d1, d2)
+        assert all(score == 0.0 for score in results.values())
+
+    @patch(_EMB_PATCH)
+    def test_empty_source_column_similarity_is_zero(self, mock_load):
+        mock_load.return_value = _fake_encoder()
+        d1 = DataframeTable(pd.DataFrame({"col": pd.Series([], dtype="object")}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="B")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.5
+        )
+        results = matcher.get_matches(d1, d2)
+        assert all(score == 0.0 for score in results.values())
+
+
+# -- MatcherResults.one_to_one_hungarian caching & threshold ----------------
+
+
+class TestHungarianCachingAndThreshold:
+    def setup_method(self):
+        self.data = {
+            ColumnPair("s", "a", "t", "x"): 0.9,
+            ColumnPair("s", "b", "t", "y"): 0.8,
+            ColumnPair("s", "c", "t", "z"): 0.7,
+            ColumnPair("s", "a", "t", "y"): 0.4,
+            ColumnPair("s", "b", "t", "z"): 0.3,
+            ColumnPair("s", "c", "t", "x"): 0.2,
+        }
+        self.results = MatcherResults(self.data)
+
+    def test_cache_hit_returns_same_object(self):
+        first = self.results.one_to_one_hungarian()
+        second = self.results.one_to_one_hungarian()
+        assert first is second
+
+    def test_empty_data_result_is_cached(self):
+        empty = MatcherResults({})
+        result = empty.one_to_one_hungarian()
+        assert len(result) == 0
+        assert empty._cached_hungarian is result
+
+    def test_empty_data_with_explicit_threshold_not_cached(self):
+        empty = MatcherResults({})
+        empty.one_to_one_hungarian(threshold=0.5)
+        assert empty._cached_hungarian is None
+
+    def test_explicit_threshold_result_not_cached(self):
+        self.results.one_to_one_hungarian(threshold=0.8)
+        assert self.results._cached_hungarian is None
+
+    def test_explicit_threshold_filters_correctly(self):
+        result = self.results.one_to_one_hungarian(threshold=0.8)
+        assert all(score >= 0.8 for score in result.values())
+
+
+# -- MatcherResults.one_to_one_mutual_top -----------------------------------
+
+
+class TestMutualTopN:
+    def setup_method(self):
+        # 3 sources × 3 targets; diagonal pairs are mutual nearest neighbours.
+        self.data = {
+            ColumnPair("s", "a", "t", "x"): 0.9,
+            ColumnPair("s", "b", "t", "y"): 0.8,
+            ColumnPair("s", "c", "t", "z"): 0.7,
+            ColumnPair("s", "a", "t", "y"): 0.4,
+            ColumnPair("s", "b", "t", "z"): 0.3,
+            ColumnPair("s", "c", "t", "x"): 0.2,
+        }
+        self.results = MatcherResults(self.data)
+
+    def test_n_zero_raises(self):
+        with pytest.raises(ValueError, match="n must be >= 1"):
+            self.results.one_to_one_mutual_top(n=0)
+
+    def test_n_negative_raises(self):
+        with pytest.raises(ValueError, match="n must be >= 1"):
+            self.results.one_to_one_mutual_top(n=-1)
+
+    def test_empty_data_returns_empty(self):
+        assert len(MatcherResults({}).one_to_one_mutual_top()) == 0
+
+    def test_n1_keeps_only_mutual_nearest(self):
+        result = self.results.one_to_one_mutual_top(n=1)
+        pairs = {(cp.source_column, cp.target_column) for cp in result}
+        assert pairs == {("a", "x"), ("b", "y"), ("c", "z")}
+
+    def test_n2_admits_more_pairs_than_n1(self):
+        assert len(self.results.one_to_one_mutual_top(n=2)) >= len(
+            self.results.one_to_one_mutual_top(n=1)
+        )
+
+    def test_details_preserved_for_surviving_pairs(self):
+        details = {k: {"score": v} for k, v in self.data.items()}
+        result = MatcherResults(self.data, details=details).one_to_one_mutual_top(n=1)
+        for cp in result:
+            assert cp in result.details
+
+
+# -- MatcherResults.one_to_one_greedy early-return branch ------------------
+
+
+class TestGreedyEarlyReturn:
+    def test_all_identical_scores_returns_all_pairs(self):
+        # < 2 distinct values → skip threshold logic and return everything.
+        data = {
+            ColumnPair("s", "a", "t", "x"): 0.5,
+            ColumnPair("s", "b", "t", "y"): 0.5,
+        }
+        assert len(MatcherResults(data).one_to_one_greedy()) == 2
+
+
+# -- metric_helpers dispatch & ground-truth normalisation ------------------
+
+from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth
+
+
+class TestMetricHelpers:
+    def _two_pair_results(self):
+        return MatcherResults({
+            ColumnPair("s", "a", "t", "x"): 0.9,
+            ColumnPair("s", "b", "t", "y"): 0.8,
+        })
+
+    def test_apply_invalid_method_raises(self):
+        with pytest.raises(ValueError, match="Unknown one_to_one_method"):
+            _apply_one_to_one(self._two_pair_results(), "invalid")
+
+    def test_apply_greedy_dispatches(self):
+        assert isinstance(_apply_one_to_one(self._two_pair_results(), "greedy"), MatcherResults)
+
+    def test_apply_mutual_top_dispatches(self):
+        assert isinstance(
+            _apply_one_to_one(self._two_pair_results(), "mutual_top"), MatcherResults
+        )
+
+    def test_normalize_empty_returns_false_flag(self):
+        pairs, table_aware = _normalize_ground_truth([])
+        assert pairs == [] and table_aware is False
+
+    def test_normalize_2field_not_table_aware(self):
+        pairs, table_aware = _normalize_ground_truth([("src_col", "tgt_col")])
+        assert pairs == [("src_col", "tgt_col")] and table_aware is False
+
+    def test_normalize_4field_is_table_aware(self):
+        pairs, table_aware = _normalize_ground_truth(
+            [("src_tbl", "src_col", "tgt_tbl", "tgt_col")]
+        )
+        assert pairs == [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] and table_aware is True

From 25f6f1cd149eee3e3fef687a36e8914bfae9ec52 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 12:47:21 +0200
Subject: [PATCH 05/13] apply ruff rules

---
 tests/test_coverage_gaps.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py
index 3b5d784..c61d7a6 100644
--- a/tests/test_coverage_gaps.py
+++ b/tests/test_coverage_gaps.py
@@ -25,7 +25,6 @@
 from valentine.algorithms.coma.similarity.tfidf import TfidfCorpus
 from valentine.algorithms.coma.similarity.tokens import tokenize_name, tokens_similarity
 from valentine.algorithms.cupid.linguistic_matching import _cached_synsets, get_synonyms
-from valentine.algorithms.jaccard_distance import StringDistanceFunction
 from valentine.algorithms.distribution_based.clustering_utils import (
     _COLUMN_STORE,
     _compute_ranks,
@@ -38,9 +37,11 @@
     clear_global_ranks_cache,
 )
 from valentine.algorithms.distribution_based.quantile_histogram import QuantileHistogram
+from valentine.algorithms.jaccard_distance import StringDistanceFunction
 from valentine.algorithms.match import ColumnPair
 from valentine.algorithms.matcher_results import MatcherResults
 from valentine.data_sources.dataframe.dataframe_table import DataframeTable
+from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth
 
 # -- MatcherResults dunder & transformation coverage ------------------------
 
@@ -469,6 +470,7 @@ def test_tversky_beta_negative_raises(self):
 
 def _fake_encoder(dim: int = 4) -> MagicMock:
     """Return a mock SentenceTransformer that yields deterministic L2-normalised embeddings."""
+
     def encode(texts, **kwargs):
         rng = np.random.default_rng(0)
         emb = rng.random((len(texts), dim)).astype(np.float32)
@@ -598,7 +600,7 @@ def test_explicit_threshold_filters_correctly(self):
 
 class TestMutualTopN:
     def setup_method(self):
-        # 3 sources × 3 targets; diagonal pairs are mutual nearest neighbours.
+        # 3 sources x 3 targets; diagonal pairs are mutual nearest neighbours.
         self.data = {
             ColumnPair("s", "a", "t", "x"): 0.9,
             ColumnPair("s", "b", "t", "y"): 0.8,
@@ -652,15 +654,15 @@ def test_all_identical_scores_returns_all_pairs(self):
 
 # -- metric_helpers dispatch & ground-truth normalisation ------------------
 
-from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth
-
 
 class TestMetricHelpers:
     def _two_pair_results(self):
-        return MatcherResults({
-            ColumnPair("s", "a", "t", "x"): 0.9,
-            ColumnPair("s", "b", "t", "y"): 0.8,
-        })
+        return MatcherResults(
+            {
+                ColumnPair("s", "a", "t", "x"): 0.9,
+                ColumnPair("s", "b", "t", "y"): 0.8,
+            }
+        )
 
     def test_apply_invalid_method_raises(self):
         with pytest.raises(ValueError, match="Unknown one_to_one_method"):
@@ -670,9 +672,7 @@ def test_apply_greedy_dispatches(self):
         assert isinstance(_apply_one_to_one(self._two_pair_results(), "greedy"), MatcherResults)
 
     def test_apply_mutual_top_dispatches(self):
-        assert isinstance(
-            _apply_one_to_one(self._two_pair_results(), "mutual_top"), MatcherResults
-        )
+        assert isinstance(_apply_one_to_one(self._two_pair_results(), "mutual_top"), MatcherResults)
 
     def test_normalize_empty_returns_false_flag(self):
         pairs, table_aware = _normalize_ground_truth([])
@@ -683,7 +683,5 @@ def test_normalize_2field_not_table_aware(self):
         assert pairs == [("src_col", "tgt_col")] and table_aware is False
 
     def test_normalize_4field_is_table_aware(self):
-        pairs, table_aware = _normalize_ground_truth(
-            [("src_tbl", "src_col", "tgt_tbl", "tgt_col")]
-        )
+        pairs, table_aware = _normalize_ground_truth([("src_tbl", "src_col", "tgt_tbl", "tgt_col")])
         assert pairs == [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] and table_aware is True

From c0b2348afab8bcf627d4f57771eb863c1865d962 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 12:49:04 +0200
Subject: [PATCH 06/13] add sentence-transformers in the requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index d053701..fcc5ec1 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ rapidfuzz==3.14.5
 PuLP==3.3.0
 POT==0.9.6.post1
 scipy==1.17.1
+sentence-transformers==5.4.1
 # data loading
 python-dateutil==2.9.0.post0
 # testing

From 0842916ba5dcfd7f213817506aee497265c7b1b3 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 13:14:22 +0200
Subject: [PATCH 07/13] add the new methods in the ci and experiment_nyu

---
 .github/workflows/build.yml                 |  1 +
 .github/workflows/ci-build-test-publish.yml |  2 ++
 experiments/experiment_nyu.py               | 14 +++++++++++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a0b51d9..deddd07 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -29,6 +29,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install .
           pip install ".[polars]" || true
+          pip install ".[embeddings]"
           pip install pytest==9.0.2 coverage==7.13.5 ruff==0.15.12
 
       - name: Ruff lint (must pass)
diff --git a/.github/workflows/ci-build-test-publish.yml b/.github/workflows/ci-build-test-publish.yml
index a193801..d95f8ce 100644
--- a/.github/workflows/ci-build-test-publish.yml
+++ b/.github/workflows/ci-build-test-publish.yml
@@ -61,6 +61,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install .
           pip install ".[polars]" || true
+          pip install ".[embeddings]"
 
       - name: Install test deps
         run: pip install pytest==9.0.2
@@ -106,6 +107,7 @@ jobs:
             pip install dist/*.tar.gz
           fi
           pip install polars || true
+          pip install "sentence-transformers>=2.0,<6.0"
 
       - name: Install test deps
         run: pip install pytest==9.0.2
diff --git a/experiments/experiment_nyu.py b/experiments/experiment_nyu.py
index 1e1a8c7..d38583b 100644
--- a/experiments/experiment_nyu.py
+++ b/experiments/experiment_nyu.py
@@ -1,3 +1,4 @@
+import importlib.util
 import json
 import time
 from pathlib import Path
@@ -12,6 +13,7 @@
     JaccardDistanceMatcher,
     SimilarityFlooding,
 )
+from valentine.algorithms.jaccard_distance import StringDistanceFunction
 
 
 def _iter_datasets(data_root: Path) -> list[Path]:
@@ -43,13 +45,23 @@ def _load_ground_truth(path: Path) -> list[tuple[str, str]]:
 
 
 def _matcher_builders():
-    return [
+    builders = [
         ("Coma", lambda: Coma(use_instances=True)),
         ("Cupid", Cupid),
         ("DistributionBased", DistributionBased),
         ("JaccardDistanceMatcher", JaccardDistanceMatcher),
         ("SimilarityFlooding", SimilarityFlooding),
     ]
+    if importlib.util.find_spec("sentence_transformers") is not None:
+        builders.append((
+            "JaccardDistanceMatcher_emb",
+            lambda: JaccardDistanceMatcher(
+                distance_fun=StringDistanceFunction.Embedding,
+                threshold_dist=0.7,
+                embedding_device=None,
+            ),
+        ))
+    return builders
 
 
 def main():

From 55bac6d33d44f8e2021ad69815d491c965dfc93c Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 13:31:22 +0200
Subject: [PATCH 08/13] update test workflows

---
 .github/workflows/ci-build-test-publish.yml |  6 +-
 tests/test_coverage_gaps.py                 | 85 +++++++++++++++++++++
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-build-test-publish.yml b/.github/workflows/ci-build-test-publish.yml
index d95f8ce..e1d74d5 100644
--- a/.github/workflows/ci-build-test-publish.yml
+++ b/.github/workflows/ci-build-test-publish.yml
@@ -66,8 +66,8 @@ jobs:
       - name: Install test deps
         run: pip install pytest==9.0.2
 
-      - name: Run tests (unittest)
-        run: python -m unittest discover tests
+      - name: Run tests
+        run: python -m pytest -q tests
 
   # 3) Test the built wheel on all OS/Python
   test_wheel:
@@ -113,7 +113,7 @@ jobs:
         run: pip install pytest==9.0.2
 
       - name: Run tests (against installed wheel)
-        run: python -m unittest discover tests
+        run: python -m pytest -q tests
 
   # 4) Publish to PyPI on Release (tag publish event)
   publish-to-pypi:
diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py
index c61d7a6..1897190 100644
--- a/tests/test_coverage_gaps.py
+++ b/tests/test_coverage_gaps.py
@@ -4,12 +4,15 @@
 behaviour-focused test files.
 """
 
+import importlib.util
 from unittest.mock import MagicMock, patch
 
 import numpy as np
 import pandas as pd
 import pytest
 
+_ST_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None
+
 from tests import df1, df2
 from valentine import InvalidMatcherError, valentine_match
 from valentine.algorithms import (
@@ -685,3 +688,85 @@ def test_normalize_2field_not_table_aware(self):
     def test_normalize_4field_is_table_aware(self):
         pairs, table_aware = _normalize_ground_truth([("src_tbl", "src_col", "tgt_tbl", "tgt_col")])
         assert pairs == [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] and table_aware is True
+
+
+# -- JaccardDistanceMatcher real embedding integration ----------------------
+# These tests require sentence-transformers and are skipped when it is absent.
+# They exercise the actual SentenceTransformer model, unlike the mocked tests
+# above — use them to verify the real embedding path works end-to-end.
+
+@pytest.mark.skipif(not _ST_AVAILABLE, reason="sentence_transformers not installed")
+class TestJaccardEmbeddingIntegration:
+    """Integration tests that load a real SentenceTransformer model."""
+
+    _MATCHER = JaccardDistanceMatcher(
+        distance_fun=StringDistanceFunction.Embedding,
+        embedding_device="cpu",
+        threshold_dist=0.5,
+    )
+
+    def test_semantically_similar_columns_match(self):
+        # Differently named columns ("customer_id" / "client_id", "order_date" /
+        # "purchase_date") hold identical values; the matcher must return at
+        # least one pair and every score must lie within [0, 1].
+        d1 = DataframeTable(
+            pd.DataFrame({"customer_id": ["C1", "C2", "C3"], "order_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}),
+            name="orders",
+        )
+        d2 = DataframeTable(
+            pd.DataFrame({"client_id": ["C1", "C2", "C3"], "purchase_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}),
+            name="purchases",
+        )
+        results = self._MATCHER.get_matches(d1, d2)
+        assert len(results) > 0
+        assert all(0.0 <= score <= 1.0 for score in results.values())
+
+    def test_identical_values_score_is_high(self):
+        # Two columns with identical string values should produce a near-1.0
+        # embedding similarity because the same text encodes to the same vector.
+        d1 = DataframeTable(pd.DataFrame({"city": ["London", "Paris", "Berlin"]}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"city": ["London", "Paris", "Berlin"]}), name="B")
+        results = self._MATCHER.get_matches(d1, d2)
+        assert len(results) == 1
+        score = next(iter(results.values()))
+        assert score > 0.9
+
+    def test_batch_size_produces_same_result(self):
+        # Results with batch_size=1 must match results with the default batch
+        # size, verifying that batching does not affect the output.
+        d1 = DataframeTable(pd.DataFrame({"col": ["alpha", "beta", "gamma"]}), name="A")
+        d2 = DataframeTable(pd.DataFrame({"col": ["alpha", "delta"]}), name="B")
+        default_matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding,
+            embedding_device="cpu",
+            threshold_dist=0.5,
+        )
+        batched_matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding,
+            embedding_device="cpu",
+            threshold_dist=0.5,
+            embedding_batch_size=1,
+        )
+        default_res = default_matcher.get_matches(d1, d2)
+        batched_res = batched_matcher.get_matches(d1, d2)
+        assert set(default_res.keys()) == set(batched_res.keys())
+        for key in default_res:
+            assert abs(default_res[key] - batched_res[key]) < 1e-5
+
+    def test_get_matches_batch_shares_embeddings(self):
+        # get_matches_batch is meant to share one embedding pass across all
+        # tables; this test does not count encode calls, it only checks that
+        # the result contains entries for every (t1, t2) table combination.
+        d1 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="T1")
+        d2 = DataframeTable(pd.DataFrame({"col": ["x", "z"]}), name="T2")
+        d3 = DataframeTable(pd.DataFrame({"col": ["y", "z"]}), name="T3")
+        matcher = JaccardDistanceMatcher(
+            distance_fun=StringDistanceFunction.Embedding,
+            embedding_device="cpu",
+            threshold_dist=0.0,
+        )
+        results = matcher.get_matches_batch([d1, d2, d3])
+        table_pairs = {(cp.source_table, cp.target_table) for cp in results}
+        assert ("T1", "T2") in table_pairs
+        assert ("T1", "T3") in table_pairs
+        assert ("T2", "T3") in table_pairs

From 8d09d376fd46612814275089c084aabe617e58d6 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 13:37:29 +0200
Subject: [PATCH 09/13] update baseline

---
 experiments/bench_baseline.json | 127 +++++++++++++++++++++-----------
 1 file changed, 85 insertions(+), 42 deletions(-)

diff --git a/experiments/bench_baseline.json b/experiments/bench_baseline.json
index 9e8ec1d..62249a9 100644
--- a/experiments/bench_baseline.json
+++ b/experiments/bench_baseline.json
@@ -3,260 +3,303 @@
     "Coma": {
       "pairs": {
         "customers": {
-          "seconds": 0.0204,
+          "seconds": 0.005,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0098,
+          "seconds": 0.0051,
           "n_matches": 4,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "products": {
-          "seconds": 0.0166,
+          "seconds": 0.0049,
           "n_matches": 6,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0125,
+          "seconds": 0.0054,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0112,
+          "seconds": 0.0046,
           "n_matches": 5,
           "f1": 0.8889,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0705,
-      "worst_seconds": 0.0204,
+      "total_seconds": 0.025,
+      "worst_seconds": 0.0054,
       "mean_f1": 0.9556,
       "mean_mrr": 0.96
     },
     "Coma_Inst": {
       "pairs": {
         "customers": {
-          "seconds": 0.0306,
+          "seconds": 0.0142,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0333,
+          "seconds": 0.0149,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "products": {
-          "seconds": 0.0233,
+          "seconds": 0.0116,
           "n_matches": 6,
           "f1": 0.9091,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0245,
+          "seconds": 0.0115,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.022,
+          "seconds": 0.0101,
           "n_matches": 5,
           "f1": 0.8889,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.1337,
-      "worst_seconds": 0.0333,
+      "total_seconds": 0.0623,
+      "worst_seconds": 0.0149,
       "mean_f1": 0.9596,
       "mean_mrr": 1.0
     },
     "Cupid": {
       "pairs": {
         "customers": {
-          "seconds": 2.0127,
+          "seconds": 1.1821,
           "n_matches": 6,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "orders": {
-          "seconds": 0.0297,
+          "seconds": 0.0178,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         },
         "products": {
-          "seconds": 0.0301,
+          "seconds": 0.019,
           "n_matches": 6,
           "f1": 0.9091,
           "recall_at_gt": 0.8333,
           "mrr": 0.8333
         },
         "events": {
-          "seconds": 0.0151,
+          "seconds": 0.0092,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0657,
+          "seconds": 0.0417,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         }
       },
-      "total_seconds": 2.1533,
-      "worst_seconds": 2.0127,
+      "total_seconds": 1.2698,
+      "worst_seconds": 1.1821,
       "mean_f1": 0.8596,
       "mean_mrr": 0.7667
     },
     "DistributionBased": {
       "pairs": {
         "customers": {
-          "seconds": 0.0993,
+          "seconds": 0.1208,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0918,
+          "seconds": 0.1283,
           "n_matches": 4,
           "f1": 0.75,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "products": {
-          "seconds": 0.0773,
+          "seconds": 0.1501,
           "n_matches": 5,
           "f1": 0.8,
           "recall_at_gt": 0.8333,
           "mrr": 0.8333
         },
         "events": {
-          "seconds": 0.0601,
+          "seconds": 0.2155,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0795,
+          "seconds": 0.1442,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         }
       },
-      "total_seconds": 0.408,
-      "worst_seconds": 0.0993,
+      "total_seconds": 0.7589,
+      "worst_seconds": 0.2155,
       "mean_f1": 0.86,
       "mean_mrr": 0.8467
     },
     "JaccardDistanceMatcher": {
       "pairs": {
         "customers": {
-          "seconds": 0.0111,
+          "seconds": 0.0046,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0139,
+          "seconds": 0.0091,
           "n_matches": 11,
           "f1": 0.8889,
           "recall_at_gt": 0.6,
           "mrr": 1.0
         },
         "products": {
-          "seconds": 0.0101,
+          "seconds": 0.0058,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0116,
+          "seconds": 0.0053,
           "n_matches": 6,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0095,
+          "seconds": 0.0044,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0562,
-      "worst_seconds": 0.0139,
+      "total_seconds": 0.0292,
+      "worst_seconds": 0.0091,
       "mean_f1": 0.9778,
       "mean_mrr": 1.0
     },
     "SimilarityFlooding": {
       "pairs": {
         "customers": {
-          "seconds": 0.0437,
+          "seconds": 0.0233,
           "n_matches": 25,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0369,
+          "seconds": 0.0237,
           "n_matches": 25,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.9
         },
         "products": {
-          "seconds": 0.0644,
+          "seconds": 0.0347,
           "n_matches": 36,
           "f1": 0.9091,
           "recall_at_gt": 0.8333,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0364,
+          "seconds": 0.019,
           "n_matches": 16,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0388,
+          "seconds": 0.021,
           "n_matches": 25,
           "f1": 1.0,
           "recall_at_gt": 0.8,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.2202,
-      "worst_seconds": 0.0644,
+      "total_seconds": 0.1217,
+      "worst_seconds": 0.0347,
       "mean_f1": 0.9374,
       "mean_mrr": 0.98
+    },
+    "JaccardDistanceMatcher_emb": {
+      "pairs": {
+        "customers": {
+          "seconds": 5.6881,
+          "n_matches": 7,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "orders": {
+          "seconds": 0.2185,
+          "n_matches": 18,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "products": {
+          "seconds": 0.1531,
+          "n_matches": 10,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "events": {
+          "seconds": 0.0853,
+          "n_matches": 6,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "addresses": {
+          "seconds": 0.1342,
+          "n_matches": 11,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        }
+      },
+      "total_seconds": 6.2792,
+      "worst_seconds": 5.6881,
+      "mean_f1": 1.0,
+      "mean_mrr": 1.0
     }
   }
 }
\ No newline at end of file

From 0d7aac8c00789f66f0c8ed3b0b772a458452f805 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 13:40:15 +0200
Subject: [PATCH 10/13] apply ruff rules

---
 experiments/experiment_nyu.py | 18 ++++++++++--------
 tests/test_coverage_gaps.py   | 19 +++++++++++++++----
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/experiments/experiment_nyu.py b/experiments/experiment_nyu.py
index d38583b..7f33a35 100644
--- a/experiments/experiment_nyu.py
+++ b/experiments/experiment_nyu.py
@@ -53,14 +53,16 @@ def _matcher_builders():
         ("SimilarityFlooding", SimilarityFlooding),
     ]
     if importlib.util.find_spec("sentence_transformers") is not None:
-        builders.append((
-            "JaccardDistanceMatcher_emb",
-            lambda: JaccardDistanceMatcher(
-                distance_fun=StringDistanceFunction.Embedding,
-                threshold_dist=0.7,
-                embedding_device=None,
-            ),
-        ))
+        builders.append(
+            (
+                "JaccardDistanceMatcher_emb",
+                lambda: JaccardDistanceMatcher(
+                    distance_fun=StringDistanceFunction.Embedding,
+                    threshold_dist=0.7,
+                    embedding_device=None,
+                ),
+            )
+        )
     return builders
 
 
diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py
index 1897190..8f8db0d 100644
--- a/tests/test_coverage_gaps.py
+++ b/tests/test_coverage_gaps.py
@@ -11,8 +11,6 @@
 import pandas as pd
 import pytest
 
-_ST_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None
-
 from tests import df1, df2
 from valentine import InvalidMatcherError, valentine_match
 from valentine.algorithms import (
@@ -46,6 +44,8 @@
 from valentine.data_sources.dataframe.dataframe_table import DataframeTable
 from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth
 
+_ST_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None
+
 # -- MatcherResults dunder & transformation coverage ------------------------
 
 
@@ -695,6 +695,7 @@ def test_normalize_4field_is_table_aware(self):
 # They exercise the actual SentenceTransformer model, unlike the mocked tests
 # above — use them to verify the real embedding path works end-to-end.
 
+
 @pytest.mark.skipif(not _ST_AVAILABLE, reason="sentence_transformers not installed")
 class TestJaccardEmbeddingIntegration:
     """Integration tests that load a real SentenceTransformer model."""
@@ -710,11 +711,21 @@ def test_semantically_similar_columns_match(self):
         # semantically close; the embedding matcher should return non-zero
         # similarity for at least one pair.
         d1 = DataframeTable(
-            pd.DataFrame({"customer_id": ["C1", "C2", "C3"], "order_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}),
+            pd.DataFrame(
+                {
+                    "customer_id": ["C1", "C2", "C3"],
+                    "order_date": ["2024-01-01", "2024-01-02", "2024-01-03"],
+                }
+            ),
             name="orders",
         )
         d2 = DataframeTable(
-            pd.DataFrame({"client_id": ["C1", "C2", "C3"], "purchase_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}),
+            pd.DataFrame(
+                {
+                    "client_id": ["C1", "C2", "C3"],
+                    "purchase_date": ["2024-01-01", "2024-01-02", "2024-01-03"],
+                }
+            ),
             name="purchases",
         )
         results = self._MATCHER.get_matches(d1, d2)

From c1b805a0f4e4a4cf7bce6713b84b1824aca47545 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 13:57:46 +0200
Subject: [PATCH 11/13] make PULP deterministic

---
 experiments/bench_baseline.json               | 125 ++++++------------
 .../distribution_based/discovery.py           |  13 +-
 .../distribution_based/distribution_based.py  |  10 +-
 3 files changed, 51 insertions(+), 97 deletions(-)

diff --git a/experiments/bench_baseline.json b/experiments/bench_baseline.json
index 62249a9..f0f86ea 100644
--- a/experiments/bench_baseline.json
+++ b/experiments/bench_baseline.json
@@ -3,42 +3,42 @@
     "Coma": {
       "pairs": {
         "customers": {
-          "seconds": 0.005,
+          "seconds": 0.0054,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0051,
+          "seconds": 0.0042,
           "n_matches": 4,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "products": {
-          "seconds": 0.0049,
+          "seconds": 0.0039,
           "n_matches": 6,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0054,
+          "seconds": 0.0048,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0046,
+          "seconds": 0.0038,
           "n_matches": 5,
           "f1": 0.8889,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.025,
+      "total_seconds": 0.0221,
       "worst_seconds": 0.0054,
       "mean_f1": 0.9556,
       "mean_mrr": 0.96
@@ -46,193 +46,193 @@
     "Coma_Inst": {
       "pairs": {
         "customers": {
-          "seconds": 0.0142,
+          "seconds": 0.0175,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0149,
+          "seconds": 0.0117,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "products": {
-          "seconds": 0.0116,
+          "seconds": 0.0099,
           "n_matches": 6,
           "f1": 0.9091,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0115,
+          "seconds": 0.0094,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0101,
+          "seconds": 0.0089,
           "n_matches": 5,
           "f1": 0.8889,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0623,
-      "worst_seconds": 0.0149,
+      "total_seconds": 0.0574,
+      "worst_seconds": 0.0175,
       "mean_f1": 0.9596,
       "mean_mrr": 1.0
     },
     "Cupid": {
       "pairs": {
         "customers": {
-          "seconds": 1.1821,
+          "seconds": 1.4134,
           "n_matches": 6,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "orders": {
-          "seconds": 0.0178,
+          "seconds": 0.0145,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         },
         "products": {
-          "seconds": 0.019,
+          "seconds": 0.0153,
           "n_matches": 6,
           "f1": 0.9091,
           "recall_at_gt": 0.8333,
           "mrr": 0.8333
         },
         "events": {
-          "seconds": 0.0092,
+          "seconds": 0.0075,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0417,
+          "seconds": 0.0345,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         }
       },
-      "total_seconds": 1.2698,
-      "worst_seconds": 1.1821,
+      "total_seconds": 1.4852,
+      "worst_seconds": 1.4134,
       "mean_f1": 0.8596,
       "mean_mrr": 0.7667
     },
     "DistributionBased": {
       "pairs": {
         "customers": {
-          "seconds": 0.1208,
+          "seconds": 0.1721,
           "n_matches": 7,
           "f1": 1.0,
-          "recall_at_gt": 1.0,
+          "recall_at_gt": 0.8,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.1283,
+          "seconds": 0.1298,
           "n_matches": 4,
           "f1": 0.75,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "products": {
-          "seconds": 0.1501,
+          "seconds": 0.1606,
           "n_matches": 5,
           "f1": 0.8,
           "recall_at_gt": 0.8333,
           "mrr": 0.8333
         },
         "events": {
-          "seconds": 0.2155,
+          "seconds": 0.1061,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.1442,
+          "seconds": 0.1517,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         }
       },
-      "total_seconds": 0.7589,
-      "worst_seconds": 0.2155,
+      "total_seconds": 0.7203,
+      "worst_seconds": 0.1721,
       "mean_f1": 0.86,
       "mean_mrr": 0.8467
     },
     "JaccardDistanceMatcher": {
       "pairs": {
         "customers": {
-          "seconds": 0.0046,
+          "seconds": 0.0044,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0091,
+          "seconds": 0.0079,
           "n_matches": 11,
           "f1": 0.8889,
           "recall_at_gt": 0.6,
           "mrr": 1.0
         },
         "products": {
-          "seconds": 0.0058,
+          "seconds": 0.0046,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0053,
+          "seconds": 0.0052,
           "n_matches": 6,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0044,
+          "seconds": 0.0048,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0292,
-      "worst_seconds": 0.0091,
+      "total_seconds": 0.0269,
+      "worst_seconds": 0.0079,
       "mean_f1": 0.9778,
       "mean_mrr": 1.0
     },
     "SimilarityFlooding": {
       "pairs": {
         "customers": {
-          "seconds": 0.0233,
+          "seconds": 0.0229,
           "n_matches": 25,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0237,
+          "seconds": 0.02,
           "n_matches": 25,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.9
         },
         "products": {
-          "seconds": 0.0347,
+          "seconds": 0.0337,
           "n_matches": 36,
           "f1": 0.9091,
           "recall_at_gt": 0.8333,
@@ -246,60 +246,17 @@
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.021,
+          "seconds": 0.0209,
           "n_matches": 25,
           "f1": 1.0,
           "recall_at_gt": 0.8,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.1217,
-      "worst_seconds": 0.0347,
+      "total_seconds": 0.1165,
+      "worst_seconds": 0.0337,
       "mean_f1": 0.9374,
       "mean_mrr": 0.98
-    },
-    "JaccardDistanceMatcher_emb": {
-      "pairs": {
-        "customers": {
-          "seconds": 5.6881,
-          "n_matches": 7,
-          "f1": 1.0,
-          "recall_at_gt": 1.0,
-          "mrr": 1.0
-        },
-        "orders": {
-          "seconds": 0.2185,
-          "n_matches": 18,
-          "f1": 1.0,
-          "recall_at_gt": 1.0,
-          "mrr": 1.0
-        },
-        "products": {
-          "seconds": 0.1531,
-          "n_matches": 10,
-          "f1": 1.0,
-          "recall_at_gt": 1.0,
-          "mrr": 1.0
-        },
-        "events": {
-          "seconds": 0.0853,
-          "n_matches": 6,
-          "f1": 1.0,
-          "recall_at_gt": 1.0,
-          "mrr": 1.0
-        },
-        "addresses": {
-          "seconds": 0.1342,
-          "n_matches": 11,
-          "f1": 1.0,
-          "recall_at_gt": 1.0,
-          "mrr": 1.0
-        }
-      },
-      "total_seconds": 6.2792,
-      "worst_seconds": 5.6881,
-      "mean_f1": 1.0,
-      "mean_mrr": 1.0
     }
   }
 }
\ No newline at end of file
diff --git a/valentine/algorithms/distribution_based/discovery.py b/valentine/algorithms/distribution_based/discovery.py
index 5c5c235..f63f814 100644
--- a/valentine/algorithms/distribution_based/discovery.py
+++ b/valentine/algorithms/distribution_based/discovery.py
@@ -52,9 +52,7 @@ def compute_distribution_clusters(
 
     graph = create_graph(columns, edges_per_column)
 
-    connected_components = list(nx.connected_components(graph))
-
-    return connected_components
+    return sorted(nx.connected_components(graph), key=sorted)
 
 
 def compute_distribution_clusters_parallel(
@@ -99,9 +97,7 @@ def compute_distribution_clusters_parallel(
 
     graph = create_graph(columns, edges_per_column)
 
-    connected_components = list(nx.connected_components(graph))
-
-    return connected_components
+    return sorted(nx.connected_components(graph), key=sorted)
 
 
 def compute_attributes(
@@ -278,7 +274,7 @@ def correlation_clustering_pulp(vertexes: list, edges: dict):
                 if len({u, v, w}) == 3:
                     opt_model += x_vars[u, w] <= x_vars[u, v] + x_vars[v, w]
 
-    opt_model.solve(PULP_CBC_CMD(msg=False))
+    opt_model.solve(PULP_CBC_CMD(msg=False, options=["RandomS", "42"]))
 
     result = {}
 
@@ -319,8 +315,7 @@ def process_correlation_clustering_result(results: list, columns: list):
         m1, m2 = match
         edges_per_column.append([(m1, m2)])
     graph = create_graph(columns, edges_per_column)
-    connected_components = list(nx.connected_components(graph))
-    return connected_components
+    return sorted(nx.connected_components(graph), key=sorted)
 
 
 def create_graph(nodes: list, edges_per_column: list):
diff --git a/valentine/algorithms/distribution_based/distribution_based.py b/valentine/algorithms/distribution_based/distribution_based.py
index f53443c..0b2e94d 100644
--- a/valentine/algorithms/distribution_based/distribution_based.py
+++ b/valentine/algorithms/distribution_based/distribution_based.py
@@ -173,14 +173,15 @@ def __find_matches(self, tmp_folder_path: str, table_order: dict[str, int]):
         for components in connected_components:
             if len(components) > 1:
                 i = i + 1
+                sorted_components = sorted(components)
                 edges = discovery.compute_attributes(
-                    list(components),
+                    sorted_components,
                     self.__threshold2,
                     tmp_folder_path,
                     self.__quantiles,
                     self.__use_bloom_filters,
                 )
-                all_attributes.append((list(components), edges))
+                all_attributes.append((sorted_components, edges))
 
         results = []
         for components, edges in all_attributes:
@@ -220,15 +221,16 @@ def __find_matches_parallel(
         for components in connected_components:
             if len(components) > 1:
                 i = i + 1
+                sorted_components = sorted(components)
                 edges = discovery.compute_attributes_parallel(
-                    list(components),
+                    sorted_components,
                     self.__threshold2,
                     pool,
                     tmp_folder_path,
                     self.__quantiles,
                     self.__use_bloom_filters,
                 )
-                all_attributes.append((list(components), edges))
+                all_attributes.append((sorted_components, edges))
 
         results = []
         for components, edges in all_attributes:

From c88600ef15690008db9329fea1def61f598e1fad Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 14:12:33 +0200
Subject: [PATCH 12/13] make benchmark deterministic

---
 .github/workflows/bench.yml                   |  27 +++-
 experiments/bench_baseline.json               | 129 ++++++++++++------
 .../distribution_based/distribution_based.py  |  69 +++++++---
 3 files changed, 157 insertions(+), 68 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 160b34d..097e825 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -8,8 +8,15 @@ on:
 
 jobs:
   bench:
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
+    name: bench (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 15
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python-version: ['3.14']
 
     steps:
       - uses: actions/checkout@v6.0.2
@@ -17,9 +24,15 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v6.2.0
         with:
-          python-version: '3.14'
+          python-version: ${{ matrix.python-version }}
           cache: 'pip'
 
+      - name: Set deterministic environment
+        run: |
+          echo "OMP_NUM_THREADS=1" >> $GITHUB_ENV
+          echo "MKL_NUM_THREADS=1" >> $GITHUB_ENV
+          echo "PYTHONHASHSEED=0" >> $GITHUB_ENV
+
       - name: Install package
         run: |
           python -m pip install --upgrade pip
@@ -29,7 +42,7 @@ jobs:
         run: |
           python experiments/bench.py \
             --quick \
-            --output bench_results.json \
+            --output bench_results_${{ matrix.os }}.json \
             --baseline experiments/bench_baseline.json \
             --accuracy-only
 
@@ -37,6 +50,6 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: bench-results
-          path: bench_results.json
-          if-no-files-found: warn
+          name: bench-results-${{ matrix.os }}
+          path: bench_results_${{ matrix.os }}.json
+          if-no-files-found: warn
\ No newline at end of file
diff --git a/experiments/bench_baseline.json b/experiments/bench_baseline.json
index f0f86ea..eec4b62 100644
--- a/experiments/bench_baseline.json
+++ b/experiments/bench_baseline.json
@@ -3,42 +3,42 @@
     "Coma": {
       "pairs": {
         "customers": {
-          "seconds": 0.0054,
+          "seconds": 0.0049,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0042,
+          "seconds": 0.0053,
           "n_matches": 4,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "products": {
-          "seconds": 0.0039,
+          "seconds": 0.005,
           "n_matches": 6,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0048,
+          "seconds": 0.0054,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0038,
+          "seconds": 0.0046,
           "n_matches": 5,
           "f1": 0.8889,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0221,
+      "total_seconds": 0.0252,
       "worst_seconds": 0.0054,
       "mean_f1": 0.9556,
       "mean_mrr": 0.96
@@ -46,217 +46,260 @@
     "Coma_Inst": {
       "pairs": {
         "customers": {
-          "seconds": 0.0175,
+          "seconds": 0.0139,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0117,
+          "seconds": 0.0154,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "products": {
-          "seconds": 0.0099,
+          "seconds": 0.012,
           "n_matches": 6,
           "f1": 0.9091,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0094,
+          "seconds": 0.0117,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0089,
+          "seconds": 0.0103,
           "n_matches": 5,
           "f1": 0.8889,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0574,
-      "worst_seconds": 0.0175,
+      "total_seconds": 0.0633,
+      "worst_seconds": 0.0154,
       "mean_f1": 0.9596,
       "mean_mrr": 1.0
     },
     "Cupid": {
       "pairs": {
         "customers": {
-          "seconds": 1.4134,
+          "seconds": 1.285,
           "n_matches": 6,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "orders": {
-          "seconds": 0.0145,
+          "seconds": 0.018,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         },
         "products": {
-          "seconds": 0.0153,
+          "seconds": 0.0193,
           "n_matches": 6,
           "f1": 0.9091,
           "recall_at_gt": 0.8333,
           "mrr": 0.8333
         },
         "events": {
-          "seconds": 0.0075,
+          "seconds": 0.0091,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0345,
+          "seconds": 0.0417,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         }
       },
-      "total_seconds": 1.4852,
-      "worst_seconds": 1.4134,
+      "total_seconds": 1.3731,
+      "worst_seconds": 1.285,
       "mean_f1": 0.8596,
       "mean_mrr": 0.7667
     },
     "DistributionBased": {
       "pairs": {
         "customers": {
-          "seconds": 0.1721,
+          "seconds": 0.1268,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 0.8,
-          "mrr": 1.0
+          "mrr": 0.9
         },
         "orders": {
-          "seconds": 0.1298,
+          "seconds": 0.13,
           "n_matches": 4,
           "f1": 0.75,
           "recall_at_gt": 0.8,
           "mrr": 0.8
         },
         "products": {
-          "seconds": 0.1606,
+          "seconds": 0.1572,
           "n_matches": 5,
           "f1": 0.8,
           "recall_at_gt": 0.8333,
           "mrr": 0.8333
         },
         "events": {
-          "seconds": 0.1061,
+          "seconds": 0.1085,
           "n_matches": 4,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.1517,
+          "seconds": 0.149,
           "n_matches": 3,
           "f1": 0.75,
           "recall_at_gt": 0.6,
           "mrr": 0.6
         }
       },
-      "total_seconds": 0.7203,
-      "worst_seconds": 0.1721,
+      "total_seconds": 0.6715,
+      "worst_seconds": 0.1572,
       "mean_f1": 0.86,
-      "mean_mrr": 0.8467
+      "mean_mrr": 0.8267
     },
     "JaccardDistanceMatcher": {
       "pairs": {
         "customers": {
-          "seconds": 0.0044,
+          "seconds": 0.005,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.0079,
+          "seconds": 0.0089,
           "n_matches": 11,
           "f1": 0.8889,
           "recall_at_gt": 0.6,
           "mrr": 1.0
         },
         "products": {
-          "seconds": 0.0046,
+          "seconds": 0.0051,
           "n_matches": 7,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.0052,
+          "seconds": 0.0061,
           "n_matches": 6,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0048,
+          "seconds": 0.0045,
           "n_matches": 5,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.0269,
-      "worst_seconds": 0.0079,
+      "total_seconds": 0.0296,
+      "worst_seconds": 0.0089,
       "mean_f1": 0.9778,
       "mean_mrr": 1.0
     },
     "SimilarityFlooding": {
       "pairs": {
         "customers": {
-          "seconds": 0.0229,
+          "seconds": 0.0244,
           "n_matches": 25,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 1.0
         },
         "orders": {
-          "seconds": 0.02,
+          "seconds": 0.0237,
           "n_matches": 25,
           "f1": 0.8889,
           "recall_at_gt": 0.8,
           "mrr": 0.9
         },
         "products": {
-          "seconds": 0.0337,
+          "seconds": 0.0333,
           "n_matches": 36,
           "f1": 0.9091,
           "recall_at_gt": 0.8333,
           "mrr": 1.0
         },
         "events": {
-          "seconds": 0.019,
+          "seconds": 0.0226,
           "n_matches": 16,
           "f1": 1.0,
           "recall_at_gt": 1.0,
           "mrr": 1.0
         },
         "addresses": {
-          "seconds": 0.0209,
+          "seconds": 0.0208,
           "n_matches": 25,
           "f1": 1.0,
           "recall_at_gt": 0.8,
           "mrr": 1.0
         }
       },
-      "total_seconds": 0.1165,
-      "worst_seconds": 0.0337,
+      "total_seconds": 0.1248,
+      "worst_seconds": 0.0333,
       "mean_f1": 0.9374,
       "mean_mrr": 0.98
+    },
+    "JaccardDistanceMatcher_emb": {
+      "pairs": {
+        "customers": {
+          "seconds": 6.3396,
+          "n_matches": 7,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "orders": {
+          "seconds": 0.2713,
+          "n_matches": 18,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "products": {
+          "seconds": 0.1497,
+          "n_matches": 10,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "events": {
+          "seconds": 0.0724,
+          "n_matches": 6,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        },
+        "addresses": {
+          "seconds": 0.1303,
+          "n_matches": 11,
+          "f1": 1.0,
+          "recall_at_gt": 1.0,
+          "mrr": 1.0
+        }
+      },
+      "total_seconds": 6.9633,
+      "worst_seconds": 6.3396,
+      "mean_f1": 1.0,
+      "mean_mrr": 1.0
     }
   }
 }
\ No newline at end of file
diff --git a/valentine/algorithms/distribution_based/distribution_based.py b/valentine/algorithms/distribution_based/distribution_based.py
index 0b2e94d..c85388c 100644
--- a/valentine/algorithms/distribution_based/distribution_based.py
+++ b/valentine/algorithms/distribution_based/distribution_based.py
@@ -268,27 +268,60 @@ def __rank_output(
             A ranked list that will look like: ((table_name1, column_name1), (table_name2, column_name2)): similarity
         """
         matches = {}
-        for cluster in attribute_clusters:
+
+        sorted_clusters = sorted(
+            [sorted(cluster) for cluster in attribute_clusters], key=lambda c: (len(c), c)
+        )
+
+        for cluster in sorted_clusters:
             if len(cluster) < 2:
                 continue
+
             for combination in combinations(cluster, 2):
                 table1 = combination[0][0]
                 table2 = combination[1][0]
-                if table1 != table2:
-                    k, emd = process_emd(
-                        (
-                            (combination[0], combination[1]),
-                            self.__quantiles,
-                            False,
-                            tmp_folder_path,
-                            False,
-                        )
+
+                if table1 == table2:
+                    continue
+
+                k, emd = process_emd(
+                    (
+                        (combination[0], combination[1]),
+                        self.__quantiles,
+                        False,
+                        tmp_folder_path,
+                        False,
                     )
-                    sim = 1 / (1 + emd)
-                    tn_i, _, cn_i, _ = k[0]
-                    tn_j, _, cn_j, _ = k[1]
-                    if table_order.get(tn_i, 0) > table_order.get(tn_j, 0):
-                        matches.update(Match(tn_i, cn_i, tn_j, cn_j, sim).to_dict)
-                    else:
-                        matches.update(Match(tn_j, cn_j, tn_i, cn_i, sim).to_dict)
-        return matches
+                )
+
+                emd = float(round(emd, 12))
+                sim = 1 / (1 + emd)
+
+                tn_i, _, cn_i, _ = k[0]
+                tn_j, _, cn_j, _ = k[1]
+
+                order_i = table_order.get(tn_i, float("inf"))
+                order_j = table_order.get(tn_j, float("inf"))
+
+                if (order_i, tn_i, cn_i) > (order_j, tn_j, cn_j):
+                    match_obj = Match(tn_i, cn_i, tn_j, cn_j, sim)
+                else:
+                    match_obj = Match(tn_j, cn_j, tn_i, cn_i, sim)
+
+                # Deterministic overwrite: a duplicate pair keeps its highest-similarity match
+                key = (
+                    match_obj.source_table_name,
+                    match_obj.source_column_name,
+                    match_obj.target_table_name,
+                    match_obj.target_column_name,
+                )
+
+                if key not in matches or sim > matches[key].similarity:
+                    matches[key] = match_obj
+
+        # Merge each stored Match's to_dict mapping into the flat dict this method returns
+        final_matches = {}
+        for m in matches.values():
+            final_matches.update(m.to_dict)
+
+        return final_matches

From 7d0e0453f45a56d95d330c7dbfc867ed6391b0d1 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis <kpsarakis94@gmail.com>
Date: Tue, 5 May 2026 14:16:46 +0200
Subject: [PATCH 13/13] use bash in the bench workflow

---
 .github/workflows/bench.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 097e825..a40e77f 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -39,6 +39,7 @@ jobs:
           pip install .
 
       - name: Run accuracy regression check
+        shell: bash
         run: |
           python experiments/bench.py \
             --quick \