From cd59fe54352efab4e3c2e16fd74affd0b0fe2458 Mon Sep 17 00:00:00 2001 From: Christos Koutras Date: Mon, 4 May 2026 09:29:07 -0400 Subject: [PATCH 01/13] add embeddings support for jaccard distance, replace jaccard with generic tversky index, add more one-to-one filtering methods --- README.md | 4 +- docs/api.md | 41 ++- docs/changelog.md | 2 +- docs/example.md | 2 +- docs/faq.md | 2 +- docs/metrics.md | 2 +- docs/results.md | 10 +- examples/valentine_example_mixed.py | 2 +- examples/valentine_example_pandas.py | 2 +- examples/valentine_example_polars.py | 2 +- experiments/bench.py | 43 ++- pyproject.toml | 3 + tests/test_coverage_gaps.py | 8 +- tests/test_distribution_based_benchmark.py | 5 +- tests/test_docs_smoke.py | 4 +- tests/test_matcher_results.py | 13 +- .../algorithms/jaccard_distance/__init__.py | 3 + .../jaccard_distance/jaccard_distance.py | 284 +++++++++++++++++- valentine/algorithms/matcher_results.py | 173 +++++++++-- valentine/metrics/base_metric.py | 14 +- valentine/metrics/metric_helpers.py | 20 ++ valentine/metrics/metrics.py | 64 +++- 22 files changed, 622 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 62fcc2e..d33087d 100644 --- a/README.md +++ b/README.md @@ -137,9 +137,9 @@ for pair, score in matches.items(): ```python top_n_matches = matches.take_top_n(5) top_n_percent_matches = matches.take_top_percent(25) -one_to_one_matches = matches.one_to_one() +one_to_one_matches = matches.one_to_one_hungarian() high_confidence = matches.filter(min_score=0.7) -one_to_one_strict = matches.one_to_one(threshold=0.5) +one_to_one_strict = matches.one_to_one_hungarian(threshold=0.5) ``` ### Match details (Coma) diff --git a/docs/api.md b/docs/api.md index a7e3b26..b34b189 100644 --- a/docs/api.md +++ b/docs/api.md @@ -141,7 +141,8 @@ class MatcherResults(Mapping[ColumnPair, float]): Immutable `Mapping` returned by [`valentine_match`](#valentine_match). Entries are sorted from highest to lowest similarity score on construction. 
Because the mapping is immutable, derived views (such as -the cached result of [`one_to_one`](#one_to_one)) cannot be silently +the cached result of [`one_to_one_hungarian`](#one_to_one_hungarian)) +cannot be silently invalidated. ### Mapping protocol @@ -186,15 +187,17 @@ All transformations return a **new** `MatcherResults` instance; the original is left untouched. Sub-matcher details are carried over to the filtered subset. -#### `one_to_one` +#### `one_to_one_hungarian` ```python -def one_to_one(threshold: float | None = None) -> MatcherResults +def one_to_one_hungarian(threshold: float | None = None) -> MatcherResults ``` -Greedy bipartite filter: starting from the highest-scoring pair, assign -each source and each target column **at most one** partner. Pairs below -`threshold` are discarded. +Default 1:1 selector. Globally optimal bipartite filter via Hungarian +assignment (`scipy.optimize.linear_sum_assignment`): each source and +each target column appears in **at most one** returned pair, with the +assignment chosen to maximise total similarity. Pairs below `threshold` +are discarded. - `threshold=None` (default) uses the median of unique similarity scores as the cutoff, and the result is cached. @@ -202,6 +205,30 @@ each source and each target column **at most one** partner. Pairs below - When the input has fewer than two distinct score values, all entries are returned unchanged. +#### `one_to_one_greedy` + +```python +def one_to_one_greedy(threshold: float | None = None) -> MatcherResults +``` + +Greedy bipartite filter, kept for backwards compatibility. Starting +from the highest-scoring pair, greedily assigns each source and each +target column at most one partner. Same threshold semantics as +`one_to_one_hungarian`. Greedy can lock in a locally-best pair that +blocks a better global assignment, so prefer the Hungarian variant +unless you need the legacy behaviour. 
+ +#### `one_to_one_mutual_top` + +```python +def one_to_one_mutual_top(n: int = 1) -> MatcherResults +``` + +Mutual top-`n` filter: keeps pair `(s, t)` only if `t` is in `s`'s +top-`n` targets AND `s` is in `t`'s top-`n` sources. With `n=1` this +is the classic mutual nearest-neighbour filter — high-precision, drops +one-sided affinities. Typically stricter than `one_to_one_hungarian`, though not a guaranteed subset of its result. + #### `filter` ```python @@ -615,7 +642,7 @@ Precision(one_to_one: bool = True) ``` `TP / (TP + FP)`. When `one_to_one=True` (default), applies -`MatcherResults.one_to_one()` before counting. +`MatcherResults.one_to_one_hungarian()` before counting. #### `Recall` diff --git a/docs/changelog.md b/docs/changelog.md index 543ad93..23e3fe4 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -84,7 +84,7 @@ coming from 0.5.x or earlier, the changes below will affect your code. `valentine_match` / `valentine_match_batch` pair. - **Immutable [`MatcherResults`](api.md#matcherresults).** The result object is now a `Mapping`, not a `dict` subclass. Derived views - (e.g. [`one_to_one()`](api.md#one_to_one)) are cached and cannot be + (e.g. [`one_to_one_hungarian()`](api.md#one_to_one_hungarian)) are cached and cannot be silently invalidated. - [`Coma`](api.md#coma) is now a pure-Python implementation of COMA 3.0 — no JVM dependency. Constructor signature updated to diff --git a/docs/example.md b/docs/example.md index d614442..af23f8f 100644 --- a/docs/example.md +++ b/docs/example.md @@ -60,7 +60,7 @@ def main(): # 4. Reduce to one-to-one matches (greedy, highest-first). print("\nGetting the one-to-one matches:") - pp.pprint(matches.one_to_one()) + pp.pprint(matches.one_to_one_hungarian()) # 5. If you have a ground truth, compute evaluation metrics. 
ground_truth = [ diff --git a/docs/faq.md b/docs/faq.md index e8ff040..114afdd 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -95,7 +95,7 @@ provides three reduction helpers: ```python matches.take_top_n(10) # absolute top 10 matches.take_top_percent(5) # top 5% -matches.one_to_one() # bidirectional best matches +matches.one_to_one_hungarian() # globally optimal one-to-one assignment ``` All three return a new `MatcherResults` — the original is immutable. diff --git a/docs/metrics.md b/docs/metrics.md index 2acc695..7d061c3 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -71,7 +71,7 @@ from valentine.metrics import ( `Precision`, `Recall`, `F1Score` and `PrecisionTopNPercent` all accept a `one_to_one: bool` flag that applies -[`MatcherResults.one_to_one()`](api.md#one_to_one) before counting. +[`MatcherResults.one_to_one_hungarian()`](api.md#one_to_one_hungarian) before counting. `PrecisionTopNPercent` additionally takes `n: int` for the cutoff, and `RecallAtSizeofGroundTruth` defaults to `one_to_one=False`. See the [API reference](api.md#built-in-metrics) for full defaults. diff --git a/docs/results.md b/docs/results.md index 1fe5d6f..21e9362 100644 --- a/docs/results.md +++ b/docs/results.md @@ -10,7 +10,7 @@ mapping** of [`ColumnPair`](api.md#columnpair) keys to similarity scores, sorted from highest score to lowest. It behaves like a `dict` for lookup and iteration, but cannot be mutated (preventing accidental invalidation of cached derived views such as -[`one_to_one()`](api.md#one_to_one)). +[`one_to_one_hungarian()`](api.md#one_to_one_hungarian)). For the authoritative method signatures, see the API reference for [`MatcherResults`](api.md#matcherresults) and @@ -82,24 +82,24 @@ strong = matches.filter(min_score=0.7) # Reduce to one-to-one matches (greedy, highest-first). Threshold defaults # to the median score of the current results. 
-one_to_one = matches.one_to_one() +one_to_one = matches.one_to_one_hungarian() # Override the threshold to be stricter -strict = matches.one_to_one(threshold=0.8) +strict = matches.one_to_one_hungarian(threshold=0.8) ``` Each method is documented in full in the API reference: [`take_top_n`](api.md#take_top_n), [`take_top_percent`](api.md#take_top_percent), [`filter`](api.md#filter), and -[`one_to_one`](api.md#one_to_one). +[`one_to_one_hungarian`](api.md#one_to_one_hungarian). Every transformation returns a **new** [`MatcherResults`](api.md#matcherresults) instance, so you can chain them: ```python -best_strict_pairs = matches.filter(min_score=0.5).one_to_one(threshold=0.7) +best_strict_pairs = matches.filter(min_score=0.5).one_to_one_hungarian(threshold=0.7) ``` !!! tip "Details propagation" diff --git a/examples/valentine_example_mixed.py b/examples/valentine_example_mixed.py index 8104a6a..9b24ff7 100644 --- a/examples/valentine_example_mixed.py +++ b/examples/valentine_example_mixed.py @@ -37,7 +37,7 @@ def main(): print(f" {pair.source_column:>20s} <-> {pair.target_column:<20s} {score:.4f}") print("\nOne-to-one matches:") - for pair, score in matches.one_to_one().items(): + for pair, score in matches.one_to_one_hungarian().items(): print(f" {pair.source_column:>20s} <-> {pair.target_column:<20s} {score:.4f}") # Evaluate against ground truth diff --git a/examples/valentine_example_pandas.py b/examples/valentine_example_pandas.py index 6ecec70..c3d93af 100644 --- a/examples/valentine_example_pandas.py +++ b/examples/valentine_example_pandas.py @@ -35,7 +35,7 @@ def main(): print(f" {'':>20s} [{breakdown}]") print("\nGetting the one-to-one matches:") - pp.pprint(matches.one_to_one()) + pp.pprint(matches.one_to_one_hungarian()) # If ground truth available valentine could calculate the metrics ground_truth = [ diff --git a/examples/valentine_example_polars.py b/examples/valentine_example_polars.py index 5512140..0a4f668 100644 --- a/examples/valentine_example_polars.py +++ 
b/examples/valentine_example_polars.py @@ -35,7 +35,7 @@ def main(): print(f" {'':>20s} [{breakdown}]") print("\nGetting the one-to-one matches:") - pp.pprint(matches.one_to_one()) + pp.pprint(matches.one_to_one_hungarian()) # If ground truth available valentine could calculate the metrics ground_truth = [ diff --git a/experiments/bench.py b/experiments/bench.py index 3cf9fed..292ed64 100644 --- a/experiments/bench.py +++ b/experiments/bench.py @@ -51,6 +51,7 @@ JaccardDistanceMatcher, SimilarityFlooding, ) +from valentine.algorithms.jaccard_distance import StringDistanceFunction from valentine.metrics import F1Score, MeanReciprocalRank, RecallAtSizeofGroundTruth try: @@ -67,7 +68,7 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]: - return [ + builders: list[tuple[str, MatcherFactory]] = [ ("Coma", Coma), ("Coma_Inst", lambda: Coma(use_instances=True)), ("Cupid", Cupid), @@ -75,6 +76,46 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]: ("JaccardDistanceMatcher", JaccardDistanceMatcher), ("SimilarityFlooding", SimilarityFlooding), ] + # Only include the embedding variant when sentence-transformers is + # actually importable; otherwise the bench would crash on import. + try: + import sentence_transformers + + builders.append( + ( + "JaccardDistanceMatcher_emb", + # embedding_device=None lets sentence-transformers / torch + # auto-pick: cuda > mps > cpu. So the bench transparently + # uses GPU on CUDA boxes and MPS on Apple Silicon without + # any config; CPU-only machines fall back automatically. + # + # embedding_batch_size is left unset, so the encode call + # uses sentence-transformers' library default (32). This + # is the out-of-the-box operating point. To trade memory + # for speed on capable hardware, pass an explicit larger + # value (e.g. embedding_batch_size=128 or 256): on the + # NYU full suite that drops total wall time from ~15s to + # ~11s on MPS without affecting accuracy. 
+ # + # tversky_alpha=tversky_beta=1.0 reduces to Jaccard (the + # default; matches prior behaviour). Set both to 0.5 for + # Dice, or one to 0 to recover set containment — natural + # for subset/superset workloads (dataset discovery), but + # on the NYU bench it regressed mean F1 by ~12pp because + # asymmetric scoring inflates similarity for size- + # asymmetric pairs. match_weighting defaults to Binary + # (count-based intersection); switch to Margin to weight + # each value by its top1-vs-top2 confidence gap. + lambda: JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + threshold_dist=0.7, + embedding_device=None, + ), + ) + ) + except ImportError: + pass + return builders # --------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index db959c4..9adbe58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,9 @@ dev = [ polars = [ "polars>=1.0,<2.0", ] +embeddings = [ + "sentence-transformers>=2.0,<6.0", +] docs = [ # Pinned intentionally: Zensical is pre-1.0 and moves fast. 
Bump # deliberately rather than relying on a range, so a bad release diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py index 6f7117d..6fd4de4 100644 --- a/tests/test_coverage_gaps.py +++ b/tests/test_coverage_gaps.py @@ -87,13 +87,13 @@ def test_details_empty_when_none(self): assert bare.details == {} assert bare.get_details(next(iter(bare))) is None - def test_one_to_one_with_explicit_threshold(self): - result = self.results.one_to_one(threshold=0.7) + def test_one_to_one_greedy_with_explicit_threshold(self): + result = self.results.one_to_one_greedy(threshold=0.7) # Only entries >= 0.7 survive the explicit threshold path assert all(score >= 0.7 for score in result.values()) assert len(result) == 3 - def test_one_to_one_identical_scores(self): + def test_one_to_one_hungarian_identical_scores(self): # Less than two distinct values -> early return branch flat = MatcherResults( { @@ -101,7 +101,7 @@ def test_one_to_one_identical_scores(self): ColumnPair("s", "b", "t", "b"): 0.5, } ) - assert len(flat.one_to_one()) == len(flat) + assert len(flat.one_to_one_hungarian()) == len(flat) def test_filter(self): result = self.results.filter(min_score=0.75) diff --git a/tests/test_distribution_based_benchmark.py b/tests/test_distribution_based_benchmark.py index afd175c..7076ad3 100644 --- a/tests/test_distribution_based_benchmark.py +++ b/tests/test_distribution_based_benchmark.py @@ -65,8 +65,9 @@ def test_synthetic_numeric_accuracy(self): metrics = matches.get_metrics(ground_truth, metrics={Precision(), Recall(), F1Score()}) # Baseline: P=1.0, R=0.75, F1=0.857 - # The algorithm correctly finds all 4 pairs (raw Recall=1.0), but one_to_one() - # post-processing may filter the weakest match below the median threshold. + # The algorithm correctly finds all 4 pairs (raw Recall=1.0), but the + # one_to_one_hungarian() post-processing may filter the weakest match + # below the median threshold. 
assert metrics["Precision"] >= 1.0, f"Precision dropped to {metrics['Precision']}" assert metrics["Recall"] >= 0.75, f"Recall dropped to {metrics['Recall']}" assert metrics["F1Score"] >= 0.85, f"F1Score dropped to {metrics['F1Score']}" diff --git a/tests/test_docs_smoke.py b/tests/test_docs_smoke.py index a22b933..0122bb3 100644 --- a/tests/test_docs_smoke.py +++ b/tests/test_docs_smoke.py @@ -130,7 +130,9 @@ def test_data_sources_exports(): def test_matcher_results_documented_methods(): """Every MatcherResults method referenced in the docs must exist.""" for name in ( - "one_to_one", + "one_to_one_hungarian", + "one_to_one_greedy", + "one_to_one_mutual_top", "filter", "take_top_n", "take_top_percent", diff --git a/tests/test_matcher_results.py b/tests/test_matcher_results.py index 674fc16..f58e46e 100644 --- a/tests/test_matcher_results.py +++ b/tests/test_matcher_results.py @@ -37,7 +37,7 @@ def test_get_metrics(self): metrics_specific = self.matches.get_metrics(self.ground_truth, metrics={Precision()}) assert "Precision" in metrics_specific - def test_one_to_one(self): + def test_one_to_one_greedy(self): m = self.matches n = len(m) assert n > 0 @@ -56,8 +56,8 @@ def test_one_to_one(self): assert len(m) == 2 * n - m_one_to_one = m.one_to_one() - # one_to_one should remove duplicates, returning fewer entries + m_one_to_one = m.one_to_one_greedy() + # 1:1 should remove duplicates, returning fewer entries assert len(m_one_to_one) <= n assert len(m_one_to_one) < len(m) @@ -65,16 +65,17 @@ def test_one_to_one(self): for pair in m_one_to_one: assert not pair.target_column.endswith("foo") - # Cache resets on new instance + # Cache resets on new instance — Hungarian is the cached default, + # not greedy. Verify the default-path cache lifecycle here. 
m_entry = MatcherResults(dict(m)) - assert m_entry._cached_one_to_one is None + assert m_entry._cached_hungarian is None # Add a new entry with distinct columns ext2 = dict(m_entry) ext2[ColumnPair("extra_src", "BLA", "extra_tgt", "BLA")] = 0.7214057 m_entry = MatcherResults(ext2) - m_entry_one_to_one = m_entry.one_to_one() + m_entry_one_to_one = m_entry.one_to_one_greedy() assert m_one_to_one != m_entry_one_to_one # All remaining values should be above the median diff --git a/valentine/algorithms/jaccard_distance/__init__.py b/valentine/algorithms/jaccard_distance/__init__.py index 2100b92..fc0d5c4 100644 --- a/valentine/algorithms/jaccard_distance/__init__.py +++ b/valentine/algorithms/jaccard_distance/__init__.py @@ -8,3 +8,6 @@ class StringDistanceFunction(Enum): JaroWinkler = auto() Hamming = auto() Exact = auto() + # Sentence-transformer embedding cosine similarity. Requires the + # ``sentence-transformers`` extra (``pip install valentine[embeddings]``). + Embedding = auto() diff --git a/valentine/algorithms/jaccard_distance/jaccard_distance.py b/valentine/algorithms/jaccard_distance/jaccard_distance.py index 890e0a8..09b5e3c 100644 --- a/valentine/algorithms/jaccard_distance/jaccard_distance.py +++ b/valentine/algorithms/jaccard_distance/jaccard_distance.py @@ -1,4 +1,5 @@ -from itertools import product +from functools import lru_cache +from itertools import combinations, product import numpy as np from rapidfuzz import process @@ -27,6 +28,31 @@ } +@lru_cache(maxsize=4) +def _load_sentence_transformer(model_name: str, device: str | None): + """Lazily load and cache a SentenceTransformer model on a device. + + Importing inside the function keeps ``sentence-transformers`` an + optional dependency: the rest of this module — and every other + ``StringDistanceFunction`` value — works without it installed. + + ``device`` is passed straight through to ``SentenceTransformer``. 
+ ``None`` lets the library auto-pick (typically ``cuda`` if available, + else ``mps`` on Apple Silicon, else ``cpu``). Pass ``"cpu"``, + ``"cuda"``, ``"cuda:1"``, or ``"mps"`` to force a specific device. + The cache is keyed by ``(model_name, device)`` so switching devices + does not silently reuse a model loaded elsewhere. + """ + try: + from sentence_transformers import SentenceTransformer + except ImportError as exc: # pragma: no cover - depends on optional extra + raise ImportError( + "StringDistanceFunction.Embedding requires the 'sentence-transformers' " + "package. Install it with: pip install 'valentine[embeddings]'" + ) from exc + return SentenceTransformer(model_name, device=device) + + class JaccardDistanceMatcher(BaseMatcher): """Baseline instance-based matcher using Jaccard similarity. @@ -40,21 +66,58 @@ class JaccardDistanceMatcher(BaseMatcher): Acceptance threshold above which two string values are considered equal under the chosen ``distance_fun``, in ``[0, 1]`` (default: ``0.8``). Ignored when ``distance_fun`` is - :attr:`StringDistanceFunction.Exact`. + :attr:`StringDistanceFunction.Exact`. For + :attr:`StringDistanceFunction.Embedding`, the threshold is + applied to cosine similarity of sentence-transformer embeddings; + ~0.7 is a typical operating point. distance_fun : StringDistanceFunction, optional String similarity function. One of :attr:`StringDistanceFunction.Levenshtein` (default), :attr:`StringDistanceFunction.DamerauLevenshtein`, :attr:`StringDistanceFunction.Hamming`, :attr:`StringDistanceFunction.Jaro`, - :attr:`StringDistanceFunction.JaroWinkler`, or - :attr:`StringDistanceFunction.Exact`. + :attr:`StringDistanceFunction.JaroWinkler`, + :attr:`StringDistanceFunction.Exact`, or + :attr:`StringDistanceFunction.Embedding`. process_num : int, optional Number of worker threads passed to ``rapidfuzz.process.cdist`` (must be ``>= 1``, default: ``1``). 
Earlier versions used a ``multiprocessing.Pool``; with rapidfuzz the inner kernel is already C++ and parallelises via OpenMP threads, so the pool is no longer needed. + embedding_model : str, optional + Name of the sentence-transformers model used when + ``distance_fun=StringDistanceFunction.Embedding`` (default: + ``"all-MiniLM-L6-v2"``, a 23 MB / 384-dim model that runs well + on CPU). Ignored for non-embedding distances. + embedding_device : str or None, optional + Device to load the embedding model on. Passed straight through + to ``SentenceTransformer``. ``None`` (the default) lets the + library auto-detect — usually ``"cuda"`` if a GPU is present, + ``"mps"`` on Apple Silicon, otherwise ``"cpu"``. Pass + ``"cpu"``, ``"cuda"``, ``"cuda:1"``, or ``"mps"`` to force a + specific device. Ignored for non-embedding distances. + embedding_batch_size : int or None, optional + Batch size used for the global ``model.encode`` call. ``None`` + (the default) does not pass the kwarg, letting + sentence-transformers use its own default (``32``). Pass an + explicit value (e.g. ``128`` or ``256``) to amortise per-call + overhead when encoding large vocabularies on capable hardware. + Ignored for non-embedding distances. + tversky_alpha : float, optional + Tversky penalty for unmatched values on the *reference* side + (default: ``1.0``). The pair-similarity reduction is + ``T(A, B; α, β) = |A∩B| / (|A∩B| + α·|A−B| + β·|B−A|)``, + symmetrised by computing both ``T(A, B)`` and ``T(B, A)`` and + taking the max so the matcher remains direction-agnostic. With + ``α = β = 1.0`` this reduces to Jaccard; with ``α = 1.0, + β = 0.0`` (or vice versa) it reduces to ``max(|∩|/|A|, |∩|/|B|)``, + i.e. set containment — the right choice when one column is + expected to be a subset of the other. Intermediate values trade + off between these extremes. + tversky_beta : float, optional + Tversky penalty for unmatched values on the *other* side + (default: ``1.0``). See ``tversky_alpha``. 
""" def __init__( @@ -62,10 +125,31 @@ def __init__( threshold_dist: float = 0.8, distance_fun: StringDistanceFunction = StringDistanceFunction.Levenshtein, process_num: int = 1, + embedding_model: str = "all-MiniLM-L6-v2", + embedding_device: str | None = None, + embedding_batch_size: int | None = None, + tversky_alpha: float = 1.0, + tversky_beta: float = 1.0, ): self.__threshold_dist = float(threshold_dist) self.__process_num = int(process_num) self.__distance_function = distance_fun + self.__embedding_model_name = str(embedding_model) + self.__embedding_device = embedding_device + if embedding_batch_size is not None and embedding_batch_size < 1: + raise ValueError( + f"embedding_batch_size must be >= 1 or None, got {embedding_batch_size}" + ) + self.__embedding_batch_size = ( + None if embedding_batch_size is None else int(embedding_batch_size) + ) + if tversky_alpha < 0.0 or tversky_beta < 0.0: + raise ValueError( + f"tversky_alpha and tversky_beta must be >= 0, " + f"got alpha={tversky_alpha}, beta={tversky_beta}" + ) + self.__tversky_alpha = float(tversky_alpha) + self.__tversky_beta = float(tversky_beta) if not 0.0 <= self.__threshold_dist <= 1.0: raise ValueError( f"threshold_dist must be between 0.0 and 1.0, got {self.__threshold_dist}" @@ -74,17 +158,105 @@ def __init__( raise ValueError(f"process_num must be >= 1, got {self.__process_num}") def get_matches(self, source_input: BaseTable, target_input: BaseTable) -> dict: + col_embeddings = self.__build_col_embeddings([source_input, target_input]) + return self.__match_pair(source_input, target_input, col_embeddings) + + def get_matches_batch(self, tables: list[BaseTable]) -> dict: + """Match all unique table pairs, sharing one global embedding pass. + + For ``StringDistanceFunction.Embedding`` this means each unique + string across every column of every table is encoded exactly + once. With other distances the override is equivalent to the + default ``BaseMatcher.get_matches_batch``. 
+ """ + col_embeddings = self.__build_col_embeddings(tables) + matches: dict = {} + for t1, t2 in combinations(tables, 2): + matches.update(self.__match_pair(t1, t2, col_embeddings)) + return matches + + def __match_pair( + self, + source_input: BaseTable, + target_input: BaseTable, + col_embeddings: dict[tuple[str, str], tuple[list[str], np.ndarray]] | None, + ) -> dict: matches: dict = {} for combination in self.__get_column_combinations( source_input, target_input, self.__threshold_dist, self.__distance_function, + col_embeddings, ): matches.update(self.process_jaccard_distance(combination)) # Remove the pairs with zero similarity return {k: v for k, v in matches.items() if v > 0.0} + def __build_col_embeddings( + self, tables: list[BaseTable] + ) -> dict[tuple[str, str], tuple[list[str], np.ndarray]] | None: + """Encode every column across every table with one batched call. + + Returns a ``(table_name, column_name) -> (values, embeddings)`` + map, or ``None`` when the chosen distance is not embedding-based. + + Two layers of deduplication keep this cheap: + + - **Per-column**: the column's value set is converted to a sorted + list of unique strings (deterministic for repeated runs). + - **Global vocabulary**: identical strings appearing in many + columns are encoded only once, then sliced back out. + + The encode itself is a single ``model.encode`` call with a large + batch size, which dominates the speedup over per-column encoding. + """ + if self.__distance_function != StringDistanceFunction.Embedding: + return None + + # Collect per-column unique values, deterministically ordered. + col_values: dict[tuple[str, str], list[str]] = {} + for table in tables: + for column in table.get_instances_columns(): + key = (table.name, column.name) + if key in col_values: + continue + col_values[key] = sorted({str(v) for v in column.data}) + + # Build a global vocabulary: each unique string is encoded once. 
+ vocab: dict[str, int] = {} + for values in col_values.values(): + for v in values: + if v not in vocab: + vocab[v] = len(vocab) + + if not vocab: + return {key: (values, np.zeros((0, 0), dtype=np.float32)) for key, values in col_values.items()} + + model = _load_sentence_transformer( + self.__embedding_model_name, self.__embedding_device + ) + encode_kwargs: dict = { + "normalize_embeddings": True, + "show_progress_bar": False, + "convert_to_numpy": True, + } + if self.__embedding_batch_size is not None: + encode_kwargs["batch_size"] = self.__embedding_batch_size + all_embeddings = model.encode(list(vocab.keys()), **encode_kwargs).astype( + np.float32 + ) + + dim = all_embeddings.shape[1] + out: dict[tuple[str, str], tuple[list[str], np.ndarray]] = {} + for key, values in col_values.items(): + if not values: + out[key] = (values, np.zeros((0, dim), dtype=np.float32)) + continue + indices = [vocab[v] for v in values] + out[key] = (values, all_embeddings[indices]) + return out + def process_jaccard_distance(self, tup: tuple): ( source_data, @@ -95,8 +267,26 @@ def process_jaccard_distance(self, tup: tuple): source_table_name, source_column_name, distance_function, + embeddings, ) = tup + if distance_function == StringDistanceFunction.Embedding: + sim = self.__embedding_similarity( + embeddings, + source_table_name, + source_column_name, + target_table_name, + target_column_name, + threshold, + ) + return Match( + target_table_name, + target_column_name, + source_table_name, + source_column_name, + sim, + ).to_dict + set1 = {str(x) for x in source_data} set2 = {str(x) for x in target_data} # Iterate over the smaller set as queries: cdist scales with @@ -105,9 +295,11 @@ def process_jaccard_distance(self, tup: tuple): set1, set2 = set2, set1 if distance_function == StringDistanceFunction.Exact: - intersection_cnt = len(set1 & set2) + # Exact match is symmetric — both sides see the same intersection. 
+ inter = len(set1 & set2) + a_match = b_match = float(inter) elif not set1 or not set2: - intersection_cnt = 0 + a_match = b_match = 0.0 else: scorer = _SCORER_MAP[distance_function] queries = list(set1) @@ -119,14 +311,9 @@ def process_jaccard_distance(self, tup: tuple): score_cutoff=threshold, workers=self.__process_num, ) - # Each query string in set1 contributes 1 to the intersection - # if at least one choice in set2 scores >= threshold. Scores - # below score_cutoff are returned as 0 by rapidfuzz, so the - # comparison is exact even when threshold == 0. - intersection_cnt = int(np.count_nonzero((scores >= threshold).any(axis=1))) + a_match, b_match = self.__directional_counts(scores, threshold) - union_cnt = len(set1) + len(set2) - intersection_cnt - sim = 0.0 if union_cnt == 0 else float(intersection_cnt) / union_cnt + sim = self.__aggregate(a_match, b_match, len(set1), len(set2)) return Match( target_table_name, @@ -136,12 +323,82 @@ def process_jaccard_distance(self, tup: tuple): sim, ).to_dict + def __embedding_similarity( + self, + embeddings: dict[tuple[str, str], tuple[list[str], np.ndarray]], + source_table_name: str, + source_column_name: str, + target_table_name: str, + target_column_name: str, + threshold: float, + ) -> float: + """Tversky-reduced set similarity using cosine on embeddings. + + Two values are treated as "matched" when their cosine similarity + is ``>= threshold``. Both directional match counts come from the + same ``sims`` matrix and are reduced via Tversky. + """ + src_values, src_emb = embeddings[(source_table_name, source_column_name)] + tgt_values, tgt_emb = embeddings[(target_table_name, target_column_name)] + if not src_values or not tgt_values: + return 0.0 + # Iterate over the smaller side, matching the rapidfuzz branch. + if len(src_values) > len(tgt_values): + src_values, tgt_values = tgt_values, src_values + src_emb, tgt_emb = tgt_emb, src_emb + # Embeddings are L2-normalised at encode-time, so cosine = dot product. 
+ sims = src_emb @ tgt_emb.T + a_match, b_match = self.__directional_counts(sims, threshold) + return self.__aggregate(a_match, b_match, len(src_values), len(tgt_values)) + + @staticmethod + def __directional_counts( + scores: np.ndarray, threshold: float + ) -> tuple[float, float]: + """Count rows / columns whose best entry is at least ``threshold``. + + ``scores[i, j]`` is the similarity between A's i-th value and B's + j-th value. The first value is the count of A-side values with at + least one above-threshold partner; the second is the count on B's + side. + """ + hits = scores >= threshold + return ( + float(np.count_nonzero(hits.any(axis=1))), + float(np.count_nonzero(hits.any(axis=0))), + ) + + def __aggregate( + self, a_match: float, b_match: float, a_size: int, b_size: int + ) -> float: + """Reduce directional match counts to a similarity score via Tversky. + + Uses the asymmetric Tversky index in both directions and returns + the larger of the two so the matcher stays direction-agnostic: + + T(A, B; α, β) = a_match / (a_match + α·(|A|−a_match) + β·(|B|−b_match)) + + With α = β = 1 this is Jaccard; α = 1, β = 0 (or vice versa) + recovers ``max(|∩|/|A|, |∩|/|B|)`` containment. 
+ """ + if a_size == 0 or b_size == 0: + return 0.0 + alpha, beta = self.__tversky_alpha, self.__tversky_beta + a_unmatched = max(a_size - a_match, 0.0) + b_unmatched = max(b_size - b_match, 0.0) + denom_ab = a_match + alpha * a_unmatched + beta * b_unmatched + denom_ba = b_match + alpha * b_unmatched + beta * a_unmatched + t_ab = 0.0 if denom_ab <= 0.0 else a_match / denom_ab + t_ba = 0.0 if denom_ba <= 0.0 else b_match / denom_ba + return float(max(t_ab, t_ba)) + @staticmethod def __get_column_combinations( source_table: BaseTable, target_table: BaseTable, threshold, distance_function: StringDistanceFunction, + col_embeddings: dict[tuple[str, str], tuple[list[str], np.ndarray]] | None, ): for source_column, target_column in product( source_table.get_instances_columns(), target_table.get_instances_columns() @@ -155,4 +412,5 @@ def __get_column_combinations( source_table.name, source_column.name, distance_function, + col_embeddings, ) diff --git a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py index 928a3de..491a3ae 100644 --- a/valentine/algorithms/matcher_results.py +++ b/valentine/algorithms/matcher_results.py @@ -37,7 +37,10 @@ def __init__( sorted_matches = dict(sorted(matches.items(), key=lambda x: x[1], reverse=True)) self._data: dict[ColumnPair, float] = sorted_matches self._details: dict[ColumnPair, dict[str, float]] = details or {} - self._cached_one_to_one: MatcherResults | None = None + # Cached default 1:1 selection (Hungarian, since it is the default + # filter used by Precision / Recall / F1Score). Greedy and mutual + # variants are niche and not cached. 
+ self._cached_hungarian: MatcherResults | None = None # -- Mapping protocol -------------------------------------------------- @@ -87,34 +90,118 @@ def get_details(self, key: ColumnPair) -> dict[str, float] | None: # -- Transformations --------------------------------------------------- - def one_to_one(self, threshold: float | None = None) -> MatcherResults: - """Filter to one-to-one column matches. + def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults: + """Globally optimal 1:1 column matching via Hungarian assignment. - Starting from the highest-scoring pair, greedily assigns each source - and target column at most one match. Pairs below ``threshold`` are - discarded. When ``threshold`` is ``None`` (the default), the median - similarity score is used. + This is the **default** 1:1 selector — it is what + :class:`Precision` / :class:`Recall` / :class:`F1Score` call when + their ``one_to_one`` flag is set. Each source and target appears + in at most one returned pair, with the assignment chosen to + maximise **total** similarity over all valid one-to-one + assignments. Cost is O(n³) on column counts via + ``scipy.optimize.linear_sum_assignment`` — negligible for + typical schema sizes — and almost always strictly better than + the greedy variant. Parameters ---------- threshold : float | None - Minimum similarity to keep. If None, uses the median score. + Minimum similarity to keep. If ``None``, uses the median + similarity score. Returns ------- MatcherResults - A new instance with one-to-one matches only. + A new instance with the Hungarian-optimal one-to-one + assignment, post-thresholding. 
""" - if threshold is None and self._cached_one_to_one is not None: - return self._cached_one_to_one + if threshold is None and self._cached_hungarian is not None: + return self._cached_hungarian + if not self._data: + result = MatcherResults({}) + if threshold is None: + self._cached_hungarian = result + return result - set_match_values = set(self._data.values()) + # Stable index of unique sources and targets. + sources: list[tuple[str, str]] = [] + source_idx: dict[tuple[str, str], int] = {} + targets: list[tuple[str, str]] = [] + target_idx: dict[tuple[str, str], int] = {} + for cp in self._data: + if cp.source not in source_idx: + source_idx[cp.source] = len(sources) + sources.append(cp.source) + if cp.target not in target_idx: + target_idx[cp.target] = len(targets) + targets.append(cp.target) + + m, n = len(sources), len(targets) + sim = [[0.0] * n for _ in range(m)] + pair_lookup: dict[tuple, ColumnPair] = {} + for cp, score in self._data.items(): + sim[source_idx[cp.source]][target_idx[cp.target]] = score + pair_lookup[(cp.source, cp.target)] = cp + + # Hungarian minimises cost; we want max similarity. 
+ from scipy.optimize import linear_sum_assignment + + cost = [[-s for s in row] for row in sim] + row_ind, col_ind = linear_sum_assignment(cost) + set_match_values = set(self._data.values()) if len(set_match_values) < 2: result = MatcherResults(dict(self._data), details=dict(self._details)) if threshold is None: - self._cached_one_to_one = result + self._cached_hungarian = result return result + if threshold is None: + min_sim = sorted(set_match_values, reverse=True)[math.ceil(len(set_match_values) / 2)] + else: + min_sim = threshold + + selected: dict[ColumnPair, float] = {} + for r, c in zip(row_ind, col_ind, strict=False): + cp = pair_lookup.get((sources[r], targets[c])) + if cp is None: + continue # no actual pair at this (s, t) + score = self._data[cp] + if score >= min_sim: + selected[cp] = score + + filtered_details = {k: v for k, v in self._details.items() if k in selected} + result = MatcherResults(selected, details=filtered_details) + if threshold is None: + self._cached_hungarian = result + return result + + def one_to_one_greedy(self, threshold: float | None = None) -> MatcherResults: + """Greedy 1:1 column matching, kept for backwards compatibility. + + Starting from the highest-scoring pair, greedily assigns each + source and target column at most one match. Pairs below + ``threshold`` are discarded. When ``threshold`` is ``None`` (the + default), the median similarity score is used. + + Greedy can lock in a locally-best pair that blocks a better + globally-optimal assignment, so :meth:`one_to_one_hungarian` is + the recommended default; this method is exposed for + compatibility and for test pinning. + + Parameters + ---------- + threshold : float | None + Minimum similarity to keep. If ``None``, uses the median score. + + Returns + ------- + MatcherResults + A new instance with the greedy 1:1 assignment. 
+ """ + set_match_values = set(self._data.values()) + + if len(set_match_values) < 2: + return MatcherResults(dict(self._data), details=dict(self._details)) matched: dict[tuple[str, str], bool] = {} for key in self._data: @@ -137,10 +224,55 @@ def one_to_one(self, threshold: float | None = None) -> MatcherResults: break filtered_details = {k: v for k, v in self._details.items() if k in matches1to1} - result = MatcherResults(matches1to1, details=filtered_details) - if threshold is None: - self._cached_one_to_one = result - return result + return MatcherResults(matches1to1, details=filtered_details) + + def one_to_one_mutual_top(self, n: int = 1) -> MatcherResults: + """Keep pairs where each side ranks the other in its top *n*. + + Pair ``(s, t)`` survives iff ``t`` is among ``s``'s ``n`` highest- + scoring targets AND ``s`` is among ``t``'s ``n`` highest-scoring + sources. With ``n=1`` this is the classic mutual nearest- + neighbour filter — high-precision, drops one-sided affinities. + Strictly stricter than :meth:`one_to_one_hungarian`: only + mutually-confirmed pairs survive, even at the cost of recall. + + Parameters + ---------- + n : int + Top-n cutoff per side (default 1 = mutual nearest neighbour). + + Returns + ------- + MatcherResults + A new instance with only the mutually-confirmed pairs. 
+ """ + if n < 1: + raise ValueError(f"n must be >= 1, got {n}") + if not self._data: + return MatcherResults({}) + + by_source: dict[tuple[str, str], list[tuple[float, tuple[str, str]]]] = {} + by_target: dict[tuple[str, str], list[tuple[float, tuple[str, str]]]] = {} + for cp, score in self._data.items(): + by_source.setdefault(cp.source, []).append((score, cp.target)) + by_target.setdefault(cp.target, []).append((score, cp.source)) + + src_top: dict[tuple[str, str], set] = {} + for s, lst in by_source.items(): + lst.sort(reverse=True) + src_top[s] = {t for _, t in lst[:n]} + tgt_top: dict[tuple[str, str], set] = {} + for t, lst in by_target.items(): + lst.sort(reverse=True) + tgt_top[t] = {s for _, s in lst[:n]} + + selected: dict[ColumnPair, float] = {} + for cp, score in self._data.items(): + if cp.target in src_top.get(cp.source, set()) and cp.source in tgt_top.get(cp.target, set()): + selected[cp] = score + + filtered_details = {k: v for k, v in self._details.items() if k in selected} + return MatcherResults(selected, details=filtered_details) def filter(self, min_score: float) -> MatcherResults: """Filter matches by minimum similarity score. @@ -223,6 +355,7 @@ def get_metrics( self, ground_truth: list[tuple[str, str]] | list[ColumnPair], metrics: set[Metric] = METRICS_CORE, + one_to_one_method: str = "hungarian", ) -> dict[str, Any]: """Compute evaluation metrics against a ground truth. @@ -235,6 +368,10 @@ def get_metrics( comparison. metrics : set[Metric], optional Set of metric instances to compute (default: ``METRICS_CORE``). + one_to_one_method : {"greedy", "hungarian", "mutual_top"} + Selection algorithm passed to each metric's ``apply`` method + for use when the metric's ``one_to_one`` flag is ``True`` + (default: ``"hungarian"``). 
Returns ------- @@ -243,7 +380,7 @@ def get_metrics( """ res: dict[str, Any] = {} for metric in metrics: - res.update(metric.apply(self, ground_truth)) + res.update(metric.apply(self, ground_truth, one_to_one_method=one_to_one_method)) return res # -- Copies ------------------------------------------------------------ diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py index 22caa54..25a5946 100644 --- a/valentine/metrics/base_metric.py +++ b/valentine/metrics/base_metric.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: from ..algorithms.matcher_results import MatcherResults @@ -12,6 +12,10 @@ from dataclasses import dataclass from typing import Any, final +# Valid choices for the 1:1 selection algorithm used by Precision / +# Recall / F1Score / PrecisionTopNPercent when ``one_to_one=True``. +OneToOneMethod = Literal["greedy", "hungarian", "mutual_top"] + @dataclass(eq=True, frozen=True) class Metric(ABC): @@ -25,6 +29,7 @@ def apply( self: Metric, matches: MatcherResults, ground_truth: list[tuple[str, str]] | list, + one_to_one_method: OneToOneMethod = "hungarian", ) -> dict[str, Any]: """Apply the metric to a ``MatcherResults`` instance, given ground truth. @@ -37,6 +42,13 @@ def apply( ``[("src_col", "tgt_col"), ...]`` (table names ignored during comparison) or full :class:`~valentine.algorithms.ColumnPair` instances for table-aware comparison. + one_to_one_method : {"greedy", "hungarian", "mutual_top"} + Selection algorithm used when the metric's ``one_to_one`` + field is ``True``. Defaults to ``"hungarian"`` (globally + optimal). ``"greedy"`` matches the legacy behaviour; + ``"mutual_top"`` keeps only mutually-confirmed pairs (top-1 + on each side). Ignored when ``one_to_one`` is ``False`` or + for metrics that do not apply 1:1 filtering. 
""" pass diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py index 93a33d4..e7342c1 100644 --- a/valentine/metrics/metric_helpers.py +++ b/valentine/metrics/metric_helpers.py @@ -5,6 +5,26 @@ if TYPE_CHECKING: from ..algorithms.match import ColumnPair from ..algorithms.matcher_results import MatcherResults + from .base_metric import OneToOneMethod + + +def _apply_one_to_one(matches: MatcherResults, method: OneToOneMethod) -> MatcherResults: + """Dispatch ``matches`` through the requested 1:1 selection algorithm. + + ``method`` is one of ``"greedy"``, ``"hungarian"``, or ``"mutual_top"``; + invalid values raise ``ValueError``. Mutual-top defaults to ``n=1`` + (mutual nearest neighbour). + """ + if method == "hungarian": + return matches.one_to_one_hungarian() + if method == "greedy": + return matches.one_to_one_greedy() + if method == "mutual_top": + return matches.one_to_one_mutual_top() + raise ValueError( + f"Unknown one_to_one_method: {method!r}; " + "expected 'greedy', 'hungarian', or 'mutual_top'" + ) def _normalize_ground_truth( diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py index 33240e1..9d8f8e6 100644 --- a/valentine/metrics/metrics.py +++ b/valentine/metrics/metrics.py @@ -10,8 +10,14 @@ from dataclasses import dataclass from typing import Any -from .base_metric import Metric -from .metric_helpers import _matches_as_tuples, _normalize_ground_truth, get_fp, get_tp_fn +from .base_metric import Metric, OneToOneMethod +from .metric_helpers import ( + _apply_one_to_one, + _matches_as_tuples, + _normalize_ground_truth, + get_fp, + get_tp_fn, +) # Public exports __all__ = [ @@ -45,9 +51,14 @@ class Precision(Metric): one_to_one: bool = True - def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]: + def apply( + self, + matches: Any, + ground_truth: GroundTruth, + one_to_one_method: OneToOneMethod = "hungarian", + ) -> dict[str, float]: if self.one_to_one: - matches = 
matches.one_to_one() + matches = _apply_one_to_one(matches, one_to_one_method) tp, _ = get_tp_fn(matches, ground_truth) fp = get_fp(matches, ground_truth) @@ -67,9 +78,14 @@ class Recall(Metric): one_to_one: bool = True - def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]: + def apply( + self, + matches: Any, + ground_truth: GroundTruth, + one_to_one_method: OneToOneMethod = "hungarian", + ) -> dict[str, float]: if self.one_to_one: - matches = matches.one_to_one() + matches = _apply_one_to_one(matches, one_to_one_method) tp, fn = get_tp_fn(matches, ground_truth) recall = _safe_div(tp, tp + fn) @@ -88,9 +104,14 @@ class F1Score(Metric): one_to_one: bool = True - def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]: + def apply( + self, + matches: Any, + ground_truth: GroundTruth, + one_to_one_method: OneToOneMethod = "hungarian", + ) -> dict[str, float]: if self.one_to_one: - matches = matches.one_to_one() + matches = _apply_one_to_one(matches, one_to_one_method) tp, fn = get_tp_fn(matches, ground_truth) fp = get_fp(matches, ground_truth) @@ -120,9 +141,14 @@ def name(self) -> str: # Replace the 'N' in the base name with the chosen percent, e.g. "PrecisionTop70Percent". return super().name().replace("N", str(self.n)) - def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]: + def apply( + self, + matches: Any, + ground_truth: GroundTruth, + one_to_one_method: OneToOneMethod = "hungarian", + ) -> dict[str, float]: if self.one_to_one: - matches = matches.one_to_one() + matches = _apply_one_to_one(matches, one_to_one_method) # Clamp N to a sensible range without mutating the dataclass. 
n_clamped = min(100, max(0, int(self.n))) @@ -150,9 +176,14 @@ class RecallAtSizeofGroundTruth(Metric): one_to_one: bool = False - def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]: + def apply( + self, + matches: Any, + ground_truth: GroundTruth, + one_to_one_method: OneToOneMethod = "hungarian", + ) -> dict[str, float]: if self.one_to_one: - matches = matches.one_to_one() + matches = _apply_one_to_one(matches, one_to_one_method) n_matches = matches.take_top_n(len(ground_truth)) tp, fn = get_tp_fn(n_matches, ground_truth) recall = _safe_div(tp, tp + fn) @@ -176,9 +207,14 @@ class MeanReciprocalRank(Metric): one_to_one: bool = False - def apply(self, matches: Any, ground_truth: GroundTruth) -> dict[str, float]: + def apply( + self, + matches: Any, + ground_truth: GroundTruth, + one_to_one_method: OneToOneMethod = "hungarian", + ) -> dict[str, float]: if self.one_to_one: - matches = matches.one_to_one() + matches = _apply_one_to_one(matches, one_to_one_method) gt_pairs, table_aware = _normalize_ground_truth(ground_truth) ranked = _matches_as_tuples(matches, table_aware) From bad9fc843aa40c2c2150f59961775d0bbdf7ecbc Mon Sep 17 00:00:00 2001 From: Christos Koutras Date: Mon, 4 May 2026 09:50:00 -0400 Subject: [PATCH 02/13] update readme --- README.md | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d33087d..f472671 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,12 @@ To enable **Polars** support, install the optional extra: pip install valentine[polars] ``` +To enable the **sentence-transformer embedding** distance for `JaccardDistanceMatcher` (see below), install: + +```shell +pip install valentine[embeddings] +``` + ## Usage Valentine can be used to find matches among columns of a given pair of pandas or Polars DataFrames. You can even mix pandas and Polars frames in the same call — Valentine auto-detects the frame type. 
@@ -89,17 +95,21 @@ In order to do so, the user can choose one of the following matching methods: * **threshold1**(*float*) - The threshold for phase 1 of the method, default is 0.15. * **threshold2**(*float*) - The threshold for phase 2 of the method, default is 0.15. -4. `JaccardDistanceMatcher(float: threshold_dist)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, optionally enhanced by a string similarity measure of choice. +4. `JaccardDistanceMatcher(...)` is a baseline method that scores column pairs by **Tversky** similarity over their value sets (Jaccard by default). Element equality between values can be decided by a configurable string distance function, including a sentence-transformer **embedding** option for semantic matching. * **Parameters**: - * **threshold_dist**(*float*) - Acceptance threshold for assessing two strings as equal, default is 0.8. - - * **distance_fun**(*StringDistanceFunction*) - String similarity function used to assess whether two strings are equal. The enumeration class type `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. Functions currently supported are: - * `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) + * **threshold_dist**(*float*) - Acceptance threshold above which two values are considered equal under the chosen `distance_fun`, default is 0.8. For embeddings, ~0.7 is a typical operating point. + * **distance_fun**(*StringDistanceFunction*) - Per-value similarity function. The enumeration class `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. 
Functions currently supported are: + * `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) (default) * `StringDistanceFunction.DamerauLevenshtein`: [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) * `StringDistanceFunction.Hamming`: [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) * `StringDistanceFunction.Jaro`: [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * `StringDistanceFunction.JaroWinkler`: [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) - * `StringDistanceFunction.Exact`: String equality `==` + * `StringDistanceFunction.Exact`: String equality `==` + * `StringDistanceFunction.Embedding`: cosine similarity on sentence-transformer embeddings (requires the `valentine[embeddings]` extra) + * **tversky_alpha**(*float*) / **tversky_beta**(*float*) - Tversky penalty weights for unmatched values on each side (defaults `1.0`, `1.0`). Defaults give Jaccard; `0.5, 0.5` gives Sørensen-Dice; `1.0, 0.0` (or vice versa) gives set containment — useful when one column is expected to be a subset of the other. + * **embedding_model**(*str*) - Sentence-transformers model name when `distance_fun=Embedding` (default `"all-MiniLM-L6-v2"`). + * **embedding_device**(*str* or *None*) - Device override (`"cpu"`, `"cuda"`, `"mps"`). `None` (default) auto-picks: cuda → mps → cpu. + * **embedding_batch_size**(*int* or *None*) - Encode batch size; `None` uses the sentence-transformers default (32). Larger values amortise per-call overhead on capable hardware. 5. 
`SimilarityFlooding(Policy: coeff_policy, Formula: formula, StringMatcher: string_matcher)` is the python implementation of the paper [Similarity Flooding: A Versatile Graph Matching Algorithmand its Application to Schema Matching](https://ieeexplore.ieee.org/document/994702) * **Parameters**: @@ -137,11 +147,17 @@ for pair, score in matches.items(): ```python top_n_matches = matches.take_top_n(5) top_n_percent_matches = matches.take_top_percent(25) -one_to_one_matches = matches.one_to_one_hungarian() high_confidence = matches.filter(min_score=0.7) + +# One-to-one selectors — three flavours, pick the one that fits your task: +one_to_one_matches = matches.one_to_one_hungarian() # globally optimal (default) one_to_one_strict = matches.one_to_one_hungarian(threshold=0.5) +greedy_legacy = matches.one_to_one_greedy() # legacy greedy assignment +mutual_only = matches.one_to_one_mutual_top(n=1) # mutual nearest neighbour ``` +`one_to_one_hungarian` (Hungarian assignment via `scipy.optimize.linear_sum_assignment`) is the recommended default and is what `Precision` / `Recall` / `F1Score` apply when their `one_to_one` flag is set. `one_to_one_greedy` preserves the legacy greedy behaviour for backwards compatibility. `one_to_one_mutual_top(n)` keeps a pair only when each side ranks the other in its top-`n` — a high-precision filter that drops one-sided affinities. 
+ ### Match details (Coma) When using the Coma matcher, per-sub-matcher score breakdowns are available via `.details`: @@ -175,6 +191,15 @@ metrics_custom = matches.get_metrics(ground_truth, metrics={F1Score(one_to_one=F metrics_predefined_set = matches.get_metrics(ground_truth, metrics=METRICS_PRECISION_INCREASING_N) ``` +The 1:1 selection algorithm used when a metric's `one_to_one` flag is `True` can be overridden per call (default `"hungarian"`): + +```python +metrics_strict = matches.get_metrics(ground_truth, metrics={F1Score()}, one_to_one_method="mutual_top") +metrics_legacy = matches.get_metrics(ground_truth, metrics={F1Score()}, one_to_one_method="greedy") +``` + +Valid values are `"hungarian"` (default), `"greedy"`, and `"mutual_top"`. Metrics whose `one_to_one` flag is `False` (e.g. `MeanReciprocalRank`, `RecallAtSizeofGroundTruth`) ignore the argument. + ### Example The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about job candidates, and then 2) how to assess its effectiveness based on a given ground truth. More examples are available in the [`examples/`](https://github.com/delftdata/valentine/tree/master/examples) directory, including a [pandas example](https://github.com/delftdata/valentine/blob/master/examples/valentine_example_pandas.py), a [Polars example](https://github.com/delftdata/valentine/blob/master/examples/valentine_example_polars.py), and a [mixed pandas+Polars example](https://github.com/delftdata/valentine/blob/master/examples/valentine_example_mixed.py). 
From e12b22d2d0e7b681d77f9cc0467d1e9704ecda60 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 12:02:40 +0200 Subject: [PATCH 03/13] apply linter rules --- experiments/bench.py | 7 ++--- .../jaccard_distance/jaccard_distance.py | 31 ++++++++----------- valentine/algorithms/matcher_results.py | 14 +++++---- valentine/metrics/metric_helpers.py | 3 +- 4 files changed, 24 insertions(+), 31 deletions(-) diff --git a/experiments/bench.py b/experiments/bench.py index 292ed64..c409b42 100644 --- a/experiments/bench.py +++ b/experiments/bench.py @@ -26,6 +26,7 @@ from __future__ import annotations import argparse +import importlib.util import json import statistics import sys @@ -78,9 +79,7 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]: ] # Only include the embedding variant when sentence-transformers is # actually importable; otherwise the bench would crash on import. - try: - import sentence_transformers - + if importlib.util.find_spec("sentence_transformers") is not None: builders.append( ( "JaccardDistanceMatcher_emb", @@ -113,8 +112,6 @@ def _matcher_builders() -> list[tuple[str, MatcherFactory]]: ), ) ) - except ImportError: - pass return builders diff --git a/valentine/algorithms/jaccard_distance/jaccard_distance.py b/valentine/algorithms/jaccard_distance/jaccard_distance.py index 09b5e3c..d60dc3a 100644 --- a/valentine/algorithms/jaccard_distance/jaccard_distance.py +++ b/valentine/algorithms/jaccard_distance/jaccard_distance.py @@ -44,7 +44,7 @@ def _load_sentence_transformer(model_name: str, device: str | None): does not silently reuse a model loaded elsewhere. 
""" try: - from sentence_transformers import SentenceTransformer + from sentence_transformers import SentenceTransformer # noqa: PLC0415 except ImportError as exc: # pragma: no cover - depends on optional extra raise ImportError( "StringDistanceFunction.Embedding requires the 'sentence-transformers' " @@ -107,10 +107,10 @@ class JaccardDistanceMatcher(BaseMatcher): tversky_alpha : float, optional Tversky penalty for unmatched values on the *reference* side (default: ``1.0``). The pair-similarity reduction is - ``T(A, B; α, β) = |A∩B| / (|A∩B| + α·|A−B| + β·|B−A|)``, + ``T(A, B; a, b) = |A∩B| / (|A∩B| + a·|A-B| + b·|B-A|)``, symmetrised by computing both ``T(A, B)`` and ``T(B, A)`` and taking the max so the matcher remains direction-agnostic. With - ``α = β = 1.0`` this reduces to Jaccard; with ``α = 1.0, + ``a = b = 1.0`` this reduces to Jaccard; with ``a = 1.0, β = 0.0`` (or vice versa) it reduces to ``max(|∩|/|A|, |∩|/|B|)``, i.e. set containment — the right choice when one column is expected to be a subset of the other. 
Intermediate values trade @@ -231,11 +231,12 @@ def __build_col_embeddings( vocab[v] = len(vocab) if not vocab: - return {key: (values, np.zeros((0, 0), dtype=np.float32)) for key, values in col_values.items()} + return { + key: (values, np.zeros((0, 0), dtype=np.float32)) + for key, values in col_values.items() + } - model = _load_sentence_transformer( - self.__embedding_model_name, self.__embedding_device - ) + model = _load_sentence_transformer(self.__embedding_model_name, self.__embedding_device) encode_kwargs: dict = { "normalize_embeddings": True, "show_progress_bar": False, @@ -243,9 +244,7 @@ def __build_col_embeddings( } if self.__embedding_batch_size is not None: encode_kwargs["batch_size"] = self.__embedding_batch_size - all_embeddings = model.encode(list(vocab.keys()), **encode_kwargs).astype( - np.float32 - ) + all_embeddings = model.encode(list(vocab.keys()), **encode_kwargs).astype(np.float32) dim = all_embeddings.shape[1] out: dict[tuple[str, str], tuple[list[str], np.ndarray]] = {} @@ -352,9 +351,7 @@ def __embedding_similarity( return self.__aggregate(a_match, b_match, len(src_values), len(tgt_values)) @staticmethod - def __directional_counts( - scores: np.ndarray, threshold: float - ) -> tuple[float, float]: + def __directional_counts(scores: np.ndarray, threshold: float) -> tuple[float, float]: """Count rows / columns whose best entry is at least ``threshold``. ``scores[i, j]`` is the similarity between A's i-th value and B's @@ -368,17 +365,15 @@ def __directional_counts( float(np.count_nonzero(hits.any(axis=0))), ) - def __aggregate( - self, a_match: float, b_match: float, a_size: int, b_size: int - ) -> float: + def __aggregate(self, a_match: float, b_match: float, a_size: int, b_size: int) -> float: """Reduce directional match counts to a similarity score via Tversky. 
Uses the asymmetric Tversky index in both directions and returns the larger of the two so the matcher stays direction-agnostic: - T(A, B; α, β) = a_match / (a_match + α·(|A|−a_match) + β·(|B|−b_match)) + T(A, B; a, b) = a_match / (a_match + a·(|A|-a_match) + b·(|B|-b_match)) - With α = β = 1 this is Jaccard; α = 1, β = 0 (or vice versa) + With a = b = 1 this is Jaccard; a = 1, b = 0 (or vice versa) recovers ``max(|∩|/|A|, |∩|/|B|)`` containment. """ if a_size == 0 or b_size == 0: diff --git a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py index 491a3ae..9f6efaf 100644 --- a/valentine/algorithms/matcher_results.py +++ b/valentine/algorithms/matcher_results.py @@ -90,7 +90,7 @@ def get_details(self, key: ColumnPair) -> dict[str, float] | None: # -- Transformations --------------------------------------------------- - def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults: + def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults: # noqa: PLR0912 """Globally optimal 1:1 column matching via Hungarian assignment. This is the **default** 1:1 selector — it is what @@ -144,7 +144,7 @@ def one_to_one_hungarian(self, threshold: float | None = None) -> MatcherResults pair_lookup[(cp.source, cp.target)] = cp # Hungarian minimises cost; we want max similarity. 
- from scipy.optimize import linear_sum_assignment + from scipy.optimize import linear_sum_assignment # noqa: PLC0415 cost = [[-s for s in row] for row in sim] row_ind, col_ind = linear_sum_assignment(cost) @@ -266,10 +266,12 @@ def one_to_one_mutual_top(self, n: int = 1) -> MatcherResults: lst.sort(reverse=True) tgt_top[t] = {s for _, s in lst[:n]} - selected: dict[ColumnPair, float] = {} - for cp, score in self._data.items(): - if cp.target in src_top.get(cp.source, set()) and cp.source in tgt_top.get(cp.target, set()): - selected[cp] = score + selected: dict[ColumnPair, float] = { + cp: score + for cp, score in self._data.items() + if cp.target in src_top.get(cp.source, set()) + and cp.source in tgt_top.get(cp.target, set()) + } filtered_details = {k: v for k, v in self._details.items() if k in selected} return MatcherResults(selected, details=filtered_details) diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py index e7342c1..d47046a 100644 --- a/valentine/metrics/metric_helpers.py +++ b/valentine/metrics/metric_helpers.py @@ -22,8 +22,7 @@ def _apply_one_to_one(matches: MatcherResults, method: OneToOneMethod) -> Matche if method == "mutual_top": return matches.one_to_one_mutual_top() raise ValueError( - f"Unknown one_to_one_method: {method!r}; " - "expected 'greedy', 'hungarian', or 'mutual_top'" + f"Unknown one_to_one_method: {method!r}; expected 'greedy', 'hungarian', or 'mutual_top'" ) From 976436aceb51214437a386a73c24b9e2fae7461c Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 12:36:18 +0200 Subject: [PATCH 04/13] add more tests --- tests/test_coverage_gaps.py | 251 ++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py index 6fd4de4..3b5d784 100644 --- a/tests/test_coverage_gaps.py +++ b/tests/test_coverage_gaps.py @@ -4,6 +4,8 @@ behaviour-focused test files. 
""" +from unittest.mock import MagicMock, patch + import numpy as np import pandas as pd import pytest @@ -23,6 +25,7 @@ from valentine.algorithms.coma.similarity.tfidf import TfidfCorpus from valentine.algorithms.coma.similarity.tokens import tokenize_name, tokens_similarity from valentine.algorithms.cupid.linguistic_matching import _cached_synsets, get_synonyms +from valentine.algorithms.jaccard_distance import StringDistanceFunction from valentine.algorithms.distribution_based.clustering_utils import ( _COLUMN_STORE, _compute_ranks, @@ -436,3 +439,251 @@ def test_ingestion_generator_skips_empty_columns(self, tmp_path): # Only the non-empty column survives. assert len(produced) == 1 assert produced[0][0] == "full" + + +# -- JaccardDistanceMatcher parameter validation ---------------------------- + + +class TestJaccardParameterValidation: + def test_embedding_batch_size_zero_raises(self): + with pytest.raises(ValueError, match="embedding_batch_size"): + JaccardDistanceMatcher(embedding_batch_size=0) + + def test_embedding_batch_size_negative_raises(self): + with pytest.raises(ValueError, match="embedding_batch_size"): + JaccardDistanceMatcher(embedding_batch_size=-1) + + def test_tversky_alpha_negative_raises(self): + with pytest.raises(ValueError, match="tversky"): + JaccardDistanceMatcher(tversky_alpha=-0.1) + + def test_tversky_beta_negative_raises(self): + with pytest.raises(ValueError, match="tversky"): + JaccardDistanceMatcher(tversky_beta=-0.5) + + +# -- JaccardDistanceMatcher embedding path ---------------------------------- + +_EMB_PATCH = "valentine.algorithms.jaccard_distance.jaccard_distance._load_sentence_transformer" + + +def _fake_encoder(dim: int = 4) -> MagicMock: + """Return a mock SentenceTransformer that yields deterministic L2-normalised embeddings.""" + def encode(texts, **kwargs): + rng = np.random.default_rng(0) + emb = rng.random((len(texts), dim)).astype(np.float32) + norms = np.linalg.norm(emb, axis=1, keepdims=True) + return emb / 
np.where(norms == 0, 1.0, norms) + + mock = MagicMock() + mock.encode.side_effect = encode + return mock + + +class TestJaccardEmbeddingPath: + @patch(_EMB_PATCH) + def test_embedding_produces_matches(self, mock_load): + mock_load.return_value = _fake_encoder() + d1 = DataframeTable(pd.DataFrame({"col": ["alpha", "beta", "gamma"]}), name="A") + d2 = DataframeTable(pd.DataFrame({"col": ["alpha", "delta", "epsilon"]}), name="B") + matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.0 + ) + assert len(matcher.get_matches(d1, d2)) > 0 + + @patch(_EMB_PATCH) + def test_embedding_encode_called_once_globally(self, mock_load): + mock = _fake_encoder() + mock_load.return_value = mock + d1 = DataframeTable(pd.DataFrame({"c1": ["a", "b"], "c2": ["c", "d"]}), name="A") + d2 = DataframeTable(pd.DataFrame({"c1": ["e", "f"], "c2": ["g", "h"]}), name="B") + matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.0 + ) + matcher.get_matches(d1, d2) + assert mock.encode.call_count == 1 + assert set(mock.encode.call_args[0][0]) == {"a", "b", "c", "d", "e", "f", "g", "h"} + + @patch(_EMB_PATCH) + def test_embedding_batch_size_forwarded_to_encode(self, mock_load): + mock = _fake_encoder() + mock_load.return_value = mock + d1 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="A") + d2 = DataframeTable(pd.DataFrame({"col": ["z", "w"]}), name="B") + matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + threshold_dist=0.0, + embedding_batch_size=32, + ) + matcher.get_matches(d1, d2) + assert mock.encode.call_args[1].get("batch_size") == 32 + + @patch(_EMB_PATCH) + def test_embedding_no_batch_size_not_forwarded(self, mock_load): + mock = _fake_encoder() + mock_load.return_value = mock + d1 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="A") + d2 = DataframeTable(pd.DataFrame({"col": ["z", "w"]}), name="B") + matcher = JaccardDistanceMatcher( + 
distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.0 + ) + matcher.get_matches(d1, d2) + assert "batch_size" not in mock.encode.call_args[1] + + def test_all_empty_columns_skips_encode(self): + # vocab is empty → early return before _load_sentence_transformer is called, + # so no ImportError even though sentence_transformers is not installed. + d1 = DataframeTable(pd.DataFrame({"col": pd.Series([], dtype="object")}), name="A") + d2 = DataframeTable(pd.DataFrame({"col": pd.Series([], dtype="object")}), name="B") + matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.5 + ) + results = matcher.get_matches(d1, d2) + assert all(score == 0.0 for score in results.values()) + + @patch(_EMB_PATCH) + def test_empty_source_column_similarity_is_zero(self, mock_load): + mock_load.return_value = _fake_encoder() + d1 = DataframeTable(pd.DataFrame({"col": pd.Series([], dtype="object")}), name="A") + d2 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="B") + matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, threshold_dist=0.5 + ) + results = matcher.get_matches(d1, d2) + assert all(score == 0.0 for score in results.values()) + + +# -- MatcherResults.one_to_one_hungarian caching & threshold ---------------- + + +class TestHungarianCachingAndThreshold: + def setup_method(self): + self.data = { + ColumnPair("s", "a", "t", "x"): 0.9, + ColumnPair("s", "b", "t", "y"): 0.8, + ColumnPair("s", "c", "t", "z"): 0.7, + ColumnPair("s", "a", "t", "y"): 0.4, + ColumnPair("s", "b", "t", "z"): 0.3, + ColumnPair("s", "c", "t", "x"): 0.2, + } + self.results = MatcherResults(self.data) + + def test_cache_hit_returns_same_object(self): + first = self.results.one_to_one_hungarian() + second = self.results.one_to_one_hungarian() + assert first is second + + def test_empty_data_result_is_cached(self): + empty = MatcherResults({}) + result = empty.one_to_one_hungarian() + assert len(result) == 0 + assert 
empty._cached_hungarian is result + + def test_empty_data_with_explicit_threshold_not_cached(self): + empty = MatcherResults({}) + empty.one_to_one_hungarian(threshold=0.5) + assert empty._cached_hungarian is None + + def test_explicit_threshold_result_not_cached(self): + self.results.one_to_one_hungarian(threshold=0.8) + assert self.results._cached_hungarian is None + + def test_explicit_threshold_filters_correctly(self): + result = self.results.one_to_one_hungarian(threshold=0.8) + assert all(score >= 0.8 for score in result.values()) + + +# -- MatcherResults.one_to_one_mutual_top ----------------------------------- + + +class TestMutualTopN: + def setup_method(self): + # 3 sources × 3 targets; diagonal pairs are mutual nearest neighbours. + self.data = { + ColumnPair("s", "a", "t", "x"): 0.9, + ColumnPair("s", "b", "t", "y"): 0.8, + ColumnPair("s", "c", "t", "z"): 0.7, + ColumnPair("s", "a", "t", "y"): 0.4, + ColumnPair("s", "b", "t", "z"): 0.3, + ColumnPair("s", "c", "t", "x"): 0.2, + } + self.results = MatcherResults(self.data) + + def test_n_zero_raises(self): + with pytest.raises(ValueError, match="n must be >= 1"): + self.results.one_to_one_mutual_top(n=0) + + def test_n_negative_raises(self): + with pytest.raises(ValueError, match="n must be >= 1"): + self.results.one_to_one_mutual_top(n=-1) + + def test_empty_data_returns_empty(self): + assert len(MatcherResults({}).one_to_one_mutual_top()) == 0 + + def test_n1_keeps_only_mutual_nearest(self): + result = self.results.one_to_one_mutual_top(n=1) + pairs = {(cp.source_column, cp.target_column) for cp in result} + assert pairs == {("a", "x"), ("b", "y"), ("c", "z")} + + def test_n2_admits_more_pairs_than_n1(self): + assert len(self.results.one_to_one_mutual_top(n=2)) >= len( + self.results.one_to_one_mutual_top(n=1) + ) + + def test_details_preserved_for_surviving_pairs(self): + details = {k: {"score": v} for k, v in self.data.items()} + result = MatcherResults(self.data, 
details=details).one_to_one_mutual_top(n=1) + for cp in result: + assert cp in result.details + + +# -- MatcherResults.one_to_one_greedy early-return branch ------------------ + + +class TestGreedyEarlyReturn: + def test_all_identical_scores_returns_all_pairs(self): + # < 2 distinct values → skip threshold logic and return everything. + data = { + ColumnPair("s", "a", "t", "x"): 0.5, + ColumnPair("s", "b", "t", "y"): 0.5, + } + assert len(MatcherResults(data).one_to_one_greedy()) == 2 + + +# -- metric_helpers dispatch & ground-truth normalisation ------------------ + +from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth + + +class TestMetricHelpers: + def _two_pair_results(self): + return MatcherResults({ + ColumnPair("s", "a", "t", "x"): 0.9, + ColumnPair("s", "b", "t", "y"): 0.8, + }) + + def test_apply_invalid_method_raises(self): + with pytest.raises(ValueError, match="Unknown one_to_one_method"): + _apply_one_to_one(self._two_pair_results(), "invalid") + + def test_apply_greedy_dispatches(self): + assert isinstance(_apply_one_to_one(self._two_pair_results(), "greedy"), MatcherResults) + + def test_apply_mutual_top_dispatches(self): + assert isinstance( + _apply_one_to_one(self._two_pair_results(), "mutual_top"), MatcherResults + ) + + def test_normalize_empty_returns_false_flag(self): + pairs, table_aware = _normalize_ground_truth([]) + assert pairs == [] and table_aware is False + + def test_normalize_2field_not_table_aware(self): + pairs, table_aware = _normalize_ground_truth([("src_col", "tgt_col")]) + assert pairs == [("src_col", "tgt_col")] and table_aware is False + + def test_normalize_4field_is_table_aware(self): + pairs, table_aware = _normalize_ground_truth( + [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] + ) + assert pairs == [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] and table_aware is True From 25f6f1cd149eee3e3fef687a36e8914bfae9ec52 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 
12:47:21 +0200 Subject: [PATCH 05/13] apply ruff rules --- tests/test_coverage_gaps.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py index 3b5d784..c61d7a6 100644 --- a/tests/test_coverage_gaps.py +++ b/tests/test_coverage_gaps.py @@ -25,7 +25,6 @@ from valentine.algorithms.coma.similarity.tfidf import TfidfCorpus from valentine.algorithms.coma.similarity.tokens import tokenize_name, tokens_similarity from valentine.algorithms.cupid.linguistic_matching import _cached_synsets, get_synonyms -from valentine.algorithms.jaccard_distance import StringDistanceFunction from valentine.algorithms.distribution_based.clustering_utils import ( _COLUMN_STORE, _compute_ranks, @@ -38,9 +37,11 @@ clear_global_ranks_cache, ) from valentine.algorithms.distribution_based.quantile_histogram import QuantileHistogram +from valentine.algorithms.jaccard_distance import StringDistanceFunction from valentine.algorithms.match import ColumnPair from valentine.algorithms.matcher_results import MatcherResults from valentine.data_sources.dataframe.dataframe_table import DataframeTable +from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth # -- MatcherResults dunder & transformation coverage ------------------------ @@ -469,6 +470,7 @@ def test_tversky_beta_negative_raises(self): def _fake_encoder(dim: int = 4) -> MagicMock: """Return a mock SentenceTransformer that yields deterministic L2-normalised embeddings.""" + def encode(texts, **kwargs): rng = np.random.default_rng(0) emb = rng.random((len(texts), dim)).astype(np.float32) @@ -598,7 +600,7 @@ def test_explicit_threshold_filters_correctly(self): class TestMutualTopN: def setup_method(self): - # 3 sources × 3 targets; diagonal pairs are mutual nearest neighbours. + # 3 sources x 3 targets; diagonal pairs are mutual nearest neighbours. 
self.data = { ColumnPair("s", "a", "t", "x"): 0.9, ColumnPair("s", "b", "t", "y"): 0.8, @@ -652,15 +654,15 @@ def test_all_identical_scores_returns_all_pairs(self): # -- metric_helpers dispatch & ground-truth normalisation ------------------ -from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth - class TestMetricHelpers: def _two_pair_results(self): - return MatcherResults({ - ColumnPair("s", "a", "t", "x"): 0.9, - ColumnPair("s", "b", "t", "y"): 0.8, - }) + return MatcherResults( + { + ColumnPair("s", "a", "t", "x"): 0.9, + ColumnPair("s", "b", "t", "y"): 0.8, + } + ) def test_apply_invalid_method_raises(self): with pytest.raises(ValueError, match="Unknown one_to_one_method"): @@ -670,9 +672,7 @@ def test_apply_greedy_dispatches(self): assert isinstance(_apply_one_to_one(self._two_pair_results(), "greedy"), MatcherResults) def test_apply_mutual_top_dispatches(self): - assert isinstance( - _apply_one_to_one(self._two_pair_results(), "mutual_top"), MatcherResults - ) + assert isinstance(_apply_one_to_one(self._two_pair_results(), "mutual_top"), MatcherResults) def test_normalize_empty_returns_false_flag(self): pairs, table_aware = _normalize_ground_truth([]) @@ -683,7 +683,5 @@ def test_normalize_2field_not_table_aware(self): assert pairs == [("src_col", "tgt_col")] and table_aware is False def test_normalize_4field_is_table_aware(self): - pairs, table_aware = _normalize_ground_truth( - [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] - ) + pairs, table_aware = _normalize_ground_truth([("src_tbl", "src_col", "tgt_tbl", "tgt_col")]) assert pairs == [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] and table_aware is True From c0b2348afab8bcf627d4f57771eb863c1865d962 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 12:49:04 +0200 Subject: [PATCH 06/13] add sentence-transformers in the requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 
d053701..fcc5ec1 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ rapidfuzz==3.14.5 PuLP==3.3.0 POT==0.9.6.post1 scipy==1.17.1 +sentence-transformers==5.4.1 # data loading python-dateutil==2.9.0.post0 # testing From 0842916ba5dcfd7f213817506aee497265c7b1b3 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 13:14:22 +0200 Subject: [PATCH 07/13] add the new methods in the ci and experiment_nyu --- .github/workflows/build.yml | 1 + .github/workflows/ci-build-test-publish.yml | 2 ++ experiments/experiment_nyu.py | 14 +++++++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a0b51d9..deddd07 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,6 +29,7 @@ jobs: python -m pip install --upgrade pip pip install . pip install ".[polars]" || true + pip install ".[embeddings]" pip install pytest==9.0.2 coverage==7.13.5 ruff==0.15.12 - name: Ruff lint (must pass) diff --git a/.github/workflows/ci-build-test-publish.yml b/.github/workflows/ci-build-test-publish.yml index a193801..d95f8ce 100644 --- a/.github/workflows/ci-build-test-publish.yml +++ b/.github/workflows/ci-build-test-publish.yml @@ -61,6 +61,7 @@ jobs: python -m pip install --upgrade pip pip install . 
pip install ".[polars]" || true + pip install ".[embeddings]" - name: Install test deps run: pip install pytest==9.0.2 @@ -106,6 +107,7 @@ jobs: pip install dist/*.tar.gz fi pip install polars || true + pip install "sentence-transformers>=2.0,<6.0" - name: Install test deps run: pip install pytest==9.0.2 diff --git a/experiments/experiment_nyu.py b/experiments/experiment_nyu.py index 1e1a8c7..d38583b 100644 --- a/experiments/experiment_nyu.py +++ b/experiments/experiment_nyu.py @@ -1,3 +1,4 @@ +import importlib.util import json import time from pathlib import Path @@ -12,6 +13,7 @@ JaccardDistanceMatcher, SimilarityFlooding, ) +from valentine.algorithms.jaccard_distance import StringDistanceFunction def _iter_datasets(data_root: Path) -> list[Path]: @@ -43,13 +45,23 @@ def _load_ground_truth(path: Path) -> list[tuple[str, str]]: def _matcher_builders(): - return [ + builders = [ ("Coma", lambda: Coma(use_instances=True)), ("Cupid", Cupid), ("DistributionBased", DistributionBased), ("JaccardDistanceMatcher", JaccardDistanceMatcher), ("SimilarityFlooding", SimilarityFlooding), ] + if importlib.util.find_spec("sentence_transformers") is not None: + builders.append(( + "JaccardDistanceMatcher_emb", + lambda: JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + threshold_dist=0.7, + embedding_device=None, + ), + )) + return builders def main(): From 55bac6d33d44f8e2021ad69815d491c965dfc93c Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 13:31:22 +0200 Subject: [PATCH 08/13] update test workflows --- .github/workflows/ci-build-test-publish.yml | 6 +- tests/test_coverage_gaps.py | 85 +++++++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-build-test-publish.yml b/.github/workflows/ci-build-test-publish.yml index d95f8ce..e1d74d5 100644 --- a/.github/workflows/ci-build-test-publish.yml +++ b/.github/workflows/ci-build-test-publish.yml @@ -66,8 +66,8 @@ jobs: - name: Install test 
deps run: pip install pytest==9.0.2 - - name: Run tests (unittest) - run: python -m unittest discover tests + - name: Run tests + run: python -m pytest -q tests # 3) Test the built wheel on all OS/Python test_wheel: @@ -113,7 +113,7 @@ jobs: run: pip install pytest==9.0.2 - name: Run tests (against installed wheel) - run: python -m unittest discover tests + run: python -m pytest -q tests # 4) Publish to PyPI on Release (tag publish event) publish-to-pypi: diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py index c61d7a6..1897190 100644 --- a/tests/test_coverage_gaps.py +++ b/tests/test_coverage_gaps.py @@ -4,12 +4,15 @@ behaviour-focused test files. """ +import importlib.util from unittest.mock import MagicMock, patch import numpy as np import pandas as pd import pytest +_ST_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None + from tests import df1, df2 from valentine import InvalidMatcherError, valentine_match from valentine.algorithms import ( @@ -685,3 +688,85 @@ def test_normalize_2field_not_table_aware(self): def test_normalize_4field_is_table_aware(self): pairs, table_aware = _normalize_ground_truth([("src_tbl", "src_col", "tgt_tbl", "tgt_col")]) assert pairs == [("src_tbl", "src_col", "tgt_tbl", "tgt_col")] and table_aware is True + + +# -- JaccardDistanceMatcher real embedding integration ---------------------- +# These tests require sentence-transformers and are skipped when it is absent. +# They exercise the actual SentenceTransformer model, unlike the mocked tests +# above — use them to verify the real embedding path works end-to-end. 
+ +@pytest.mark.skipif(not _ST_AVAILABLE, reason="sentence_transformers not installed") +class TestJaccardEmbeddingIntegration: + """Integration tests that load a real SentenceTransformer model.""" + + _MATCHER = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + embedding_device="cpu", + threshold_dist=0.5, + ) + + def test_semantically_similar_columns_match(self): + # "customer_id" / "client_id" and "order_date" / "purchase_date" are + # semantically close; the embedding matcher should return non-zero + # similarity for at least one pair. + d1 = DataframeTable( + pd.DataFrame({"customer_id": ["C1", "C2", "C3"], "order_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}), + name="orders", + ) + d2 = DataframeTable( + pd.DataFrame({"client_id": ["C1", "C2", "C3"], "purchase_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}), + name="purchases", + ) + results = self._MATCHER.get_matches(d1, d2) + assert len(results) > 0 + assert all(0.0 <= score <= 1.0 for score in results.values()) + + def test_identical_values_score_is_high(self): + # Two columns with identical string values should produce a near-1.0 + # embedding similarity because the same text encodes to the same vector. + d1 = DataframeTable(pd.DataFrame({"city": ["London", "Paris", "Berlin"]}), name="A") + d2 = DataframeTable(pd.DataFrame({"city": ["London", "Paris", "Berlin"]}), name="B") + results = self._MATCHER.get_matches(d1, d2) + assert len(results) == 1 + score = next(iter(results.values())) + assert score > 0.9 + + def test_batch_size_produces_same_result(self): + # Results with batch_size=1 must match results with the default batch + # size, verifying that batching does not affect the output. 
+ d1 = DataframeTable(pd.DataFrame({"col": ["alpha", "beta", "gamma"]}), name="A") + d2 = DataframeTable(pd.DataFrame({"col": ["alpha", "delta"]}), name="B") + default_matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + embedding_device="cpu", + threshold_dist=0.5, + ) + batched_matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + embedding_device="cpu", + threshold_dist=0.5, + embedding_batch_size=1, + ) + default_res = default_matcher.get_matches(d1, d2) + batched_res = batched_matcher.get_matches(d1, d2) + assert set(default_res.keys()) == set(batched_res.keys()) + for key in default_res: + assert abs(default_res[key] - batched_res[key]) < 1e-5 + + def test_get_matches_batch_shares_embeddings(self): + # get_matches_batch must encode each unique string exactly once + # across all tables. We verify this indirectly: the result contains + # cross-table pair entries for every (t1, t2) combination. + d1 = DataframeTable(pd.DataFrame({"col": ["x", "y"]}), name="T1") + d2 = DataframeTable(pd.DataFrame({"col": ["x", "z"]}), name="T2") + d3 = DataframeTable(pd.DataFrame({"col": ["y", "z"]}), name="T3") + matcher = JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + embedding_device="cpu", + threshold_dist=0.0, + ) + results = matcher.get_matches_batch([d1, d2, d3]) + table_pairs = {(cp.source_table, cp.target_table) for cp in results} + assert ("T1", "T2") in table_pairs + assert ("T1", "T3") in table_pairs + assert ("T2", "T3") in table_pairs From 8d09d376fd46612814275089c084aabe617e58d6 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 13:37:29 +0200 Subject: [PATCH 09/13] update baseline --- experiments/bench_baseline.json | 127 +++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 42 deletions(-) diff --git a/experiments/bench_baseline.json b/experiments/bench_baseline.json index 9e8ec1d..62249a9 100644 --- a/experiments/bench_baseline.json +++ 
b/experiments/bench_baseline.json @@ -3,260 +3,303 @@ "Coma": { "pairs": { "customers": { - "seconds": 0.0204, + "seconds": 0.005, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0098, + "seconds": 0.0051, "n_matches": 4, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.8 }, "products": { - "seconds": 0.0166, + "seconds": 0.0049, "n_matches": 6, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0125, + "seconds": 0.0054, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0112, + "seconds": 0.0046, "n_matches": 5, "f1": 0.8889, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.0705, - "worst_seconds": 0.0204, + "total_seconds": 0.025, + "worst_seconds": 0.0054, "mean_f1": 0.9556, "mean_mrr": 0.96 }, "Coma_Inst": { "pairs": { "customers": { - "seconds": 0.0306, + "seconds": 0.0142, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0333, + "seconds": 0.0149, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "products": { - "seconds": 0.0233, + "seconds": 0.0116, "n_matches": 6, "f1": 0.9091, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0245, + "seconds": 0.0115, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.022, + "seconds": 0.0101, "n_matches": 5, "f1": 0.8889, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.1337, - "worst_seconds": 0.0333, + "total_seconds": 0.0623, + "worst_seconds": 0.0149, "mean_f1": 0.9596, "mean_mrr": 1.0 }, "Cupid": { "pairs": { "customers": { - "seconds": 2.0127, + "seconds": 1.1821, "n_matches": 6, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.8 }, "orders": { - "seconds": 0.0297, + "seconds": 0.0178, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 }, "products": { - "seconds": 0.0301, + "seconds": 0.019, "n_matches": 6, "f1": 0.9091, "recall_at_gt": 0.8333, "mrr": 0.8333 }, "events": { - "seconds": 
0.0151, + "seconds": 0.0092, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0657, + "seconds": 0.0417, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 } }, - "total_seconds": 2.1533, - "worst_seconds": 2.0127, + "total_seconds": 1.2698, + "worst_seconds": 1.1821, "mean_f1": 0.8596, "mean_mrr": 0.7667 }, "DistributionBased": { "pairs": { "customers": { - "seconds": 0.0993, + "seconds": 0.1208, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0918, + "seconds": 0.1283, "n_matches": 4, "f1": 0.75, "recall_at_gt": 0.8, "mrr": 0.8 }, "products": { - "seconds": 0.0773, + "seconds": 0.1501, "n_matches": 5, "f1": 0.8, "recall_at_gt": 0.8333, "mrr": 0.8333 }, "events": { - "seconds": 0.0601, + "seconds": 0.2155, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0795, + "seconds": 0.1442, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 } }, - "total_seconds": 0.408, - "worst_seconds": 0.0993, + "total_seconds": 0.7589, + "worst_seconds": 0.2155, "mean_f1": 0.86, "mean_mrr": 0.8467 }, "JaccardDistanceMatcher": { "pairs": { "customers": { - "seconds": 0.0111, + "seconds": 0.0046, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0139, + "seconds": 0.0091, "n_matches": 11, "f1": 0.8889, "recall_at_gt": 0.6, "mrr": 1.0 }, "products": { - "seconds": 0.0101, + "seconds": 0.0058, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0116, + "seconds": 0.0053, "n_matches": 6, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0095, + "seconds": 0.0044, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.0562, - "worst_seconds": 0.0139, + "total_seconds": 0.0292, + "worst_seconds": 0.0091, "mean_f1": 0.9778, "mean_mrr": 1.0 }, "SimilarityFlooding": { "pairs": { "customers": { - "seconds": 0.0437, + 
"seconds": 0.0233, "n_matches": 25, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 1.0 }, "orders": { - "seconds": 0.0369, + "seconds": 0.0237, "n_matches": 25, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.9 }, "products": { - "seconds": 0.0644, + "seconds": 0.0347, "n_matches": 36, "f1": 0.9091, "recall_at_gt": 0.8333, "mrr": 1.0 }, "events": { - "seconds": 0.0364, + "seconds": 0.019, "n_matches": 16, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0388, + "seconds": 0.021, "n_matches": 25, "f1": 1.0, "recall_at_gt": 0.8, "mrr": 1.0 } }, - "total_seconds": 0.2202, - "worst_seconds": 0.0644, + "total_seconds": 0.1217, + "worst_seconds": 0.0347, "mean_f1": 0.9374, "mean_mrr": 0.98 + }, + "JaccardDistanceMatcher_emb": { + "pairs": { + "customers": { + "seconds": 5.6881, + "n_matches": 7, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "orders": { + "seconds": 0.2185, + "n_matches": 18, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "products": { + "seconds": 0.1531, + "n_matches": 10, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "events": { + "seconds": 0.0853, + "n_matches": 6, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "addresses": { + "seconds": 0.1342, + "n_matches": 11, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + } + }, + "total_seconds": 6.2792, + "worst_seconds": 5.6881, + "mean_f1": 1.0, + "mean_mrr": 1.0 } } } \ No newline at end of file From 0d7aac8c00789f66f0c8ed3b0b772a458452f805 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 13:40:15 +0200 Subject: [PATCH 10/13] apply ruff rules --- experiments/experiment_nyu.py | 18 ++++++++++-------- tests/test_coverage_gaps.py | 19 +++++++++++++++---- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/experiments/experiment_nyu.py b/experiments/experiment_nyu.py index d38583b..7f33a35 100644 --- a/experiments/experiment_nyu.py +++ b/experiments/experiment_nyu.py @@ -53,14 +53,16 @@ def _matcher_builders(): 
("SimilarityFlooding", SimilarityFlooding), ] if importlib.util.find_spec("sentence_transformers") is not None: - builders.append(( - "JaccardDistanceMatcher_emb", - lambda: JaccardDistanceMatcher( - distance_fun=StringDistanceFunction.Embedding, - threshold_dist=0.7, - embedding_device=None, - ), - )) + builders.append( + ( + "JaccardDistanceMatcher_emb", + lambda: JaccardDistanceMatcher( + distance_fun=StringDistanceFunction.Embedding, + threshold_dist=0.7, + embedding_device=None, + ), + ) + ) return builders diff --git a/tests/test_coverage_gaps.py b/tests/test_coverage_gaps.py index 1897190..8f8db0d 100644 --- a/tests/test_coverage_gaps.py +++ b/tests/test_coverage_gaps.py @@ -11,8 +11,6 @@ import pandas as pd import pytest -_ST_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None - from tests import df1, df2 from valentine import InvalidMatcherError, valentine_match from valentine.algorithms import ( @@ -46,6 +44,8 @@ from valentine.data_sources.dataframe.dataframe_table import DataframeTable from valentine.metrics.metric_helpers import _apply_one_to_one, _normalize_ground_truth +_ST_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None + # -- MatcherResults dunder & transformation coverage ------------------------ @@ -695,6 +695,7 @@ def test_normalize_4field_is_table_aware(self): # They exercise the actual SentenceTransformer model, unlike the mocked tests # above — use them to verify the real embedding path works end-to-end. + @pytest.mark.skipif(not _ST_AVAILABLE, reason="sentence_transformers not installed") class TestJaccardEmbeddingIntegration: """Integration tests that load a real SentenceTransformer model.""" @@ -710,11 +711,21 @@ def test_semantically_similar_columns_match(self): # semantically close; the embedding matcher should return non-zero # similarity for at least one pair. 
d1 = DataframeTable( - pd.DataFrame({"customer_id": ["C1", "C2", "C3"], "order_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}), + pd.DataFrame( + { + "customer_id": ["C1", "C2", "C3"], + "order_date": ["2024-01-01", "2024-01-02", "2024-01-03"], + } + ), name="orders", ) d2 = DataframeTable( - pd.DataFrame({"client_id": ["C1", "C2", "C3"], "purchase_date": ["2024-01-01", "2024-01-02", "2024-01-03"]}), + pd.DataFrame( + { + "client_id": ["C1", "C2", "C3"], + "purchase_date": ["2024-01-01", "2024-01-02", "2024-01-03"], + } + ), name="purchases", ) results = self._MATCHER.get_matches(d1, d2) From c1b805a0f4e4a4cf7bce6713b84b1824aca47545 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 13:57:46 +0200 Subject: [PATCH 11/13] make PULP deterministic --- experiments/bench_baseline.json | 125 ++++++------------ .../distribution_based/discovery.py | 13 +- .../distribution_based/distribution_based.py | 10 +- 3 files changed, 51 insertions(+), 97 deletions(-) diff --git a/experiments/bench_baseline.json b/experiments/bench_baseline.json index 62249a9..f0f86ea 100644 --- a/experiments/bench_baseline.json +++ b/experiments/bench_baseline.json @@ -3,42 +3,42 @@ "Coma": { "pairs": { "customers": { - "seconds": 0.005, + "seconds": 0.0054, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0051, + "seconds": 0.0042, "n_matches": 4, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.8 }, "products": { - "seconds": 0.0049, + "seconds": 0.0039, "n_matches": 6, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0054, + "seconds": 0.0048, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0046, + "seconds": 0.0038, "n_matches": 5, "f1": 0.8889, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.025, + "total_seconds": 0.0221, "worst_seconds": 0.0054, "mean_f1": 0.9556, "mean_mrr": 0.96 @@ -46,193 +46,193 @@ "Coma_Inst": { "pairs": { "customers": { - "seconds": 
0.0142, + "seconds": 0.0175, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0149, + "seconds": 0.0117, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "products": { - "seconds": 0.0116, + "seconds": 0.0099, "n_matches": 6, "f1": 0.9091, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0115, + "seconds": 0.0094, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0101, + "seconds": 0.0089, "n_matches": 5, "f1": 0.8889, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.0623, - "worst_seconds": 0.0149, + "total_seconds": 0.0574, + "worst_seconds": 0.0175, "mean_f1": 0.9596, "mean_mrr": 1.0 }, "Cupid": { "pairs": { "customers": { - "seconds": 1.1821, + "seconds": 1.4134, "n_matches": 6, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.8 }, "orders": { - "seconds": 0.0178, + "seconds": 0.0145, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 }, "products": { - "seconds": 0.019, + "seconds": 0.0153, "n_matches": 6, "f1": 0.9091, "recall_at_gt": 0.8333, "mrr": 0.8333 }, "events": { - "seconds": 0.0092, + "seconds": 0.0075, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0417, + "seconds": 0.0345, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 } }, - "total_seconds": 1.2698, - "worst_seconds": 1.1821, + "total_seconds": 1.4852, + "worst_seconds": 1.4134, "mean_f1": 0.8596, "mean_mrr": 0.7667 }, "DistributionBased": { "pairs": { "customers": { - "seconds": 0.1208, + "seconds": 0.1721, "n_matches": 7, "f1": 1.0, - "recall_at_gt": 1.0, + "recall_at_gt": 0.8, "mrr": 1.0 }, "orders": { - "seconds": 0.1283, + "seconds": 0.1298, "n_matches": 4, "f1": 0.75, "recall_at_gt": 0.8, "mrr": 0.8 }, "products": { - "seconds": 0.1501, + "seconds": 0.1606, "n_matches": 5, "f1": 0.8, "recall_at_gt": 0.8333, "mrr": 0.8333 }, "events": { - "seconds": 0.2155, + "seconds": 0.1061, "n_matches": 4, "f1": 1.0, 
"recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.1442, + "seconds": 0.1517, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 } }, - "total_seconds": 0.7589, - "worst_seconds": 0.2155, + "total_seconds": 0.7203, + "worst_seconds": 0.1721, "mean_f1": 0.86, "mean_mrr": 0.8467 }, "JaccardDistanceMatcher": { "pairs": { "customers": { - "seconds": 0.0046, + "seconds": 0.0044, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0091, + "seconds": 0.0079, "n_matches": 11, "f1": 0.8889, "recall_at_gt": 0.6, "mrr": 1.0 }, "products": { - "seconds": 0.0058, + "seconds": 0.0046, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0053, + "seconds": 0.0052, "n_matches": 6, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0044, + "seconds": 0.0048, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.0292, - "worst_seconds": 0.0091, + "total_seconds": 0.0269, + "worst_seconds": 0.0079, "mean_f1": 0.9778, "mean_mrr": 1.0 }, "SimilarityFlooding": { "pairs": { "customers": { - "seconds": 0.0233, + "seconds": 0.0229, "n_matches": 25, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 1.0 }, "orders": { - "seconds": 0.0237, + "seconds": 0.02, "n_matches": 25, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.9 }, "products": { - "seconds": 0.0347, + "seconds": 0.0337, "n_matches": 36, "f1": 0.9091, "recall_at_gt": 0.8333, @@ -246,60 +246,17 @@ "mrr": 1.0 }, "addresses": { - "seconds": 0.021, + "seconds": 0.0209, "n_matches": 25, "f1": 1.0, "recall_at_gt": 0.8, "mrr": 1.0 } }, - "total_seconds": 0.1217, - "worst_seconds": 0.0347, + "total_seconds": 0.1165, + "worst_seconds": 0.0337, "mean_f1": 0.9374, "mean_mrr": 0.98 - }, - "JaccardDistanceMatcher_emb": { - "pairs": { - "customers": { - "seconds": 5.6881, - "n_matches": 7, - "f1": 1.0, - "recall_at_gt": 1.0, - "mrr": 1.0 - }, - "orders": { - "seconds": 0.2185, - "n_matches": 18, - "f1": 1.0, 
- "recall_at_gt": 1.0, - "mrr": 1.0 - }, - "products": { - "seconds": 0.1531, - "n_matches": 10, - "f1": 1.0, - "recall_at_gt": 1.0, - "mrr": 1.0 - }, - "events": { - "seconds": 0.0853, - "n_matches": 6, - "f1": 1.0, - "recall_at_gt": 1.0, - "mrr": 1.0 - }, - "addresses": { - "seconds": 0.1342, - "n_matches": 11, - "f1": 1.0, - "recall_at_gt": 1.0, - "mrr": 1.0 - } - }, - "total_seconds": 6.2792, - "worst_seconds": 5.6881, - "mean_f1": 1.0, - "mean_mrr": 1.0 } } } \ No newline at end of file diff --git a/valentine/algorithms/distribution_based/discovery.py b/valentine/algorithms/distribution_based/discovery.py index 5c5c235..f63f814 100644 --- a/valentine/algorithms/distribution_based/discovery.py +++ b/valentine/algorithms/distribution_based/discovery.py @@ -52,9 +52,7 @@ def compute_distribution_clusters( graph = create_graph(columns, edges_per_column) - connected_components = list(nx.connected_components(graph)) - - return connected_components + return sorted(nx.connected_components(graph), key=sorted) def compute_distribution_clusters_parallel( @@ -99,9 +97,7 @@ def compute_distribution_clusters_parallel( graph = create_graph(columns, edges_per_column) - connected_components = list(nx.connected_components(graph)) - - return connected_components + return sorted(nx.connected_components(graph), key=sorted) def compute_attributes( @@ -278,7 +274,7 @@ def correlation_clustering_pulp(vertexes: list, edges: dict): if len({u, v, w}) == 3: opt_model += x_vars[u, w] <= x_vars[u, v] + x_vars[v, w] - opt_model.solve(PULP_CBC_CMD(msg=False)) + opt_model.solve(PULP_CBC_CMD(msg=False, options=["RandomS", "42"])) result = {} @@ -319,8 +315,7 @@ def process_correlation_clustering_result(results: list, columns: list): m1, m2 = match edges_per_column.append([(m1, m2)]) graph = create_graph(columns, edges_per_column) - connected_components = list(nx.connected_components(graph)) - return connected_components + return sorted(nx.connected_components(graph), key=sorted) def 
create_graph(nodes: list, edges_per_column: list): diff --git a/valentine/algorithms/distribution_based/distribution_based.py b/valentine/algorithms/distribution_based/distribution_based.py index f53443c..0b2e94d 100644 --- a/valentine/algorithms/distribution_based/distribution_based.py +++ b/valentine/algorithms/distribution_based/distribution_based.py @@ -173,14 +173,15 @@ def __find_matches(self, tmp_folder_path: str, table_order: dict[str, int]): for components in connected_components: if len(components) > 1: i = i + 1 + sorted_components = sorted(components) edges = discovery.compute_attributes( - list(components), + sorted_components, self.__threshold2, tmp_folder_path, self.__quantiles, self.__use_bloom_filters, ) - all_attributes.append((list(components), edges)) + all_attributes.append((sorted_components, edges)) results = [] for components, edges in all_attributes: @@ -220,15 +221,16 @@ def __find_matches_parallel( for components in connected_components: if len(components) > 1: i = i + 1 + sorted_components = sorted(components) edges = discovery.compute_attributes_parallel( - list(components), + sorted_components, self.__threshold2, pool, tmp_folder_path, self.__quantiles, self.__use_bloom_filters, ) - all_attributes.append((list(components), edges)) + all_attributes.append((sorted_components, edges)) results = [] for components, edges in all_attributes: From c88600ef15690008db9329fea1def61f598e1fad Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 14:12:33 +0200 Subject: [PATCH 12/13] make benchmark deterministic --- .github/workflows/bench.yml | 27 +++- experiments/bench_baseline.json | 129 ++++++++++++------ .../distribution_based/distribution_based.py | 69 +++++++--- 3 files changed, 157 insertions(+), 68 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 160b34d..097e825 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -8,8 +8,15 @@ on: jobs: bench: - runs-on: 
ubuntu-latest - timeout-minutes: 10 + name: bench (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + timeout-minutes: 15 + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.14'] steps: - uses: actions/checkout@v6.0.2 @@ -17,9 +24,15 @@ jobs: - name: Set up Python uses: actions/setup-python@v6.2.0 with: - python-version: '3.14' + python-version: ${{ matrix.python-version }} cache: 'pip' + - name: Set deterministic environment + run: | + echo "OMP_NUM_THREADS=1" >> $GITHUB_ENV + echo "MKL_NUM_THREADS=1" >> $GITHUB_ENV + echo "PYTHONHASHSEED=0" >> $GITHUB_ENV + - name: Install package run: | python -m pip install --upgrade pip @@ -29,7 +42,7 @@ jobs: run: | python experiments/bench.py \ --quick \ - --output bench_results.json \ + --output bench_results_${{ matrix.os }}.json \ --baseline experiments/bench_baseline.json \ --accuracy-only @@ -37,6 +50,6 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: bench-results - path: bench_results.json - if-no-files-found: warn + name: bench-results-${{ matrix.os }} + path: bench_results_${{ matrix.os }}.json + if-no-files-found: warn \ No newline at end of file diff --git a/experiments/bench_baseline.json b/experiments/bench_baseline.json index f0f86ea..eec4b62 100644 --- a/experiments/bench_baseline.json +++ b/experiments/bench_baseline.json @@ -3,42 +3,42 @@ "Coma": { "pairs": { "customers": { - "seconds": 0.0054, + "seconds": 0.0049, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0042, + "seconds": 0.0053, "n_matches": 4, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.8 }, "products": { - "seconds": 0.0039, + "seconds": 0.005, "n_matches": 6, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0048, + "seconds": 0.0054, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0038, + "seconds": 0.0046, "n_matches": 5, "f1": 0.8889, "recall_at_gt": 1.0, 
"mrr": 1.0 } }, - "total_seconds": 0.0221, + "total_seconds": 0.0252, "worst_seconds": 0.0054, "mean_f1": 0.9556, "mean_mrr": 0.96 @@ -46,217 +46,260 @@ "Coma_Inst": { "pairs": { "customers": { - "seconds": 0.0175, + "seconds": 0.0139, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0117, + "seconds": 0.0154, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "products": { - "seconds": 0.0099, + "seconds": 0.012, "n_matches": 6, "f1": 0.9091, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0094, + "seconds": 0.0117, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0089, + "seconds": 0.0103, "n_matches": 5, "f1": 0.8889, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.0574, - "worst_seconds": 0.0175, + "total_seconds": 0.0633, + "worst_seconds": 0.0154, "mean_f1": 0.9596, "mean_mrr": 1.0 }, "Cupid": { "pairs": { "customers": { - "seconds": 1.4134, + "seconds": 1.285, "n_matches": 6, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.8 }, "orders": { - "seconds": 0.0145, + "seconds": 0.018, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 }, "products": { - "seconds": 0.0153, + "seconds": 0.0193, "n_matches": 6, "f1": 0.9091, "recall_at_gt": 0.8333, "mrr": 0.8333 }, "events": { - "seconds": 0.0075, + "seconds": 0.0091, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0345, + "seconds": 0.0417, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 } }, - "total_seconds": 1.4852, - "worst_seconds": 1.4134, + "total_seconds": 1.3731, + "worst_seconds": 1.285, "mean_f1": 0.8596, "mean_mrr": 0.7667 }, "DistributionBased": { "pairs": { "customers": { - "seconds": 0.1721, + "seconds": 0.1268, "n_matches": 7, "f1": 1.0, "recall_at_gt": 0.8, - "mrr": 1.0 + "mrr": 0.9 }, "orders": { - "seconds": 0.1298, + "seconds": 0.13, "n_matches": 4, "f1": 0.75, "recall_at_gt": 0.8, "mrr": 0.8 }, "products": { - 
"seconds": 0.1606, + "seconds": 0.1572, "n_matches": 5, "f1": 0.8, "recall_at_gt": 0.8333, "mrr": 0.8333 }, "events": { - "seconds": 0.1061, + "seconds": 0.1085, "n_matches": 4, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.1517, + "seconds": 0.149, "n_matches": 3, "f1": 0.75, "recall_at_gt": 0.6, "mrr": 0.6 } }, - "total_seconds": 0.7203, - "worst_seconds": 0.1721, + "total_seconds": 0.6715, + "worst_seconds": 0.1572, "mean_f1": 0.86, - "mean_mrr": 0.8467 + "mean_mrr": 0.8267 }, "JaccardDistanceMatcher": { "pairs": { "customers": { - "seconds": 0.0044, + "seconds": 0.005, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "orders": { - "seconds": 0.0079, + "seconds": 0.0089, "n_matches": 11, "f1": 0.8889, "recall_at_gt": 0.6, "mrr": 1.0 }, "products": { - "seconds": 0.0046, + "seconds": 0.0051, "n_matches": 7, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "events": { - "seconds": 0.0052, + "seconds": 0.0061, "n_matches": 6, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0048, + "seconds": 0.0045, "n_matches": 5, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 } }, - "total_seconds": 0.0269, - "worst_seconds": 0.0079, + "total_seconds": 0.0296, + "worst_seconds": 0.0089, "mean_f1": 0.9778, "mean_mrr": 1.0 }, "SimilarityFlooding": { "pairs": { "customers": { - "seconds": 0.0229, + "seconds": 0.0244, "n_matches": 25, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 1.0 }, "orders": { - "seconds": 0.02, + "seconds": 0.0237, "n_matches": 25, "f1": 0.8889, "recall_at_gt": 0.8, "mrr": 0.9 }, "products": { - "seconds": 0.0337, + "seconds": 0.0333, "n_matches": 36, "f1": 0.9091, "recall_at_gt": 0.8333, "mrr": 1.0 }, "events": { - "seconds": 0.019, + "seconds": 0.0226, "n_matches": 16, "f1": 1.0, "recall_at_gt": 1.0, "mrr": 1.0 }, "addresses": { - "seconds": 0.0209, + "seconds": 0.0208, "n_matches": 25, "f1": 1.0, "recall_at_gt": 0.8, "mrr": 1.0 } }, - "total_seconds": 0.1165, - "worst_seconds": 0.0337, + 
"total_seconds": 0.1248, + "worst_seconds": 0.0333, "mean_f1": 0.9374, "mean_mrr": 0.98 + }, + "JaccardDistanceMatcher_emb": { + "pairs": { + "customers": { + "seconds": 6.3396, + "n_matches": 7, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "orders": { + "seconds": 0.2713, + "n_matches": 18, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "products": { + "seconds": 0.1497, + "n_matches": 10, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "events": { + "seconds": 0.0724, + "n_matches": 6, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + }, + "addresses": { + "seconds": 0.1303, + "n_matches": 11, + "f1": 1.0, + "recall_at_gt": 1.0, + "mrr": 1.0 + } + }, + "total_seconds": 6.9633, + "worst_seconds": 6.3396, + "mean_f1": 1.0, + "mean_mrr": 1.0 } } } \ No newline at end of file diff --git a/valentine/algorithms/distribution_based/distribution_based.py b/valentine/algorithms/distribution_based/distribution_based.py index 0b2e94d..c85388c 100644 --- a/valentine/algorithms/distribution_based/distribution_based.py +++ b/valentine/algorithms/distribution_based/distribution_based.py @@ -268,27 +268,60 @@ def __rank_output( A ranked list that will look like: ((table_name1, column_name1), (table_name2, column_name2)): similarity """ matches = {} - for cluster in attribute_clusters: + + sorted_clusters = sorted( + [sorted(cluster) for cluster in attribute_clusters], key=lambda c: (len(c), c) + ) + + for cluster in sorted_clusters: if len(cluster) < 2: continue + for combination in combinations(cluster, 2): table1 = combination[0][0] table2 = combination[1][0] - if table1 != table2: - k, emd = process_emd( - ( - (combination[0], combination[1]), - self.__quantiles, - False, - tmp_folder_path, - False, - ) + + if table1 == table2: + continue + + k, emd = process_emd( + ( + (combination[0], combination[1]), + self.__quantiles, + False, + tmp_folder_path, + False, ) - sim = 1 / (1 + emd) - tn_i, _, cn_i, _ = k[0] - tn_j, _, cn_j, _ = k[1] - if 
table_order.get(tn_i, 0) > table_order.get(tn_j, 0): - matches.update(Match(tn_i, cn_i, tn_j, cn_j, sim).to_dict) - else: - matches.update(Match(tn_j, cn_j, tn_i, cn_i, sim).to_dict) - return matches + ) + + emd = float(round(emd, 12)) + sim = 1 / (1 + emd) + + tn_i, _, cn_i, _ = k[0] + tn_j, _, cn_j, _ = k[1] + + order_i = table_order.get(tn_i, float("inf")) + order_j = table_order.get(tn_j, float("inf")) + + if (order_i, tn_i, cn_i) > (order_j, tn_j, cn_j): + match_obj = Match(tn_i, cn_i, tn_j, cn_j, sim) + else: + match_obj = Match(tn_j, cn_j, tn_i, cn_i, sim) + + # Deterministic overwrite rule + key = ( + match_obj.source_table_name, + match_obj.source_column_name, + match_obj.target_table_name, + match_obj.target_column_name, + ) + + if key not in matches or sim > matches[key].similarity: + matches[key] = match_obj + + # Convert back to expected format + final_matches = {} + for m in matches.values(): + final_matches.update(m.to_dict) + + return final_matches From 7d0e0453f45a56d95d330c7dbfc867ed6391b0d1 Mon Sep 17 00:00:00 2001 From: Kyriakos Psarakis Date: Tue, 5 May 2026 14:16:46 +0200 Subject: [PATCH 13/13] use bash in the bench workflow --- .github/workflows/bench.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 097e825..a40e77f 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -39,6 +39,7 @@ jobs: pip install . - name: Run accuracy regression check + shell: bash run: | python experiments/bench.py \ --quick \