From 75a8d0c3eb3cabed74981ecf329b88e41221d493 Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Fri, 12 Jun 2026 10:46:07 +0800
Subject: [PATCH 1/3] feat: add fallback for retrieval metrics calculation

---
 dingo/exec/retrieval.py                       | 57 ++++++++++++++++++-
 dingo/retrieval/eval_utils.py                 | 33 ++++++++++-
 .../retrieval/test_retrieval_executor.py      | 57 +++++++++++++++++++
 3 files changed, 142 insertions(+), 5 deletions(-)

diff --git a/dingo/exec/retrieval.py b/dingo/exec/retrieval.py
index 9d1b070f..b12c62f4 100644
--- a/dingo/exec/retrieval.py
+++ b/dingo/exec/retrieval.py
@@ -18,7 +18,7 @@
 from dingo.config.input_args import InputArgs
 from dingo.exec.base import Executor
 from dingo.io import SummaryModel
-from dingo.retrieval.eval_utils import make_output_dir, save_json
+from dingo.retrieval.eval_utils import compute_query_metrics, make_output_dir, save_json
 from dingo.retrieval.mteb_adapter import SearchClientModel
 from dingo.retrieval.search_client import create_client
 
@@ -118,9 +118,29 @@ def execute(self) -> SummaryModel:
                     overwrite_strategy="always",
                 )
                 task_metrics = self._extract_metrics(results)
+                if not task_metrics:
+                    logger.warning(
+                        "MTEB returned empty metrics for task %r; "
+                        "falling back to search trace metrics",
+                        task_name,
+                    )
+                    task_metrics = self._compute_metrics_from_search_traces(
+                        model.get_search_traces(),
+                        task_name,
+                    )
                 all_results[task_name] = task_metrics
             except Exception as e:
-                logger.error(f"Task {task_name!r} failed: {e}")
+                logger.error(f"Task {task_name!r} failed: {e}", exc_info=True)
+                task_metrics = self._compute_metrics_from_search_traces(
+                    model.get_search_traces(),
+                    task_name,
+                )
+                if task_metrics:
+                    logger.warning(
+                        "Using search trace fallback metrics for failed task %r",
+                        task_name,
+                    )
+                    all_results[task_name] = task_metrics
                 continue
 
         self._all_results = all_results
@@ -257,6 +277,39 @@ def _extract_metrics(self, model_result) -> dict[str, float]:
                             metrics[key] = round(score_entry[key], 5)
         return metrics
 
+    @staticmethod
+    def _compute_metrics_from_search_traces(
+        traces: list[dict[str, Any]],
+        task_name: str,
+    ) -> dict[str, float]:
+        """Compute fallback retrieval metrics from stored trace qrels/results."""
+        metric_values: dict[str, list[float]] = {}
+        for trace in traces:
+            if trace.get("task") != task_name:
+                continue
+            for query in trace.get("queries", []):
+                retrieved_doc_ids = query.get("retrieved_doc_ids") or []
+                gold_doc_ids = set(query.get("gold_doc_ids") or [])
+                if not gold_doc_ids:
+                    continue
+
+                query_metrics = compute_query_metrics(
+                    retrieved_doc_ids,
+                    gold_doc_ids,
+                )
+                query_metrics["main_score"] = query_metrics.get("ndcg_at_10", 0.0)
+
+                for key in METRICS_OF_INTEREST:
+                    if key not in query_metrics:
+                        continue
+                    metric_values.setdefault(key, []).append(query_metrics[key])
+
+        return {
+            key: round(sum(values) / len(values), 5)
+            for key, values in metric_values.items()
+            if values
+        }
+
     def load_data(self):
         pass
 
diff --git a/dingo/retrieval/eval_utils.py b/dingo/retrieval/eval_utils.py
index ec94e632..f7091c9d 100644
--- a/dingo/retrieval/eval_utils.py
+++ b/dingo/retrieval/eval_utils.py
@@ -54,9 +54,10 @@ def compute_query_metrics(
     retrieved_doc_ids: list[str],
     relevant_doc_ids: set[str],
 ) -> dict[str, Any]:
-    """Compute nDCG@10, MRR@10, Recall@{5,10,100,1000} for a single query."""
+    """Compute standard retrieval metrics for a single query."""
     top5 = retrieved_doc_ids[:5]
     top10 = retrieved_doc_ids[:10]
+    top20 = retrieved_doc_ids[:20]
     top100 = retrieved_doc_ids[:100]
     top1000 = retrieved_doc_ids[:1000]
 
@@ -64,6 +65,7 @@ def compute_query_metrics(
     rel_in_5 = sum(rel_flags_5)
     rel_flags_10 = [1 if did in relevant_doc_ids else 0 for did in top10]
     rel_in_10 = sum(rel_flags_10)
+    rel_flags_100 = [1 if did in relevant_doc_ids else 0 for did in top100]
     rel_total = len(relevant_doc_ids)
 
     first_rel_rank = -1
@@ -73,12 +75,21 @@ def compute_query_metrics(
             break
     mrr10 = 1.0 / first_rel_rank if first_rel_rank > 0 else 0.0
 
-    ideal_len = min(rel_total, 10)
-    idcg10 = dcg([1] * ideal_len, 10) if ideal_len > 0 else 0.0
+    ideal_len_10 = min(rel_total, 10)
+    idcg10 = dcg([1] * ideal_len_10, 10) if ideal_len_10 > 0 else 0.0
     ndcg10 = (dcg(rel_flags_10, 10) / idcg10) if idcg10 > 0 else 0.0
 
+    ideal_len_100 = min(rel_total, 100)
+    idcg100 = dcg([1] * ideal_len_100, 100) if ideal_len_100 > 0 else 0.0
+    ndcg100 = (dcg(rel_flags_100, 100) / idcg100) if idcg100 > 0 else 0.0
+
     recall5 = (rel_in_5 / rel_total) if rel_total > 0 else 0.0
     recall10 = (rel_in_10 / rel_total) if rel_total > 0 else 0.0
+    recall20 = (
+        sum(1 for did in top20 if did in relevant_doc_ids) / rel_total
+        if rel_total > 0
+        else 0.0
+    )
     recall100 = (
         sum(1 for did in top100 if did in relevant_doc_ids) / rel_total
         if rel_total > 0
@@ -90,16 +101,32 @@ def compute_query_metrics(
         else 0.0
     )
 
+    hits = 0
+    precision_sum = 0.0
+    for rank, did in enumerate(top10, start=1):
+        if did in relevant_doc_ids:
+            hits += 1
+            precision_sum += hits / rank
+    map10 = (
+        precision_sum / min(rel_total, 10)
+        if rel_total > 0
+        else 0.0
+    )
+
     return {
         "first_relevant_rank_at_10": first_rel_rank,
         "relevant_in_top10": rel_in_10,
         "relevant_total": rel_total,
         "ndcg_at_10": round(ndcg10, 5),
+        "ndcg_at_100": round(ndcg100, 5),
         "mrr_at_10": round(mrr10, 5),
         "recall_at_5": round(recall5, 5),
         "recall_at_10": round(recall10, 5),
+        "recall_at_20": round(recall20, 5),
         "recall_at_100": round(recall100, 5),
         "recall_at_1000": round(recall1000, 5),
+        "precision_at_10": round(rel_in_10 / 10, 5),
+        "map_at_10": round(map10, 5),
     }
 
 
diff --git a/test/scripts/retrieval/test_retrieval_executor.py b/test/scripts/retrieval/test_retrieval_executor.py
index bab4de77..9341d278 100644
--- a/test/scripts/retrieval/test_retrieval_executor.py
+++ b/test/scripts/retrieval/test_retrieval_executor.py
@@ -4,6 +4,7 @@
 import os
 import subprocess
 import sys
+from types import SimpleNamespace
 
 import pytest
 
@@ -126,3 +127,59 @@ def test_help(self):
     def test_api_url_is_optional(self):
         stdout, _, _ = self._run_cli("eval-retrieval", "--help")
         assert "default depends on backend" in stdout
+
+
+class TestRetrievalExecutorFallbackMetrics:
+    def test_execute_uses_trace_metrics_when_mteb_metrics_empty(self, tmp_path, monkeypatch):
+        import dingo.exec.retrieval as retrieval_module
+        from dingo.exec.retrieval import RetrievalExecutor
+
+        class FakeClient:
+            name = "fake-openalex"
+
+        input_args = InputArgs(**{
+            "input_path": "SciFact",
+            "output_path": str(tmp_path),
+            "executor": {
+                "retrieval": {
+                    "backend": "openalex",
+                    "api_url": "https://api.openalex.org",
+                    "limit": 10,
+                }
+            },
+        })
+        executor = RetrievalExecutor(input_args)
+
+        monkeypatch.setattr(retrieval_module, "create_client", lambda *a, **k: FakeClient())
+        monkeypatch.setattr(retrieval_module.mteb, "get_tasks", lambda tasks: [object()])
+        monkeypatch.setattr(RetrievalExecutor, "_attach_relevant_docs", lambda self, model, tasks: None)
+
+        def fake_evaluate(model, tasks, overwrite_strategy):
+            model._search_traces.append({
+                "task": "SciFact",
+                "total_queries": 2,
+                "queries": [
+                    {
+                        "qid": "q1",
+                        "retrieved_doc_ids": ["d1", "d2"],
+                        "gold_doc_ids": ["d1"],
+                    },
+                    {
+                        "qid": "q2",
+                        "retrieved_doc_ids": ["d3"],
+                        "gold_doc_ids": ["d4"],
+                    },
+                ],
+            })
+            return SimpleNamespace(
+                task_results=[SimpleNamespace(scores={})],
+            )
+
+        monkeypatch.setattr(retrieval_module.mteb, "evaluate", fake_evaluate)
+
+        summary = executor.execute()
+
+        assert summary.score == 0.5
+        assert summary.metrics_score_stats["SciFact"]["main_score"] == 0.5
+        assert summary.metrics_score_stats["SciFact"]["ndcg_at_10"] == 0.5
+        assert summary.metrics_score_stats["SciFact"]["recall_at_10"] == 0.5

From 2ed0af40f494b8390ac1e0b9bb399533f86e81fc Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Mon, 15 Jun 2026 15:46:52 +0800
Subject: [PATCH 2/3] feat: add fuzzy matching to retrieval eval title fallback

---
 dingo/config/input_args.py                |  5 ++
 dingo/exec/retrieval.py                   | 10 ++++
 dingo/retrieval/eval_utils.py             | 46 ++++++++++++--
 dingo/retrieval/mteb_adapter.py           | 73 ++++++++++++++++++++++-
 dingo/run/cli.py                          | 25 ++++++++
 test/scripts/retrieval/test_eval_utils.py | 49 +++++++++++++--
 6 files changed, 195 insertions(+), 13 deletions(-)

diff --git a/dingo/config/input_args.py b/dingo/config/input_args.py
index ba3e3c73..11d78bf0 100644
--- a/dingo/config/input_args.py
+++ b/dingo/config/input_args.py
@@ -94,6 +94,11 @@ class RetrievalArgs(BaseModel):
     freshness_boost: Optional[str] = None
     filters: Optional[List[Dict[str, Any]] | Dict[str, Any]] = None
     max_queries: Optional[int] = None
+    title_fuzzy_enabled: bool = False
+    title_fuzzy_threshold: float = 0.95
+    title_fuzzy_margin: float = 0.01
+    title_fuzzy_min_len: int = 20
+    title_fuzzy_max_candidates: int = 300
     timeout: float = 120.0
     rate_limit: Optional[float] = None
     max_retries: int = 3
diff --git a/dingo/exec/retrieval.py b/dingo/exec/retrieval.py
index b12c62f4..ce925333 100644
--- a/dingo/exec/retrieval.py
+++ b/dingo/exec/retrieval.py
@@ -84,6 +84,11 @@ def execute(self) -> SummaryModel:
             search_limit=ra.limit,
             max_queries=ra.max_queries,
             max_workers=ra.max_workers,
+            title_fuzzy_enabled=ra.title_fuzzy_enabled,
+            title_fuzzy_threshold=ra.title_fuzzy_threshold,
+            title_fuzzy_margin=ra.title_fuzzy_margin,
+            title_fuzzy_min_len=ra.title_fuzzy_min_len,
+            title_fuzzy_max_candidates=ra.title_fuzzy_max_candidates,
         )
 
         output_dir = make_output_dir(
@@ -159,6 +164,11 @@ def execute(self) -> SummaryModel:
             "limit": ra.limit,
             "retrieval_mode": ra.retrieval_mode,
             "sub_queries": ra.sub_queries,
+            "title_fuzzy_enabled": ra.title_fuzzy_enabled,
+            "title_fuzzy_threshold": ra.title_fuzzy_threshold,
+            "title_fuzzy_margin": ra.title_fuzzy_margin,
+            "title_fuzzy_min_len": ra.title_fuzzy_min_len,
+            "title_fuzzy_max_candidates": ra.title_fuzzy_max_candidates,
             "max_queries": ra.max_queries,
             "tasks": task_names,
         }
diff --git a/dingo/retrieval/eval_utils.py b/dingo/retrieval/eval_utils.py
index f7091c9d..bff6a04f 100644
--- a/dingo/retrieval/eval_utils.py
+++ b/dingo/retrieval/eval_utils.py
@@ -16,6 +16,7 @@
 import re
 import unicodedata
 from datetime import datetime
+from difflib import SequenceMatcher
 from typing import Any
 
 logger = logging.getLogger(__name__)
@@ -134,25 +135,58 @@ def resolve_hit(
     hit: dict[str, Any],
     title_index: dict[str, list[str]],
     corpus_id_set: set[str],
-) -> tuple[str, str]:
+    *,
+    title_fuzzy_enabled: bool = False,
+    title_fuzzy_threshold: float = 0.95,
+    title_fuzzy_margin: float = 0.01,
+    title_fuzzy_min_len: int = 20,
+    title_norm_candidates: list[tuple[str, list[str]]] | None = None,
+) -> tuple[str, str, float | None]:
     """Resolve a search hit to a corpus ID.
 
-    Returns ``(corpus_id, mapping_source)`` where *mapping_source* is one of
-    ``"doc_id_exact"``, ``"title_fallback"``, or ``"unmatched"``.
+    Returns ``(corpus_id, mapping_source, fuzzy_similarity)`` where
+    *mapping_source* is one of ``"doc_id_exact"``, ``"title_fallback"``,
+    ``"title_fuzzy"``, or ``"unmatched"``.
     """
     raw_id = str(hit.get("doc_id") or hit.get("paper_id") or "").strip()
     if raw_id:
         stripped = strip_d_prefix(raw_id)
         for candidate in (raw_id, stripped, f"d{stripped}"):
             if candidate in corpus_id_set:
-                return candidate, "doc_id_exact"
+                return candidate, "doc_id_exact", None
     title = str(hit.get("title") or "")
     norm = normalize_title(title)
     if norm:
         candidates = title_index.get(norm)
         if candidates:
-            return candidates[0], "title_fallback"
-    return "", "unmatched"
+            return candidates[0], "title_fallback", None
+
+    if title_fuzzy_enabled and norm and len(norm) >= title_fuzzy_min_len:
+        iterable_candidates = (
+            title_norm_candidates
+            if title_norm_candidates is not None
+            else list(title_index.items())
+        )
+        best_ids: list[str] | None = None
+        best_score = -1.0
+        second_score = -1.0
+        for candidate_norm, candidate_ids in iterable_candidates:
+            score = SequenceMatcher(None, norm, candidate_norm).ratio()
+            if score > best_score:
+                second_score = best_score
+                best_score = score
+                best_ids = candidate_ids
+            elif score > second_score:
+                second_score = score
+
+        if (
+            best_ids
+            and best_score >= title_fuzzy_threshold
+            and (best_score - second_score) >= title_fuzzy_margin
+        ):
+            return best_ids[0], "title_fuzzy", round(best_score, 6)
+
+    return "", "unmatched", None
 
 
 def make_output_dir(explicit_dir: str | None, default_prefix: str) -> str:
diff --git a/dingo/retrieval/mteb_adapter.py b/dingo/retrieval/mteb_adapter.py
index 8643cd93..a6421bd4 100644
--- a/dingo/retrieval/mteb_adapter.py
+++ b/dingo/retrieval/mteb_adapter.py
@@ -20,6 +20,7 @@
 import concurrent.futures
 import logging
 from collections import defaultdict
+from heapq import nlargest
 from typing import TYPE_CHECKING, Any
 
 from mteb.models.model_meta import ModelMeta
@@ -98,6 +99,15 @@ def _instruction_trace_fields(
     return fields
 
 
+def _title_ngrams(text: str, n: int = 3) -> set[str]:
+    """Return character n-grams for normalized title text."""
+    if not text:
+        return set()
+    if len(text) < n:
+        return {text}
+    return {text[i : i + n] for i in range(len(text) - n + 1)}
+
+
 # Workaround for mteb versions where confidence_scores crashes on empty input.
 try:
     from mteb._evaluators import retrieval_metrics as _rm
@@ -123,13 +133,25 @@ def __init__(
         search_limit: int = 100,
         max_queries: int | None = None,
         max_workers: int = 1,
+        title_fuzzy_enabled: bool = False,
+        title_fuzzy_threshold: float = 0.95,
+        title_fuzzy_margin: float = 0.01,
+        title_fuzzy_min_len: int = 20,
+        title_fuzzy_max_candidates: int = 300,
     ):
         self.client = client
         self.search_limit = search_limit
         self.max_queries = max_queries
         self.max_workers = max_workers
+        self.title_fuzzy_enabled = title_fuzzy_enabled
+        self.title_fuzzy_threshold = title_fuzzy_threshold
+        self.title_fuzzy_margin = title_fuzzy_margin
+        self.title_fuzzy_min_len = title_fuzzy_min_len
+        self.title_fuzzy_max_candidates = title_fuzzy_max_candidates
 
         self._title_to_ids: dict[str, list[str]] = defaultdict(list)
+        self._normalized_titles: list[str] = []
+        self._title_ngrams_index: dict[str, list[int]] = defaultdict(list)
         self._corpus_ids: set[str] = set()
         self._corpus_size = 0
         self._collisions = 0
@@ -202,6 +224,8 @@ def index(
         num_proc: int | None = None,
     ) -> None:
         self._title_to_ids.clear()
+        self._normalized_titles.clear()
+        self._title_ngrams_index.clear()
         self._corpus_ids.clear()
         count = 0
         for row in corpus:
@@ -212,6 +236,11 @@ def index(
                 continue
             normalized = normalize_title(title)
             if normalized:
+                if normalized not in self._title_to_ids:
+                    title_idx = len(self._normalized_titles)
+                    self._normalized_titles.append(normalized)
+                    for gram in _title_ngrams(normalized):
+                        self._title_ngrams_index[gram].append(title_idx)
                 self._title_to_ids[normalized].append(doc_id)
                 count += 1
 
@@ -225,6 +254,31 @@ def index(
             f"[task={task_metadata.name}]"
         )
 
+    def _get_fuzzy_title_candidates(self, norm_title: str) -> list[tuple[str, list[str]]]:
+        """Get top fuzzy candidates via n-gram overlap prefiltering."""
+        grams = _title_ngrams(norm_title)
+        if not grams:
+            return []
+
+        overlap_counts: dict[int, int] = defaultdict(int)
+        for gram in grams:
+            for title_idx in self._title_ngrams_index.get(gram, []):
+                overlap_counts[title_idx] += 1
+
+        if not overlap_counts:
+            return []
+
+        top = nlargest(
+            self.title_fuzzy_max_candidates,
+            overlap_counts.items(),
+            key=lambda x: x[1],
+        )
+
+        return [
+            (self._normalized_titles[title_idx], self._title_to_ids[self._normalized_titles[title_idx]])
+            for title_idx, _ in top
+        ]
+
     def search(
         self,
         queries: "QueryDatasetType",
@@ -307,13 +361,27 @@ def _process_query(idx_qid_text):
             mapping_stats: dict[str, int] = {
                 "doc_id_exact": 0,
                 "title_fallback": 0,
+                "title_fuzzy": 0,
                 "unmatched": 0,
             }
 
             for rank, paper in enumerate(response.results):
                 hit = {"paper_id": paper.paper_id, "title": paper.title}
-                resolved_id, src = resolve_hit(
-                    hit, self._title_to_ids, self._corpus_ids
+                norm_hit_title = normalize_title(hit.get("title") or "")
+                fuzzy_candidates = (
+                    self._get_fuzzy_title_candidates(norm_hit_title)
+                    if self.title_fuzzy_enabled and norm_hit_title
+                    else None
+                )
+                resolved_id, src, fuzzy_similarity = resolve_hit(
+                    hit,
+                    self._title_to_ids,
+                    self._corpus_ids,
+                    title_fuzzy_enabled=self.title_fuzzy_enabled,
+                    title_fuzzy_threshold=self.title_fuzzy_threshold,
+                    title_fuzzy_margin=self.title_fuzzy_margin,
+                    title_fuzzy_min_len=self.title_fuzzy_min_len,
+                    title_norm_candidates=fuzzy_candidates,
                 )
                 mapping_stats[src] = mapping_stats.get(src, 0) + 1
                 top_api_results.append(
@@ -324,6 +392,7 @@ def _process_query(idx_qid_text):
                         "score": paper.score,
                         "resolved_corpus_id": resolved_id,
                         "mapping_source": src,
+                        "title_fuzzy_similarity": fuzzy_similarity,
                         "is_relevant": (
                             bool(resolved_id and resolved_id in relevant_doc_ids)
                             if relevant_doc_ids is not None
diff --git a/dingo/run/cli.py b/dingo/run/cli.py
index f554201b..15f6baf6 100644
--- a/dingo/run/cli.py
+++ b/dingo/run/cli.py
@@ -135,6 +135,26 @@ def parse_args():
         "--max-queries", type=int, default=None,
         help="Limit number of queries for quick testing",
     )
+    ret_parser.add_argument(
+        "--title-fuzzy-enabled", action="store_true", default=False,
+        help="Enable fuzzy title fallback matching (default: disabled)",
+    )
+    ret_parser.add_argument(
+        "--title-fuzzy-threshold", type=float, default=0.95,
+        help="Minimum title similarity to accept fuzzy match (default: 0.95)",
+    )
+    ret_parser.add_argument(
+        "--title-fuzzy-margin", type=float, default=0.01,
+        help="Minimum gap between best and second-best fuzzy score (default: 0.01)",
+    )
+    ret_parser.add_argument(
+        "--title-fuzzy-min-len", type=int, default=20,
+        help="Minimum normalized title length for fuzzy matching (default: 20)",
+    )
+    ret_parser.add_argument(
+        "--title-fuzzy-max-candidates", type=int, default=300,
+        help="Max fuzzy candidates re-ranked per hit (default: 300)",
+    )
     ret_parser.add_argument(
         "--timeout", type=float, default=120.0,
         help="HTTP request timeout in seconds (default: 120)",
@@ -351,6 +371,11 @@ def cmd_eval_retrieval(args):
         freshness_boost=args.freshness_boost,
         filters=filters,
         max_queries=args.max_queries,
+        title_fuzzy_enabled=args.title_fuzzy_enabled,
+        title_fuzzy_threshold=args.title_fuzzy_threshold,
+        title_fuzzy_margin=args.title_fuzzy_margin,
+        title_fuzzy_min_len=args.title_fuzzy_min_len,
+        title_fuzzy_max_candidates=args.title_fuzzy_max_candidates,
         timeout=args.timeout,
         rate_limit=args.rate_limit,
         max_retries=3,
diff --git a/test/scripts/retrieval/test_eval_utils.py b/test/scripts/retrieval/test_eval_utils.py
index 953f5820..ffb3800b 100644
--- a/test/scripts/retrieval/test_eval_utils.py
+++ b/test/scripts/retrieval/test_eval_utils.py
@@ -105,30 +105,69 @@ def setup_method(self):
 
     def test_exact_match_with_d_prefix(self):
         hit = {"paper_id": "d123", "title": "Something"}
-        cid, src = resolve_hit(hit, self.title_index, self.corpus_ids)
+        cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids)
         assert cid == "d123"
         assert src == "doc_id_exact"
+        assert score is None
 
     def test_exact_match_without_prefix(self):
         hit = {"paper_id": "123", "title": "Something"}
-        cid, src = resolve_hit(hit, self.title_index, self.corpus_ids)
+        cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids)
         assert cid == "d123"
         assert src == "doc_id_exact"
+        assert score is None
 
     def test_title_fallback(self):
         hit = {"paper_id": "999", "title": "Attention Is All You Need"}
-        cid, src = resolve_hit(hit, self.title_index, self.corpus_ids)
+        cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids)
         assert cid == "d123"
         assert src == "title_fallback"
+        assert score is None
 
     def test_unmatched(self):
         hit = {"paper_id": "999", "title": "Unknown Paper"}
-        cid, src = resolve_hit(hit, self.title_index, self.corpus_ids)
+        cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids)
         assert cid == ""
         assert src == "unmatched"
+        assert score is None
 
     def test_empty_hit(self):
         hit = {"paper_id": "", "title": ""}
-        cid, src = resolve_hit(hit, self.title_index, self.corpus_ids)
+        cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids)
         assert cid == ""
         assert src == "unmatched"
+        assert score is None
+
+    def test_title_fuzzy_markup_match(self):
+        title_index = {
+            "linkagedisequilibriummappingofchek2commonvariationandbreastcancerrisk": ["d999"]
+        }
+        hit = {
+            "paper_id": "not-in-corpus",
+            "title": "Linkage disequilibrium mapping of [!i]CHEK2[!/i]:: Common variation and breast cancer risk",
+        }
+        cid, src, score = resolve_hit(
+            hit,
+            title_index,
+            {"d999"},
+            title_fuzzy_enabled=True,
+            title_fuzzy_threshold=0.98,
+            title_fuzzy_margin=0.001,
+            title_fuzzy_min_len=10,
+        )
+        assert cid == "d999"
+        assert src == "title_fuzzy"
+        assert score is not None and score >= 0.98
+
+    def test_title_fuzzy_disabled_stays_unmatched(self):
+        title_index = {
+            "linkagedisequilibriummappingofchek2commonvariationandbreastcancerrisk": ["d999"]
+        }
+        hit = {
+            "paper_id": "not-in-corpus",
+            "title": "Linkage disequilibrium mapping of [!i]CHEK2[!/i]:: Common variation and breast cancer risk",
+        }
+        cid, src, score = resolve_hit(hit, title_index, {"d999"})
+        assert cid == ""
+        assert src == "unmatched"
+        assert score is None

From bedd4cbcb5ad310e422c81ba942532ddaa4293f9 Mon Sep 17 00:00:00 2001
From: GitHub Action <action@github.com>
Date: Mon, 15 Jun 2026 07:48:27 +0000
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=9A=20Auto-update=20metrics=20docu?=
 =?UTF-8?q?mentation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/metrics.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/metrics.md b/docs/metrics.md
index 3acdf41d..586d26f0 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -61,7 +61,7 @@ This document provides comprehensive information about all quality metrics used
 | Type | Metric | Description | Paper Source | Evaluation Results | Examples |
 |------|--------|-------------|--------------|-------------------|----------|
 | `QUALITY_BAD_COMPLETENESS` | RuleLineEndWithEllipsis, RuleLineEndWithTerminal, RuleSentenceNumber, RuleWordNumber | Checks whether the ratio of lines ending with ellipsis is below threshold; Checks whether the ratio of lines ending w... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
-| `QUALITY_BAD_EFFECTIVENESS` | RuleDoi, RuleIsbn, RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDictConsistency | Check whether the string is in the correct format of the doi; Check whether the string is in the correct format of th... | Internal Implementation | N/A | N/A |
+| `QUALITY_BAD_EFFECTIVENESS` | RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDictConsistency, RuleDoi, RuleIsbn | Detects garbled text and anti-crawling characters by combining special character and invisible character detection; D... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
 | `QUALITY_BAD_FLUENCY` | RuleAbnormalNumber, RuleCharSplit, RuleNoPunc, RuleWordSplit, RuleWordStuck | Checks PDF content for abnormal book page or index numbers that disrupt text flow; Checks PDF content for abnormal ch... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
 | `QUALITY_BAD_RELEVANCE` | RuleHeadWordAr, RuleHeadWordCs, RuleHeadWordHu, RuleHeadWordKo, RuleHeadWordRu, RuleHeadWordSr, RuleHeadWordTh, RuleHeadWordVi, RulePatternSearch, RuleWatermark | Checks whether Arabic content contains irrelevant tail source information; Checks whether Czech content contains irre... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
 | `QUALITY_BAD_SECURITY` | RuleIDCard, RuleUnsafeWords, RulePIIDetection | Checks whether content contains ID card information; Checks whether content contains unsafe words; Detects Personal I... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A |
@@ -126,7 +126,7 @@ This document provides comprehensive information about all quality metrics used
 
 | Type | Metric | Description | Paper Source | Evaluation Results | Examples |
 |------|--------|-------------|--------------|-------------------|----------|
-| `QUALITY_BAD_EFFECTIVENESS` | RuleMetadataSimilarity | 检查元数据字段与基准数据的相似度匹配，阈值默认为0.6 | Internal Implementation | N/A | N/A |
+| `QUALITY_BAD_EFFECTIVENESS` | RuleMetadataSimilarity, RuleQuanliangFieldValidation | 检查元数据字段与基准数据的相似度匹配，阈值默认为0.6; Validate Quanliang metadata fields and report invalid fields | Internal Implementation | N/A | N/A |
 
 ### Rule-Based RESUME Quality Metrics