From 75a8d0c3eb3cabed74981ecf329b88e41221d493 Mon Sep 17 00:00:00 2001 From: chupei Date: Fri, 12 Jun 2026 10:46:07 +0800 Subject: [PATCH 1/3] feat: retriavl metrics calculate add backup --- dingo/exec/retrieval.py | 57 ++++++++++++++++++- dingo/retrieval/eval_utils.py | 33 ++++++++++- .../retrieval/test_retrieval_executor.py | 57 +++++++++++++++++++ 3 files changed, 142 insertions(+), 5 deletions(-) diff --git a/dingo/exec/retrieval.py b/dingo/exec/retrieval.py index 9d1b070f..b12c62f4 100644 --- a/dingo/exec/retrieval.py +++ b/dingo/exec/retrieval.py @@ -18,7 +18,7 @@ from dingo.config.input_args import InputArgs from dingo.exec.base import Executor from dingo.io import SummaryModel -from dingo.retrieval.eval_utils import make_output_dir, save_json +from dingo.retrieval.eval_utils import compute_query_metrics, make_output_dir, save_json from dingo.retrieval.mteb_adapter import SearchClientModel from dingo.retrieval.search_client import create_client @@ -118,9 +118,29 @@ def execute(self) -> SummaryModel: overwrite_strategy="always", ) task_metrics = self._extract_metrics(results) + if not task_metrics: + logger.warning( + "MTEB returned empty metrics for task %r; " + "falling back to search trace metrics", + task_name, + ) + task_metrics = self._compute_metrics_from_search_traces( + model.get_search_traces(), + task_name, + ) all_results[task_name] = task_metrics except Exception as e: - logger.error(f"Task {task_name!r} failed: {e}") + logger.error(f"Task {task_name!r} failed: {e}", exc_info=True) + task_metrics = self._compute_metrics_from_search_traces( + model.get_search_traces(), + task_name, + ) + if task_metrics: + logger.warning( + "Using search trace fallback metrics for failed task %r", + task_name, + ) + all_results[task_name] = task_metrics continue self._all_results = all_results @@ -257,6 +277,39 @@ def _extract_metrics(self, model_result) -> dict[str, float]: metrics[key] = round(score_entry[key], 5) return metrics + @staticmethod + def _compute_metrics_from_search_traces( + traces: list[dict[str, Any]], + task_name: str, + ) -> dict[str, float]: + """Compute fallback retrieval metrics from stored trace qrels/results.""" + metric_values: dict[str, list[float]] = {} + for trace in traces: + if trace.get("task") != task_name: + continue + for query in trace.get("queries", []): + retrieved_doc_ids = query.get("retrieved_doc_ids") or [] + gold_doc_ids = set(query.get("gold_doc_ids") or []) + if not gold_doc_ids: + continue + + query_metrics = compute_query_metrics( + retrieved_doc_ids, + gold_doc_ids, + ) + query_metrics["main_score"] = query_metrics.get("ndcg_at_10", 0.0) + + for key in METRICS_OF_INTEREST: + if key not in query_metrics: + continue + metric_values.setdefault(key, []).append(query_metrics[key]) + + return { + key: round(sum(values) / len(values), 5) + for key, values in metric_values.items() + if values + } + def load_data(self): pass diff --git a/dingo/retrieval/eval_utils.py b/dingo/retrieval/eval_utils.py index ec94e632..f7091c9d 100644 --- a/dingo/retrieval/eval_utils.py +++ b/dingo/retrieval/eval_utils.py @@ -54,9 +54,10 @@ def compute_query_metrics( retrieved_doc_ids: list[str], relevant_doc_ids: set[str], ) -> dict[str, Any]: - """Compute nDCG@10, MRR@10, Recall@{5,10,100,1000} for a single query.""" + """Compute standard retrieval metrics for a single query.""" top5 = retrieved_doc_ids[:5] top10 = retrieved_doc_ids[:10] + top20 = retrieved_doc_ids[:20] top100 = retrieved_doc_ids[:100] top1000 = retrieved_doc_ids[:1000] @@ -64,6 +65,7 @@ def compute_query_metrics( rel_in_5 = sum(rel_flags_5) rel_flags_10 = [1 if did in relevant_doc_ids else 0 for did in top10] rel_in_10 = sum(rel_flags_10) + rel_flags_100 = [1 if did in relevant_doc_ids else 0 for did in top100] rel_total = len(relevant_doc_ids) first_rel_rank = -1 @@ -73,12 +75,21 @@ def compute_query_metrics( break mrr10 = 1.0 / first_rel_rank if first_rel_rank > 0 else 0.0 - ideal_len = min(rel_total, 10) - idcg10 = dcg([1] * ideal_len, 10) if ideal_len > 0 else 0.0 + ideal_len_10 = min(rel_total, 10) + idcg10 = dcg([1] * ideal_len_10, 10) if ideal_len_10 > 0 else 0.0 ndcg10 = (dcg(rel_flags_10, 10) / idcg10) if idcg10 > 0 else 0.0 + ideal_len_100 = min(rel_total, 100) + idcg100 = dcg([1] * ideal_len_100, 100) if ideal_len_100 > 0 else 0.0 + ndcg100 = (dcg(rel_flags_100, 100) / idcg100) if idcg100 > 0 else 0.0 + recall5 = (rel_in_5 / rel_total) if rel_total > 0 else 0.0 recall10 = (rel_in_10 / rel_total) if rel_total > 0 else 0.0 + recall20 = ( + sum(1 for did in top20 if did in relevant_doc_ids) / rel_total + if rel_total > 0 + else 0.0 + ) recall100 = ( sum(1 for did in top100 if did in relevant_doc_ids) / rel_total if rel_total > 0 @@ -90,16 +101,32 @@ def compute_query_metrics( else 0.0 ) + hits = 0 + precision_sum = 0.0 + for rank, did in enumerate(top10, start=1): + if did in relevant_doc_ids: + hits += 1 + precision_sum += hits / rank + map10 = ( + precision_sum / min(rel_total, 10) + if rel_total > 0 + else 0.0 + ) + return { "first_relevant_rank_at_10": first_rel_rank, "relevant_in_top10": rel_in_10, "relevant_total": rel_total, "ndcg_at_10": round(ndcg10, 5), + "ndcg_at_100": round(ndcg100, 5), "mrr_at_10": round(mrr10, 5), "recall_at_5": round(recall5, 5), "recall_at_10": round(recall10, 5), + "recall_at_20": round(recall20, 5), "recall_at_100": round(recall100, 5), "recall_at_1000": round(recall1000, 5), + "precision_at_10": round(rel_in_10 / 10, 5), + "map_at_10": round(map10, 5), } diff --git a/test/scripts/retrieval/test_retrieval_executor.py b/test/scripts/retrieval/test_retrieval_executor.py index bab4de77..9341d278 100644 --- a/test/scripts/retrieval/test_retrieval_executor.py +++ b/test/scripts/retrieval/test_retrieval_executor.py @@ -4,6 +4,7 @@ import os import subprocess import sys +from types import SimpleNamespace import pytest @@ -126,3 +127,59 @@ def test_help(self): def test_api_url_is_optional(self): stdout, _, _ = self._run_cli("eval-retrieval", "--help") assert "default depends on backend" in stdout + + +class TestRetrievalExecutorFallbackMetrics: + def test_execute_uses_trace_metrics_when_mteb_metrics_empty(self, tmp_path, monkeypatch): + import dingo.exec.retrieval as retrieval_module + from dingo.exec.retrieval import RetrievalExecutor + + class FakeClient: + name = "fake-openalex" + + input_args = InputArgs(**{ + "input_path": "SciFact", + "output_path": str(tmp_path), + "executor": { + "retrieval": { + "backend": "openalex", + "api_url": "https://api.openalex.org", + "limit": 10, + } + }, + }) + executor = RetrievalExecutor(input_args) + + monkeypatch.setattr(retrieval_module, "create_client", lambda *a, **k: FakeClient()) + monkeypatch.setattr(retrieval_module.mteb, "get_tasks", lambda tasks: [object()]) + monkeypatch.setattr(RetrievalExecutor, "_attach_relevant_docs", lambda self, model, tasks: None) + + def fake_evaluate(model, tasks, overwrite_strategy): + model._search_traces.append({ + "task": "SciFact", + "total_queries": 2, + "queries": [ + { + "qid": "q1", + "retrieved_doc_ids": ["d1", "d2"], + "gold_doc_ids": ["d1"], + }, + { + "qid": "q2", + "retrieved_doc_ids": ["d3"], + "gold_doc_ids": ["d4"], + }, + ], + }) + return SimpleNamespace( + task_results=[SimpleNamespace(scores={})], + ) + + monkeypatch.setattr(retrieval_module.mteb, "evaluate", fake_evaluate) + + summary = executor.execute() + + assert summary.score == 0.5 + assert summary.metrics_score_stats["SciFact"]["main_score"] == 0.5 + assert summary.metrics_score_stats["SciFact"]["ndcg_at_10"] == 0.5 + assert summary.metrics_score_stats["SciFact"]["recall_at_10"] == 0.5 From 2ed0af40f494b8390ac1e0b9bb399533f86e81fc Mon Sep 17 00:00:00 2001 From: chupei Date: Mon, 15 Jun 2026 15:46:52 +0800 Subject: [PATCH 2/3] feat: retrival eval title fallback add fuzzy matching --- dingo/config/input_args.py | 5 ++ dingo/exec/retrieval.py | 10 ++++ dingo/retrieval/eval_utils.py | 46 ++++++++++++-- dingo/retrieval/mteb_adapter.py | 73 ++++++++++++++++++++++- dingo/run/cli.py | 25 ++++++++ test/scripts/retrieval/test_eval_utils.py | 49 +++++++++++++-- 6 files changed, 195 insertions(+), 13 deletions(-) diff --git a/dingo/config/input_args.py b/dingo/config/input_args.py index ba3e3c73..11d78bf0 100644 --- a/dingo/config/input_args.py +++ b/dingo/config/input_args.py @@ -94,6 +94,11 @@ class RetrievalArgs(BaseModel): freshness_boost: Optional[str] = None filters: Optional[List[Dict[str, Any]] | Dict[str, Any]] = None max_queries: Optional[int] = None + title_fuzzy_enabled: bool = False + title_fuzzy_threshold: float = 0.95 + title_fuzzy_margin: float = 0.01 + title_fuzzy_min_len: int = 20 + title_fuzzy_max_candidates: int = 300 timeout: float = 120.0 rate_limit: Optional[float] = None max_retries: int = 3 diff --git a/dingo/exec/retrieval.py b/dingo/exec/retrieval.py index b12c62f4..ce925333 100644 --- a/dingo/exec/retrieval.py +++ b/dingo/exec/retrieval.py @@ -84,6 +84,11 @@ def execute(self) -> SummaryModel: search_limit=ra.limit, max_queries=ra.max_queries, max_workers=ra.max_workers, + title_fuzzy_enabled=ra.title_fuzzy_enabled, + title_fuzzy_threshold=ra.title_fuzzy_threshold, + title_fuzzy_margin=ra.title_fuzzy_margin, + title_fuzzy_min_len=ra.title_fuzzy_min_len, + title_fuzzy_max_candidates=ra.title_fuzzy_max_candidates, ) output_dir = make_output_dir( @@ -159,6 +164,11 @@ def execute(self) -> SummaryModel: "limit": ra.limit, "retrieval_mode": ra.retrieval_mode, "sub_queries": ra.sub_queries, + "title_fuzzy_enabled": ra.title_fuzzy_enabled, + "title_fuzzy_threshold": ra.title_fuzzy_threshold, + "title_fuzzy_margin": ra.title_fuzzy_margin, + "title_fuzzy_min_len": ra.title_fuzzy_min_len, + "title_fuzzy_max_candidates": ra.title_fuzzy_max_candidates, "max_queries": ra.max_queries, "tasks": task_names, } diff --git a/dingo/retrieval/eval_utils.py b/dingo/retrieval/eval_utils.py index f7091c9d..bff6a04f 100644 --- a/dingo/retrieval/eval_utils.py +++ b/dingo/retrieval/eval_utils.py @@ -16,6 +16,7 @@ import re import unicodedata from datetime import datetime +from difflib import SequenceMatcher from typing import Any logger = logging.getLogger(__name__) @@ -134,25 +135,58 @@ def resolve_hit( hit: dict[str, Any], title_index: dict[str, list[str]], corpus_id_set: set[str], -) -> tuple[str, str]: + *, + title_fuzzy_enabled: bool = False, + title_fuzzy_threshold: float = 0.95, + title_fuzzy_margin: float = 0.01, + title_fuzzy_min_len: int = 20, + title_norm_candidates: list[tuple[str, list[str]]] | None = None, +) -> tuple[str, str, float | None]: """Resolve a search hit to a corpus ID. - Returns ``(corpus_id, mapping_source)`` where *mapping_source* is one of - ``"doc_id_exact"``, ``"title_fallback"``, or ``"unmatched"``. + Returns ``(corpus_id, mapping_source, fuzzy_similarity)`` where + *mapping_source* is one of ``"doc_id_exact"``, ``"title_fallback"``, + ``"title_fuzzy"``, or ``"unmatched"``. """ raw_id = str(hit.get("doc_id") or hit.get("paper_id") or "").strip() if raw_id: stripped = strip_d_prefix(raw_id) for candidate in (raw_id, stripped, f"d{stripped}"): if candidate in corpus_id_set: - return candidate, "doc_id_exact" + return candidate, "doc_id_exact", None title = str(hit.get("title") or "") norm = normalize_title(title) if norm: candidates = title_index.get(norm) if candidates: - return candidates[0], "title_fallback" - return "", "unmatched" + return candidates[0], "title_fallback", None + + if title_fuzzy_enabled and norm and len(norm) >= title_fuzzy_min_len: + iterable_candidates = ( + title_norm_candidates + if title_norm_candidates is not None + else list(title_index.items()) + ) + best_ids: list[str] | None = None + best_score = -1.0 + second_score = -1.0 + for candidate_norm, candidate_ids in iterable_candidates: + score = SequenceMatcher(None, norm, candidate_norm).ratio() + if score > best_score: + second_score = best_score + best_score = score + best_ids = candidate_ids + elif score > second_score: + second_score = score + + if ( + best_ids + and best_score >= title_fuzzy_threshold + and (best_score - second_score) >= title_fuzzy_margin + ): + return best_ids[0], "title_fuzzy", round(best_score, 6) + + return "", "unmatched", None def make_output_dir(explicit_dir: str | None, default_prefix: str) -> str: diff --git a/dingo/retrieval/mteb_adapter.py b/dingo/retrieval/mteb_adapter.py index 8643cd93..a6421bd4 100644 --- a/dingo/retrieval/mteb_adapter.py +++ b/dingo/retrieval/mteb_adapter.py @@ -20,6 +20,7 @@ import concurrent.futures import logging from collections import defaultdict +from heapq import nlargest from typing import TYPE_CHECKING, Any from mteb.models.model_meta import ModelMeta @@ -98,6 +99,15 @@ def _instruction_trace_fields( return fields +def _title_ngrams(text: str, n: int = 3) -> set[str]: + """Return character n-grams for normalized title text.""" + if not text: + return set() + if len(text) < n: + return {text} + return {text[i : i + n] for i in range(len(text) - n + 1)} + + # Workaround for mteb versions where confidence_scores crashes on empty input. try: from mteb._evaluators import retrieval_metrics as _rm @@ -123,13 +133,25 @@ def __init__( search_limit: int = 100, max_queries: int | None = None, max_workers: int = 1, + title_fuzzy_enabled: bool = False, + title_fuzzy_threshold: float = 0.95, + title_fuzzy_margin: float = 0.01, + title_fuzzy_min_len: int = 20, + title_fuzzy_max_candidates: int = 300, ): self.client = client self.search_limit = search_limit self.max_queries = max_queries self.max_workers = max_workers + self.title_fuzzy_enabled = title_fuzzy_enabled + self.title_fuzzy_threshold = title_fuzzy_threshold + self.title_fuzzy_margin = title_fuzzy_margin + self.title_fuzzy_min_len = title_fuzzy_min_len + self.title_fuzzy_max_candidates = title_fuzzy_max_candidates self._title_to_ids: dict[str, list[str]] = defaultdict(list) + self._normalized_titles: list[str] = [] + self._title_ngrams_index: dict[str, list[int]] = defaultdict(list) self._corpus_ids: set[str] = set() self._corpus_size = 0 self._collisions = 0 @@ -202,6 +224,8 @@ def index( num_proc: int | None = None, ) -> None: self._title_to_ids.clear() + self._normalized_titles.clear() + self._title_ngrams_index.clear() self._corpus_ids.clear() count = 0 for row in corpus: @@ -212,6 +236,11 @@ def index( continue normalized = normalize_title(title) if normalized: + if normalized not in self._title_to_ids: + title_idx = len(self._normalized_titles) + self._normalized_titles.append(normalized) + for gram in _title_ngrams(normalized): + self._title_ngrams_index[gram].append(title_idx) self._title_to_ids[normalized].append(doc_id) count += 1 @@ -225,6 +254,31 @@ def index( f"[task={task_metadata.name}]" ) + def _get_fuzzy_title_candidates(self, norm_title: str) -> list[tuple[str, list[str]]]: + """Get top fuzzy candidates via n-gram overlap prefiltering.""" + grams = _title_ngrams(norm_title) + if not grams: + return [] + + overlap_counts: dict[int, int] = defaultdict(int) + for gram in grams: + for title_idx in self._title_ngrams_index.get(gram, []): + overlap_counts[title_idx] += 1 + + if not overlap_counts: + return [] + + top = nlargest( + self.title_fuzzy_max_candidates, + overlap_counts.items(), + key=lambda x: x[1], + ) + + return [ + (self._normalized_titles[title_idx], self._title_to_ids[self._normalized_titles[title_idx]]) + for title_idx, _ in top + ] + def search( self, queries: "QueryDatasetType", @@ -307,13 +361,27 @@ def _process_query(idx_qid_text): mapping_stats: dict[str, int] = { "doc_id_exact": 0, "title_fallback": 0, + "title_fuzzy": 0, "unmatched": 0, } for rank, paper in enumerate(response.results): hit = {"paper_id": paper.paper_id, "title": paper.title} - resolved_id, src = resolve_hit( - hit, self._title_to_ids, self._corpus_ids + norm_hit_title = normalize_title(hit.get("title") or "") + fuzzy_candidates = ( + self._get_fuzzy_title_candidates(norm_hit_title) + if self.title_fuzzy_enabled and norm_hit_title + else None + ) + resolved_id, src, fuzzy_similarity = resolve_hit( + hit, + self._title_to_ids, + self._corpus_ids, + title_fuzzy_enabled=self.title_fuzzy_enabled, + title_fuzzy_threshold=self.title_fuzzy_threshold, + title_fuzzy_margin=self.title_fuzzy_margin, + title_fuzzy_min_len=self.title_fuzzy_min_len, + title_norm_candidates=fuzzy_candidates, ) mapping_stats[src] = mapping_stats.get(src, 0) + 1 top_api_results.append( @@ -324,6 +392,7 @@ def _process_query(idx_qid_text): "score": paper.score, "resolved_corpus_id": resolved_id, "mapping_source": src, + "title_fuzzy_similarity": fuzzy_similarity, "is_relevant": ( bool(resolved_id and resolved_id in relevant_doc_ids) if relevant_doc_ids is not None diff --git a/dingo/run/cli.py b/dingo/run/cli.py index f554201b..15f6baf6 100644 --- a/dingo/run/cli.py +++ b/dingo/run/cli.py @@ -135,6 +135,26 @@ def parse_args(): "--max-queries", type=int, default=None, help="Limit number of queries for quick testing", ) + ret_parser.add_argument( + "--title-fuzzy-enabled", action="store_true", default=False, + help="Enable fuzzy title fallback matching (default: disabled)", + ) + ret_parser.add_argument( + "--title-fuzzy-threshold", type=float, default=0.95, + help="Minimum title similarity to accept fuzzy match (default: 0.95)", + ) + ret_parser.add_argument( + "--title-fuzzy-margin", type=float, default=0.01, + help="Minimum gap between best and second-best fuzzy score (default: 0.01)", + ) + ret_parser.add_argument( + "--title-fuzzy-min-len", type=int, default=20, + help="Minimum normalized title length for fuzzy matching (default: 20)", + ) + ret_parser.add_argument( + "--title-fuzzy-max-candidates", type=int, default=300, + help="Max fuzzy candidates re-ranked per hit (default: 300)", + ) ret_parser.add_argument( "--timeout", type=float, default=120.0, help="HTTP request timeout in seconds (default: 120)", @@ -351,6 +371,11 @@ def cmd_eval_retrieval(args): freshness_boost=args.freshness_boost, filters=filters, max_queries=args.max_queries, + title_fuzzy_enabled=args.title_fuzzy_enabled, + title_fuzzy_threshold=args.title_fuzzy_threshold, + title_fuzzy_margin=args.title_fuzzy_margin, + title_fuzzy_min_len=args.title_fuzzy_min_len, + title_fuzzy_max_candidates=args.title_fuzzy_max_candidates, timeout=args.timeout, rate_limit=args.rate_limit, max_retries=3, diff --git a/test/scripts/retrieval/test_eval_utils.py b/test/scripts/retrieval/test_eval_utils.py index 953f5820..ffb3800b 100644 --- a/test/scripts/retrieval/test_eval_utils.py +++ b/test/scripts/retrieval/test_eval_utils.py @@ -105,30 +105,69 @@ def setup_method(self): def test_exact_match_with_d_prefix(self): hit = {"paper_id": "d123", "title": "Something"} - cid, src = resolve_hit(hit, self.title_index, self.corpus_ids) + cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids) assert cid == "d123" assert src == "doc_id_exact" + assert score is None def test_exact_match_without_prefix(self): hit = {"paper_id": "123", "title": "Something"} - cid, src = resolve_hit(hit, self.title_index, self.corpus_ids) + cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids) assert cid == "d123" assert src == "doc_id_exact" + assert score is None def test_title_fallback(self): hit = {"paper_id": "999", "title": "Attention Is All You Need"} - cid, src = resolve_hit(hit, self.title_index, self.corpus_ids) + cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids) assert cid == "d123" assert src == "title_fallback" + assert score is None def test_unmatched(self): hit = {"paper_id": "999", "title": "Unknown Paper"} - cid, src = resolve_hit(hit, self.title_index, self.corpus_ids) + cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids) assert cid == "" assert src == "unmatched" + assert score is None def test_empty_hit(self): hit = {"paper_id": "", "title": ""} - cid, src = resolve_hit(hit, self.title_index, self.corpus_ids) + cid, src, score = resolve_hit(hit, self.title_index, self.corpus_ids) assert cid == "" assert src == "unmatched" + assert score is None + + def test_title_fuzzy_markup_match(self): + title_index = { + "linkagedisequilibriummappingofchek2commonvariationandbreastcancerrisk": ["d999"] + } + hit = { + "paper_id": "not-in-corpus", + "title": "Linkage disequilibrium mapping of [!i]CHEK2[!/i]:: Common variation and breast cancer risk", + } + cid, src, score = resolve_hit( + hit, + title_index, + {"d999"}, + title_fuzzy_enabled=True, + title_fuzzy_threshold=0.98, + title_fuzzy_margin=0.001, + title_fuzzy_min_len=10, + ) + assert cid == "d999" + assert src == "title_fuzzy" + assert score is not None and score >= 0.98 + + def test_title_fuzzy_disabled_stays_unmatched(self): + title_index = { + "linkagedisequilibriummappingofchek2commonvariationandbreastcancerrisk": ["d999"] + } + hit = { + "paper_id": "not-in-corpus", + "title": "Linkage disequilibrium mapping of [!i]CHEK2[!/i]:: Common variation and breast cancer risk", + } + cid, src, score = resolve_hit(hit, title_index, {"d999"}) + assert cid == "" + assert src == "unmatched" + assert score is None From bedd4cbcb5ad310e422c81ba942532ddaa4293f9 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 15 Jun 2026 07:48:27 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=9A=20Auto-update=20metrics=20docu?= =?UTF-8?q?mentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/metrics.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 3acdf41d..586d26f0 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -61,7 +61,7 @@ This document provides comprehensive information about all quality metrics used | Type | Metric | Description | Paper Source | Evaluation Results | Examples | |------|--------|-------------|--------------|-------------------|----------| | `QUALITY_BAD_COMPLETENESS` | RuleLineEndWithEllipsis, RuleLineEndWithTerminal, RuleSentenceNumber, RuleWordNumber | Checks whether the ratio of lines ending with ellipsis is below threshold; Checks whether the ratio of lines ending w... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A | -| `QUALITY_BAD_EFFECTIVENESS` | RuleDoi, RuleIsbn, RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDictConsistency | Check whether the string is in the correct format of the doi; Check whether the string is in the correct format of th... | Internal Implementation | N/A | N/A | +| `QUALITY_BAD_EFFECTIVENESS` | RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDictConsistency, RuleDoi, RuleIsbn | Detects garbled text and anti-crawling characters by combining special character and invisible character detection; D... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A | | `QUALITY_BAD_FLUENCY` | RuleAbnormalNumber, RuleCharSplit, RuleNoPunc, RuleWordSplit, RuleWordStuck | Checks PDF content for abnormal book page or index numbers that disrupt text flow; Checks PDF content for abnormal ch... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A | | `QUALITY_BAD_RELEVANCE` | RuleHeadWordAr, RuleHeadWordCs, RuleHeadWordHu, RuleHeadWordKo, RuleHeadWordRu, RuleHeadWordSr, RuleHeadWordTh, RuleHeadWordVi, RulePatternSearch, RuleWatermark | Checks whether Arabic content contains irrelevant tail source information; Checks whether Czech content contains irre... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A | | `QUALITY_BAD_SECURITY` | RuleIDCard, RuleUnsafeWords, RulePIIDetection | Checks whether content contains ID card information; Checks whether content contains unsafe words; Detects Personal I... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | N/A | @@ -126,7 +126,7 @@ This document provides comprehensive information about all quality metrics used | Type | Metric | Description | Paper Source | Evaluation Results | Examples | |------|--------|-------------|--------------|-------------------|----------| -| `QUALITY_BAD_EFFECTIVENESS` | RuleMetadataSimilarity | 检查元数据字段与基准数据的相似度匹配,阈值默认为0.6 | Internal Implementation | N/A | N/A | +| `QUALITY_BAD_EFFECTIVENESS` | RuleMetadataSimilarity, RuleQuanliangFieldValidation | 检查元数据字段与基准数据的相似度匹配,阈值默认为0.6; Validate Quanliang metadata fields and report invalid fields | Internal Implementation | N/A | N/A | ### Rule-Based RESUME Quality Metrics