diff --git a/app/verify/cli.py b/app/verify/cli.py
index 6a9db3e56af..a2883ef229b 100644
--- a/app/verify/cli.py
+++ b/app/verify/cli.py
@@ -406,15 +406,12 @@ def cmd_crossref(args: argparse.Namespace) -> int:
     now_year = offline.now_year_today()
     categories = tuple(args.category) if args.category else CATEGORIES
 
-    # Escalation target: yellow/red unverified frontier (greens promote via live T1).
-    targets = []
-    for rec in _ranked_unverified(records, soc_release, now_year, categories):
-        s = offline.score_record(rec, now_year, soc_release)
-        if s.band in ("yellow", "red"):
-            targets.append(rec)
-    targets = targets[: args.max]
+    # Cross-reference the whole unverified frontier, ranked by score. Greens are
+    # included on purpose: reality must be able to CONFIRM them (strongest promote)
+    # or CONTRADICT them (veto) before they are verified.
+    targets = _ranked_unverified(records, soc_release, now_year, categories)[: args.max]
 
-    fetcher = crossref.WikipediaFetcher()
+    fetcher = crossref.WikidataFetcher()
     cache = promote.load_crossref_cache()
     ts = _now_iso()
     decisions = Counter()
@@ -569,7 +566,7 @@ def cmd_pr(args: argparse.Namespace) -> int:
     print()
 
     # Tier 2 — external cross-reference (network, exact-heading only).
-    fetcher = crossref.WikipediaFetcher()
+    fetcher = crossref.WikidataFetcher()
     xref: dict[str, str] = {}
     decisions = Counter()
     for r, _ in scored:
diff --git a/app/verify/crossref.py b/app/verify/crossref.py
index d181aa3c6d0..adc804519d4 100644
--- a/app/verify/crossref.py
+++ b/app/verify/crossref.py
@@ -59,10 +59,29 @@ def _year_of(value: Any) -> int | None:
     return None
 
 
+def _heading_matches(rec_name: str, cand_title: str) -> bool:
+    """Exact normalized match, or one name is the model-name suffix of the other
+    (authoritative sources often omit the maker prefix: record 'AMD Ryzen 7
+    5800X' vs Wikidata label 'Ryzen 7 5800X'). This is NOT fuzzy matching — the
+    shorter name must be a full, contiguous suffix of >=4 normalized chars (in
+    either direction), so it can't drift to a different SKU like Levenshtein."""
+    r, c = normalize_heading(rec_name), normalize_heading(cand_title)
+    if not r or not c:
+        return False
+    if r == c:
+        return True
+    return min(len(r), len(c)) >= 4 and (r.endswith(c) or c.endswith(r))
+
+
 def crossref_record(
     rec: dict[str, Any], fetcher: Fetcher, source: str = "wikidata"
 ) -> CrossrefResult:
-    """Decide confirm/ambiguous/contradict/notfound for one record."""
+    """Decide confirm/ambiguous/contradict/notfound for one record.
+
+    Reality-based: CONFIRM requires an exact-heading authoritative entity whose
+    release year agrees. A year disagreement is a CONTRADICT (reality veto — the
+    record must NOT be promoted, even if it scored green). A name match with no
+    comparable year is only AMBIGUOUS (existence, but specs unconfirmed)."""
     name = rec.get("name")
     slug = rec.get("slug") or ""
     if not isinstance(name, str) or not name.strip():
@@ -72,27 +91,80 @@ def crossref_record(
     if not candidates:
         return CrossrefResult(slug, source, NOTFOUND, False, None, 0)
 
-    target = normalize_heading(name)
-    exact = [c for c in candidates if normalize_heading(c.title) == target]
+    exact = [c for c in candidates if _heading_matches(name, c.title)]
     if not exact:
-        # Something came back, but no title matches exactly -> do not trust.
         return CrossrefResult(slug, source, AMBIGUOUS, False, candidates[0].url, 0)
 
-    cand = exact[0]
-    # Secondary gate: if both sides expose a release year, they must roughly agree.
+    # Prefer an exact match that carries a year (so we can actually confirm specs).
+    cand = next((c for c in exact if c.year is not None), exact[0])
     rec_year = _year_of(rec.get("release_date"))
-    agreements = 0
     if rec_year is not None and cand.year is not None:
         if abs(cand.year - rec_year) <= 1:
-            agreements = 1
-        else:
-            return CrossrefResult(slug, source, CONTRADICT, True, cand.url, 0)
-    return CrossrefResult(slug, source, CONFIRM, True, cand.url, agreements)
+            return CrossrefResult(slug, source, CONFIRM, True, cand.url, 1)
+        return CrossrefResult(slug, source, CONTRADICT, True, cand.url, 0)
+    # Name matches an authoritative entity but no year to verify the data against.
+    return CrossrefResult(slug, source, AMBIGUOUS, True, cand.url, 0)
 
 
 # --- concrete fetchers (network; not exercised by unit tests) --------------------
 
 
+def _wikidata_claim_year(entity: dict) -> int | None:
+    """First year from inception (P571) or publication date (P577) claims."""
+    claims = entity.get("claims", {})
+    for prop in ("P571", "P577"):
+        for claim in claims.get(prop, []):
+            try:
+                t = claim["mainsnak"]["datavalue"]["value"]["time"]  # "+2007-02-19T..."
+            except (KeyError, TypeError):
+                continue
+            digits = t.lstrip("+")[:4]
+            if digits.isdigit():
+                return int(digits)
+    return None
+
+
+class WikidataFetcher:
+    """Structured cross-reference against Wikidata: search entities by label, then
+    read their release year (P571/P577) to verify the record's data against reality.
+    Two HTTP calls per record (search + a batched entity fetch)."""
+
+    API = "https://www.wikidata.org/w/api.php"
+    UA = "TechAPI-verify/0.1 (https://github.com/GetTechAPI)"
+
+    def __init__(self, timeout: float = 10.0, limit: int = 5) -> None:
+        self.timeout = timeout
+        self.limit = limit
+
+    def _get(self, url: str) -> dict:
+        req = Request(url, headers={"User-Agent": self.UA})
+        with urlopen(req, timeout=self.timeout) as resp:
+            return json.loads(resp.read().decode("utf-8"))
+
+    def search(self, name: str) -> list[Candidate]:
+        try:
+            data = self._get(
+                f"{self.API}?action=wbsearchentities&format=json&language=en"
+                f"&limit={self.limit}&search={quote(name)}"
+            )
+            hits = data.get("search", [])
+            if not hits:
+                return []
+            ids = "|".join(h["id"] for h in hits if h.get("id"))
+            ent = self._get(
+                f"{self.API}?action=wbgetentities&format=json&props=claims&ids={quote(ids)}"
+            ).get("entities", {})
+        except Exception:
+            return []
+        out: list[Candidate] = []
+        for h in hits:
+            qid = h.get("id")
+            label = h.get("label") or h.get("match", {}).get("text", "")
+            year = _wikidata_claim_year(ent.get(qid, {})) if qid else None
+            out.append(Candidate(title=label, url=f"https://www.wikidata.org/wiki/{qid}", year=year))
+        return out
+
+
 class WikipediaFetcher:
     """Queries the MediaWiki opensearch API for candidate page titles."""
 
diff --git a/app/verify/promote.py b/app/verify/promote.py
index 4d70acadbd6..2485f8974b7 100644
--- a/app/verify/promote.py
+++ b/app/verify/promote.py
@@ -58,8 +58,17 @@ def decide(
     *, band: str, source_urls: list[str], url_cache: dict[str, dict[str, Any]],
     crossref_decision: str | None,
 ) -> PromotionDecision:
+    # Reality veto: if an authoritative external source contradicts the record's
+    # specs (e.g. release year mismatch), never promote — even a green record.
+    # Accuracy must be reality-based; that's the whole point of verification.
+    if crossref_decision == "contradict":
+        return PromotionDecision(False, "crossref-contradict")
+    # Reality confirm: external source agrees -> strongest promotion.
     if crossref_decision == "confirm":
         return PromotionDecision(True, "crossref-confirm")
+    # Heuristic fallback where reality is silent: a green record (consistent +
+    # complete + authoritative-source) whose source is live. green≈verified was
+    # validated against the human-curated set, so this is a sound proxy.
     if band == "green" and has_live_authoritative_source(source_urls, url_cache):
         return PromotionDecision(True, "green+live-source")
     return PromotionDecision(False, "needs-confirmation")
diff --git a/tests/verify/test_promote_crossref.py b/tests/verify/test_promote_crossref.py
index c45cd009188..4457dc106c1 100644
--- a/tests/verify/test_promote_crossref.py
+++ b/tests/verify/test_promote_crossref.py
@@ -43,6 +43,27 @@ def test_no_candidates_is_notfound():
     assert crossref.crossref_record(rec, FakeFetcher([])).decision == crossref.NOTFOUND
 
 
+def test_exact_heading_without_year_is_ambiguous():
+    # Name matches an authoritative entity but there's no year to verify specs.
+    rec = {"slug": "x", "name": "Widget 9000", "release_date": "2018-01-01"}
+    f = FakeFetcher([Candidate("Widget 9000", "http://x", None)])
+    assert crossref.crossref_record(rec, f).decision == crossref.AMBIGUOUS
+
+
+def test_model_suffix_matches_maker_prefixed_record():
+    # Wikidata often labels without the maker prefix.
+    rec = {"slug": "x", "name": "AMD Ryzen 7 5800X", "release_date": "2020-11-05"}
+    f = FakeFetcher([Candidate("Ryzen 7 5800X", "http://x", 2020)])
+    assert crossref.crossref_record(rec, f).decision == crossref.CONFIRM
+
+
+def test_short_record_name_does_not_suffix_match():
+    # A 1-3 char record name must not latch onto the tail of a longer label.
+    rec = {"slug": "x", "name": "X", "release_date": "2017-11-03"}
+    f = FakeFetcher([Candidate("iPhone X", "http://x", 2017)])
+    assert crossref.crossref_record(rec, f).decision == crossref.AMBIGUOUS
+
+
 def test_normalize_heading():
     assert crossref.normalize_heading("iPhone XR") == "iphonexr"
     assert crossref.normalize_heading("Core i9-14900K") == "corei914900k"
@@ -135,6 +156,17 @@ def test_yellow_with_crossref_confirm_promotes():
     assert d.promote and d.reason == "crossref-confirm"
 
 
+def test_crossref_contradict_vetoes_even_green():
+    # Reality veto: a green record with a live source is NOT promoted if an
+    # authoritative source contradicts its specs.
+    cache = {"https://en.wikipedia.org/wiki/X": {"alive": True}}
+    d = promote.decide(
+        band="green", source_urls=["https://en.wikipedia.org/wiki/X"],
+        url_cache=cache, crossref_decision="contradict",
+    )
+    assert not d.promote and d.reason == "crossref-contradict"
+
+
 def test_dead_t1_does_not_promote():
     cache = {"https://en.wikipedia.org/wiki/X": {"alive": False}}
     d = promote.decide(band="green", source_urls=["https://en.wikipedia.org/wiki/X"],