1010
1111from pydantic import BaseModel , ConfigDict , Field , ValidationError
1212from five08 .crm_normalization import (
13+ normalized_website_identity_key as shared_normalized_website_identity_key ,
1314 normalize_city as shared_normalize_city ,
1415 normalize_country as shared_normalize_country ,
1516 normalize_role as shared_normalize_role ,
@@ -1120,10 +1121,10 @@ def _normalize_website_links(value: Any) -> list[str]:
11201121 normalized_link = _normalize_website_url (candidate )
11211122 if not normalized_link :
11221123 continue
1123- lower = normalized_link . lower ( )
1124- if lower in seen :
1124+ dedupe_key = _website_identity_key ( normalized_link )
1125+ if dedupe_key is None or dedupe_key in seen :
11251126 continue
1126- seen .add (lower )
1127+ seen .add (dedupe_key )
11271128 normalized_links .append (normalized_link )
11281129
11291130 return normalized_links
@@ -1187,7 +1188,9 @@ def _extract_website_url_candidates(
11871188 if confidence < LLM_WEBSITE_URL_MIN_CONFIDENCE :
11881189 continue
11891190
1190- key = normalized_url .casefold ()
1191+ key = _website_identity_key (normalized_url )
1192+ if key is None :
1193+ continue
11911194 prior = normalized .get (key )
11921195 if prior is not None and prior [2 ] >= confidence :
11931196 continue
@@ -1250,6 +1253,10 @@ def _normalized_host(host: str | None) -> str:
12501253 return normalized
12511254
12521255
def _website_identity_key(value: str) -> str | None:
    """Return the identity key used to de-duplicate website URLs.

    Module-local alias for ``shared_normalized_website_identity_key`` from
    ``five08.crm_normalization``; the value is passed through untouched and
    the shared helper's result is returned as-is.

    Args:
        value: URL string to derive a key for.

    Returns:
        The key produced by the shared helper, or ``None`` when the helper
        yields no key. Callers in this module skip a URL whose key is
        ``None``.

    NOTE(review): the exact normalization rules (scheme, ``www.``, trailing
    slash, case, ...) live in the shared helper and are not visible here --
    confirm there before relying on any specific equivalence.
    """
    identity_key = shared_normalized_website_identity_key(value)
    return identity_key
1258+
1259+
def _host_matches_domain(host: str | None, domain: str) -> bool:
    """Return whether ``host`` is ``domain`` itself or one of its subdomains.

    Args:
        host: Host name to test; may be ``None``. It is passed through
            ``_normalized_host`` before any comparison.
        domain: Domain to match against. It is compared exactly as given
            (this function does not normalize it), so callers should pass
            it in the same form ``_normalized_host`` produces.

    Returns:
        ``True`` when the normalized host equals ``domain`` or ends with
        ``"." + domain``; ``False`` otherwise.
    """
    normalized_host = _normalized_host(host)
    if normalized_host == domain:
        return True
    # The leading dot anchors the suffix on a label boundary, so
    # "notexample.com" is not treated as a subdomain of "example.com".
    # The suffix must contain no stray whitespace: any space after the
    # replacement field is literal text and would make this never match.
    return normalized_host.endswith(f".{domain}")
@@ -1587,6 +1594,7 @@ def _build_website_and_social_from_candidates(
15871594) -> tuple [list [str ], list [str ]]:
15881595 urls_to_consider : list [str ] = []
15891596 seen : set [str ] = set ()
1597+ has_llm_url_candidates = False
15901598
15911599 for candidate_url , candidate_kind , candidate_confidence in llm_candidates :
15921600 if candidate_kind == LLM_URL_CANDIDATE_KIND_PERSONAL :
@@ -1608,18 +1616,21 @@ def _build_website_and_social_from_candidates(
16081616 ):
16091617 continue
16101618
1611- candidate_key = candidate_url . casefold ( )
1612- if candidate_key in seen :
1619+ candidate_key = _website_identity_key ( candidate_url )
1620+ if candidate_key is None or candidate_key in seen :
16131621 continue
16141622 seen .add (candidate_key )
16151623 urls_to_consider .append (candidate_url )
1624+ has_llm_url_candidates = True
16161625
16171626 for candidate_url , candidate_confidence in heuristic_candidates :
16181627 if candidate_confidence < MIDDLE_WEBSITE_POSITION_SCALE :
16191628 continue
1629+ if has_llm_url_candidates :
1630+ continue
16201631
1621- candidate_key = candidate_url . casefold ( )
1622- if candidate_key in seen :
1632+ candidate_key = _website_identity_key ( candidate_url )
1633+ if candidate_key is None or candidate_key in seen :
16231634 continue
16241635 seen .add (candidate_key )
16251636 urls_to_consider .append (candidate_url )
@@ -1647,8 +1658,8 @@ def _split_social_and_website_links(
16471658 continue
16481659 social_profile = _normalize_social_profile_url (candidate )
16491660 if social_profile :
1650- social_key = social_profile . casefold ( )
1651- if social_key in seen_social :
1661+ social_key = _website_identity_key ( social_profile )
1662+ if social_key is None or social_key in seen_social :
16521663 continue
16531664 seen_social .add (social_key )
16541665 social_links .append (social_profile )
@@ -1658,8 +1669,8 @@ def _split_social_and_website_links(
16581669 if _is_personal_website_disallowed (candidate ):
16591670 continue
16601671
1661- normal_key = candidate . casefold ( )
1662- if normal_key in seen_normal :
1672+ normal_key = _website_identity_key ( candidate )
1673+ if normal_key is None or normal_key in seen_normal :
16631674 continue
16641675 seen_normal .add (normal_key )
16651676 normal_links .append (candidate )
@@ -3408,8 +3419,8 @@ def _extract_website_link_candidates(
34083419 end_index ,
34093420 ):
34103421 continue
3411- normalized_key = normalized_link . casefold ( )
3412- if normalized_key in seen :
3422+ normalized_key = _website_identity_key ( normalized_link )
3423+ if normalized_key is None or normalized_key in seen :
34133424 continue
34143425 seen .add (normalized_key )
34153426 normalized_links .append ((normalized_link , confidence ))
0 commit comments