Skip to content

Commit 2b0dbdd

Browse files
authored
fix: dedupe personal website candidates (#198)
* Fix website candidate dedupe
* Suppress heuristic URL backfill
* Refine website dedupe helpers
1 parent 89c965a commit 2b0dbdd

File tree

6 files changed

+299
-20
lines changed

6 files changed

+299
-20
lines changed

apps/worker/src/five08/worker/crm/resume_profile_processor.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from five08.clients.espo import EspoAPIError, EspoClient
1313
from five08.crm_normalization import (
1414
ROLE_NORMALIZATION_MAP,
15+
normalized_website_identity_key,
1516
normalize_city,
1617
normalize_country,
1718
normalize_role,
@@ -1046,8 +1047,8 @@ def _coerce_website_links(self, value: Any) -> list[str]:
10461047
normalized_link = self._normalize_website_url(raw_value.strip())
10471048
if normalized_link is None:
10481049
continue
1049-
dedupe_key = normalized_link.casefold()
1050-
if dedupe_key in seen:
1050+
dedupe_key = normalized_website_identity_key(normalized_link)
1051+
if dedupe_key is None or dedupe_key in seen:
10511052
continue
10521053
seen.add(dedupe_key)
10531054
normalized.append(normalized_link)
@@ -1066,8 +1067,8 @@ def _merge_website_links(
10661067
normalized = self._normalize_website_url(value)
10671068
if not normalized:
10681069
continue
1069-
dedupe_key = normalized.casefold()
1070-
if dedupe_key in seen:
1070+
dedupe_key = normalized_website_identity_key(normalized)
1071+
if dedupe_key is None or dedupe_key in seen:
10711072
continue
10721073
seen.add(dedupe_key)
10731074
merged.append(normalized)
@@ -1078,8 +1079,8 @@ def _merge_website_links(
10781079
normalized = self._normalize_website_url(value)
10791080
if not normalized:
10801081
continue
1081-
dedupe_key = normalized.casefold()
1082-
if dedupe_key in seen:
1082+
dedupe_key = normalized_website_identity_key(normalized)
1083+
if dedupe_key is None or dedupe_key in seen:
10831084
continue
10841085
seen.add(dedupe_key)
10851086
merged.append(normalized)

packages/shared/src/five08/crm_normalization.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,3 +707,36 @@ def normalize_website_url(
707707
elif normalized.startswith("http://www."):
708708
normalized = normalized.replace("http://www.", "http://", 1)
709709
return normalized
710+
711+
712+
def normalized_website_identity_key(normalized_url: str) -> str | None:
    """Return a comparison key identifying the page a normalized URL points to.

    The key is the casefolded ``host + path`` with ``?query`` appended when a
    query string is present. The scheme and fragment are ignored, a leading
    ``www.`` is dropped from the host, runs of slashes in the path are
    collapsed, and any trailing slash is removed, so
    ``http://Example.com/About`` and ``https://www.example.com/about/`` share
    one key.

    Args:
        normalized_url: URL string to derive the key from.

    Returns:
        The identity key; the casefolded input when it cannot be parsed as a
        URL; or ``None`` when nothing (no host, path, or query) is left to
        identify the URL, so callers can skip it instead of deduping on "".
    """
    try:
        parsed = urlsplit(normalized_url)
    except ValueError:
        # urlsplit reports malformed input (e.g. an unterminated IPv6
        # literal) with ValueError; fall back to whole-string comparison.
        return normalized_url.casefold() or None

    netloc = parsed.netloc.casefold()
    if netloc.startswith("www."):
        netloc = netloc[4:]

    # Collapse duplicate slashes first so "//about/" and "/about" agree.
    path = re.sub(r"/+", "/", parsed.path or "").rstrip("/").casefold()
    query = parsed.query.casefold()

    key = f"{netloc}{path}"
    if query:
        key = f"{key}?{query}"

    # An empty key carries no identity; report it as "no key".
    return key or None
727+
728+
729+
def website_identity_key(
    value: str,
    *,
    allow_scheme_less: bool = True,
    disallowed_host_predicate: Callable[[str], bool] | None = None,
) -> str | None:
    """Normalize a raw website value and return its identity key.

    ``value`` and both keyword options are forwarded unchanged to
    ``normalize_website_url``. When that call yields ``None`` this function
    returns ``None`` too; otherwise the normalized URL is passed to
    ``normalized_website_identity_key`` and its result is returned.
    """
    canonical_url = normalize_website_url(
        value,
        allow_scheme_less=allow_scheme_less,
        disallowed_host_predicate=disallowed_host_predicate,
    )
    return (
        None
        if canonical_url is None
        else normalized_website_identity_key(canonical_url)
    )

packages/shared/src/five08/resume_extractor.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from pydantic import BaseModel, ConfigDict, Field, ValidationError
1212
from five08.crm_normalization import (
13+
normalized_website_identity_key as shared_normalized_website_identity_key,
1314
normalize_city as shared_normalize_city,
1415
normalize_country as shared_normalize_country,
1516
normalize_role as shared_normalize_role,
@@ -1120,10 +1121,10 @@ def _normalize_website_links(value: Any) -> list[str]:
11201121
normalized_link = _normalize_website_url(candidate)
11211122
if not normalized_link:
11221123
continue
1123-
lower = normalized_link.lower()
1124-
if lower in seen:
1124+
dedupe_key = _website_identity_key(normalized_link)
1125+
if dedupe_key is None or dedupe_key in seen:
11251126
continue
1126-
seen.add(lower)
1127+
seen.add(dedupe_key)
11271128
normalized_links.append(normalized_link)
11281129

11291130
return normalized_links
@@ -1187,7 +1188,9 @@ def _extract_website_url_candidates(
11871188
if confidence < LLM_WEBSITE_URL_MIN_CONFIDENCE:
11881189
continue
11891190

1190-
key = normalized_url.casefold()
1191+
key = _website_identity_key(normalized_url)
1192+
if key is None:
1193+
continue
11911194
prior = normalized.get(key)
11921195
if prior is not None and prior[2] >= confidence:
11931196
continue
@@ -1250,6 +1253,10 @@ def _normalized_host(host: str | None) -> str:
12501253
return normalized
12511254

12521255

1256+
def _website_identity_key(value: str) -> str | None:
    """Return the dedupe identity key for ``value``.

    Delegates to ``normalized_website_identity_key`` from
    ``five08.crm_normalization`` (imported here under the ``shared_`` alias).
    """
    identity_key = shared_normalized_website_identity_key(value)
    return identity_key
1258+
1259+
12531260
def _host_matches_domain(host: str | None, domain: str) -> bool:
12541261
normalized_host = _normalized_host(host)
12551262
return normalized_host == domain or normalized_host.endswith(f".{domain}")
@@ -1587,6 +1594,7 @@ def _build_website_and_social_from_candidates(
15871594
) -> tuple[list[str], list[str]]:
15881595
urls_to_consider: list[str] = []
15891596
seen: set[str] = set()
1597+
has_llm_url_candidates = False
15901598

15911599
for candidate_url, candidate_kind, candidate_confidence in llm_candidates:
15921600
if candidate_kind == LLM_URL_CANDIDATE_KIND_PERSONAL:
@@ -1608,18 +1616,21 @@ def _build_website_and_social_from_candidates(
16081616
):
16091617
continue
16101618

1611-
candidate_key = candidate_url.casefold()
1612-
if candidate_key in seen:
1619+
candidate_key = _website_identity_key(candidate_url)
1620+
if candidate_key is None or candidate_key in seen:
16131621
continue
16141622
seen.add(candidate_key)
16151623
urls_to_consider.append(candidate_url)
1624+
has_llm_url_candidates = True
16161625

16171626
for candidate_url, candidate_confidence in heuristic_candidates:
16181627
if candidate_confidence < MIDDLE_WEBSITE_POSITION_SCALE:
16191628
continue
1629+
if has_llm_url_candidates:
1630+
continue
16201631

1621-
candidate_key = candidate_url.casefold()
1622-
if candidate_key in seen:
1632+
candidate_key = _website_identity_key(candidate_url)
1633+
if candidate_key is None or candidate_key in seen:
16231634
continue
16241635
seen.add(candidate_key)
16251636
urls_to_consider.append(candidate_url)
@@ -1647,8 +1658,8 @@ def _split_social_and_website_links(
16471658
continue
16481659
social_profile = _normalize_social_profile_url(candidate)
16491660
if social_profile:
1650-
social_key = social_profile.casefold()
1651-
if social_key in seen_social:
1661+
social_key = _website_identity_key(social_profile)
1662+
if social_key is None or social_key in seen_social:
16521663
continue
16531664
seen_social.add(social_key)
16541665
social_links.append(social_profile)
@@ -1658,8 +1669,8 @@ def _split_social_and_website_links(
16581669
if _is_personal_website_disallowed(candidate):
16591670
continue
16601671

1661-
normal_key = candidate.casefold()
1662-
if normal_key in seen_normal:
1672+
normal_key = _website_identity_key(candidate)
1673+
if normal_key is None or normal_key in seen_normal:
16631674
continue
16641675
seen_normal.add(normal_key)
16651676
normal_links.append(candidate)
@@ -3408,8 +3419,8 @@ def _extract_website_link_candidates(
34083419
end_index,
34093420
):
34103421
continue
3411-
normalized_key = normalized_link.casefold()
3412-
if normalized_key in seen:
3422+
normalized_key = _website_identity_key(normalized_link)
3423+
if normalized_key is None or normalized_key in seen:
34133424
continue
34143425
seen.add(normalized_key)
34153426
normalized_links.append((normalized_link, confidence))

tests/unit/test_crm_normalization.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
normalize_state,
1010
normalize_timezone,
1111
normalize_website_url,
12+
website_identity_key,
1213
)
1314

1415

@@ -47,6 +48,12 @@ def test_normalize_website_url_respects_disallowed_host_predicate() -> None:
4748
)
4849

4950

51+
def test_website_identity_key_ignores_scheme_and_path_casing() -> None:
    """URLs differing only in scheme, letter case and trailing slash share a key."""
    mixed_case_http_key = website_identity_key("http://Example.com/About")
    trailing_slash_https_key = website_identity_key("https://example.com/about/")
    assert mixed_case_http_key == trailing_slash_https_key
55+
56+
5057
def test_normalize_country_and_city() -> None:
5158
assert normalize_country(" united states ") == "United States"
5259
assert normalize_country("Taiwan") == "Taiwan"

0 commit comments

Comments
 (0)