From ad09db91deb45e9ad29a84e0a8de2d1cf9519ed2 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Mon, 9 Mar 2026 21:30:58 +0100 Subject: [PATCH 1/4] fix: skip heuristic role backfill for llm roles --- .../shared/src/five08/resume_extractor.py | 35 +++++++---- tests/unit/test_resume_extractor.py | 62 +++++++++++++++++++ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/packages/shared/src/five08/resume_extractor.py b/packages/shared/src/five08/resume_extractor.py index e4aa766..0f2ebc0 100644 --- a/packages/shared/src/five08/resume_extractor.py +++ b/packages/shared/src/five08/resume_extractor.py @@ -2330,6 +2330,29 @@ def extract( limit=5, ) parsed_role_rationale = _normalize_scalar(parsed.get("role_rationale")) + parsed_primary_roles_raw = parsed.get("primary_roles") + if not parsed_primary_roles_raw: + parsed_primary_roles_raw = parsed.get("primary_role") + parsed_primary_roles = _normalize_role_collection(parsed_primary_roles_raw) + llm_provided_role_suggestion = False + if isinstance(parsed_primary_roles_raw, str): + llm_provided_role_suggestion = bool(parsed_primary_roles_raw.strip()) + elif isinstance(parsed_primary_roles_raw, (list, tuple)): + llm_provided_role_suggestion = any( + isinstance(item, str) and bool(item.strip()) + for item in parsed_primary_roles_raw + ) + resolved_primary_roles = parsed_primary_roles + if not llm_provided_role_suggestion: + resolved_primary_roles = ( + resolved_primary_roles + or self._infer_roles_from_signals( + current_title=parsed_current_title, + recent_titles=parsed_recent_titles, + role_rationale=parsed_role_rationale, + ) + or self._infer_roles_from_resume(resume_text) + ) ( parsed_city, parsed_state, @@ -2354,17 +2377,7 @@ def extract( email=parsed_email, additional_emails=parsed_emails, description=_normalize_description(parsed.get("description")), - primary_roles=( - _normalize_role_collection( - parsed.get("primary_roles") or parsed.get("primary_role") - ) - or self._infer_roles_from_signals( - current_title=parsed_current_title, - recent_titles=parsed_recent_titles, - role_rationale=parsed_role_rationale, - ) - or self._infer_roles_from_resume(resume_text) - ), + primary_roles=resolved_primary_roles, github_username=github_username, linkedin_url=linkedin_url, timezone=parsed_timezone, diff --git a/tests/unit/test_resume_extractor.py b/tests/unit/test_resume_extractor.py index 3134856..5fbb916 100644 --- a/tests/unit/test_resume_extractor.py +++ b/tests/unit/test_resume_extractor.py @@ -1713,6 +1713,68 @@ def create(**_: object) -> object: assert result.current_location_evidence is not None +def test_extract_does_not_backfill_heuristic_roles_when_llm_suggests_roles() -> None: + """LLM-suggested role fields should not be expanded by heuristic role inference.""" + + class _FakeChatCompletions: + @staticmethod + def create(**_: object) -> object: + return type( + "Response", + (), + { + "choices": [ + type( + "Choice", + (), + { + "message": type( + "Message", + (), + { + "content": ( + '{"name": "Jane Doe", ' + '"email": "jane@example.com", ' + '"primary_roles": ["platform specialist"], ' + '"current_title": "Software Engineer", ' + '"recent_titles": ["Software Engineer"], ' + '"role_rationale": "Engineering title indicates a developer profile.", ' + '"current_location_raw": null, ' + '"current_location_source": null, ' + '"current_location_evidence": null, ' + '"address_city": null, ' + '"address_state": null, ' + '"address_country": null, ' + '"timezone": null, ' + '"website_url_candidates": [], ' + '"website_links": [], ' + '"social_links": [], ' + '"phone": null, ' + '"skills": [], ' + '"skill_attrs": null, ' + '"confidence": 0.88}' + ) + }, + )() + }, + )() + ] + }, + )() + + extractor = ResumeProfileExtractor(api_key="test-key") + extractor.client = type( + "Client", + (), + {"chat": type("Chat", (), {"completions": _FakeChatCompletions()})()}, + )() + extractor.model = "fake-model" + + result = extractor.extract("Jane Doe\nSoftware Engineer") + + assert result.primary_roles == [] + + def test_extract_discards_invalid_country_and_repairs_current_location_region() -> None: """Invalid LLM location fields should be replaced by deterministic parsing.""" From b1ac43224cf86a4ce5eed77bad0d240c340d25a3 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Mon, 9 Mar 2026 22:02:29 +0100 Subject: [PATCH 2/4] test: fix llm role suggestion expectation --- tests/unit/test_resume_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_resume_extractor.py b/tests/unit/test_resume_extractor.py index 5fbb916..da3109e 100644 --- a/tests/unit/test_resume_extractor.py +++ b/tests/unit/test_resume_extractor.py @@ -1772,7 +1772,7 @@ def create(**_: object) -> object: result = extractor.extract("Jane Doe\nSoftware Engineer") - assert result.primary_roles == [] + assert result.primary_roles == ["platform specialist"] def test_extract_discards_invalid_country_and_repairs_current_location_region() -> None: From 5ab00d8fc3e92f53b9705253a2c6075251c5ce63 Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Mon, 9 Mar 2026 22:04:05 +0100 Subject: [PATCH 3/4] test: assert no heuristic developer role add --- tests/unit/test_resume_extractor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/test_resume_extractor.py b/tests/unit/test_resume_extractor.py index da3109e..11c8ae2 100644 --- a/tests/unit/test_resume_extractor.py +++ b/tests/unit/test_resume_extractor.py @@ -1772,7 +1772,10 @@ def create(**_: object) -> object: result = extractor.extract("Jane Doe\nSoftware Engineer") + # LLM-suggested roles should be preserved, and heuristic roles like + # "developer" should not be added on top. assert result.primary_roles == ["platform specialist"] + assert "developer" not in result.primary_roles def test_extract_discards_invalid_country_and_repairs_current_location_region() -> None: From dd02aabe9a6fdc521a4b1e2b1ae55cf2a9219eec Mon Sep 17 00:00:00 2001 From: Michael Wu Date: Mon, 9 Mar 2026 22:11:29 +0100 Subject: [PATCH 4/4] fix: gate role heuristics on normalized llm roles --- .../shared/src/five08/resume_extractor.py | 9 +-- tests/unit/test_resume_extractor.py | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/packages/shared/src/five08/resume_extractor.py b/packages/shared/src/five08/resume_extractor.py index 0f2ebc0..738d083 100644 --- a/packages/shared/src/five08/resume_extractor.py +++ b/packages/shared/src/five08/resume_extractor.py @@ -2334,14 +2334,7 @@ def extract( if not parsed_primary_roles_raw: parsed_primary_roles_raw = parsed.get("primary_role") parsed_primary_roles = _normalize_role_collection(parsed_primary_roles_raw) - llm_provided_role_suggestion = False - if isinstance(parsed_primary_roles_raw, str): - llm_provided_role_suggestion = bool(parsed_primary_roles_raw.strip()) - elif isinstance(parsed_primary_roles_raw, (list, tuple)): - llm_provided_role_suggestion = any( - isinstance(item, str) and bool(item.strip()) - for item in parsed_primary_roles_raw - ) + llm_provided_role_suggestion = bool(parsed_primary_roles) resolved_primary_roles = parsed_primary_roles if not llm_provided_role_suggestion: resolved_primary_roles = ( diff --git a/tests/unit/test_resume_extractor.py b/tests/unit/test_resume_extractor.py index 11c8ae2..8b0b244 100644 --- a/tests/unit/test_resume_extractor.py +++ b/tests/unit/test_resume_extractor.py @@ -1778,6 +1778,70 @@ def create(**_: object) -> object: assert "developer" not in result.primary_roles +def test_extract_does_not_backfill_heuristic_roles_for_legacy_primary_role() -> None: + """Legacy primary_role should also suppress heuristic role expansion.""" + + class _FakeChatCompletions: + @staticmethod + def create(**_: object) -> object: + return type( + "Response", + (), + { + "choices": [ + type( + "Choice", + (), + { + "message": type( + "Message", + (), + { + "content": ( + '{"name": "Jane Doe", ' + '"email": "jane@example.com", ' + '"primary_roles": null, ' + '"primary_role": "platform specialist", ' + '"current_title": "Software Engineer", ' + '"recent_titles": ["Software Engineer"], ' + '"role_rationale": "Engineering title indicates a developer profile.", ' + '"current_location_raw": null, ' + '"current_location_source": null, ' + '"current_location_evidence": null, ' + '"address_city": null, ' + '"address_state": null, ' + '"address_country": null, ' + '"timezone": null, ' + '"website_url_candidates": [], ' + '"website_links": [], ' + '"social_links": [], ' + '"phone": null, ' + '"skills": [], ' + '"skill_attrs": null, ' + '"confidence": 0.88}' + ) + }, + )() + }, + )() + ] + }, + )() + + extractor = ResumeProfileExtractor(api_key="test-key") + extractor.client = type( + "Client", + (), + {"chat": type("Chat", (), {"completions": _FakeChatCompletions()})()}, + )() + extractor.model = "fake-model" + + result = extractor.extract("Jane Doe\nSoftware Engineer") + + assert result.primary_roles == ["platform specialist"] + assert "developer" not in result.primary_roles + + def test_extract_discards_invalid_country_and_repairs_current_location_region() -> None: """Invalid LLM location fields should be replaced by deterministic parsing."""