From b47f2fef24d2b31200cc67e777ccff30d0c073df Mon Sep 17 00:00:00 2001 From: Chris Dell <2529187+dellch@users.noreply.github.com> Date: Tue, 2 Jun 2026 15:34:21 -0700 Subject: [PATCH 1/2] test: add failing tests for deterministic PR-description ticket selection extract_ticket_links_from_pr_description() accumulates issue URLs in a set and caps via list(set)[:3], so when a PR description references more than 3 issues the selected subset is arbitrary and varies between runs. These tests assert first-seen ordering and a deterministic capped subset. The cap test fails against the current set-based implementation on every hash seed (verified across PYTHONHASHSEED 0-11); the fix follows in the next commit. Co-Authored-By: Claude --- .../test_extract_issue_from_branch.py | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/unittest/test_extract_issue_from_branch.py b/tests/unittest/test_extract_issue_from_branch.py index 6e957ba3f8..e8a240448c 100644 --- a/tests/unittest/test_extract_issue_from_branch.py +++ b/tests/unittest/test_extract_issue_from_branch.py @@ -1,6 +1,12 @@ import pytest -from pr_agent.tools.ticket_pr_compliance_check import extract_ticket_links_from_branch_name +from pr_agent.tools.ticket_pr_compliance_check import ( + extract_ticket_links_from_branch_name, + extract_ticket_links_from_pr_description, +) + +# The PR-description extractor caps results at 3 (hardcoded in the function). +MAX_TICKETS = 3 class TestExtractTicketsLinkFromBranchName: @@ -110,3 +116,36 @@ def test_multiple_matches_deduplicated(self): "https://github.com/org/repo/issues/1", "https://github.com/org/repo/issues/2", } + + +class TestExtractTicketLinksFromPrDescription: + """GitHub issue extraction from the PR description.""" + + def test_preserves_first_seen_order(self): + """Issues are returned in first-seen order, de-duplicated. + + Note: this documents the intended ordered behaviour. It does not reliably fail + against the old set-based code, because set iteration order is randomised per + process (PYTHONHASHSEED) and may coincidentally match insertion order for a + small input. test_cap_selects_deterministic_first_seen_subset is the reliable + regression guard (see its note).""" + desc = "Fixes #3, relates to #1, also #3 again and #2" + result = extract_ticket_links_from_pr_description(desc, "org/repo", "https://github.com") + assert result == [ + "https://github.com/org/repo/issues/3", + "https://github.com/org/repo/issues/1", + "https://github.com/org/repo/issues/2", + ] + + def test_cap_selects_deterministic_first_seen_subset(self): + """When more than MAX_TICKETS issues are present, the first MAX_TICKETS in + first-seen order are kept (not an arbitrary subset from a set). + + This is the reliable regression guard for the bug: with > MAX_TICKETS issues, + the old code sliced list(set)[:MAX_TICKETS], so it returned an arbitrary subset + that (essentially) never equals the first-seen subset, on any hash seed.""" + nums = list(range(1, MAX_TICKETS + 4)) + desc = " ".join(f"#{n}" for n in nums) + result = extract_ticket_links_from_pr_description(desc, "org/repo", "https://github.com") + expected = [f"https://github.com/org/repo/issues/{n}" for n in nums[:MAX_TICKETS]] + assert result == expected From 03979aa26d2a8a093090172c33707ab96a45fb14 Mon Sep 17 00:00:00 2001 From: Chris Dell <2529187+dellch@users.noreply.github.com> Date: Tue, 2 Jun 2026 15:35:07 -0700 Subject: [PATCH 2/2] fix: make GitHub PR-description ticket selection deterministic Track issue URLs in first-seen order while de-duplicating (a seen set plus an ordered list), then apply the 3-ticket cap by slicing the ordered list, instead of accumulating in a set and slicing list(set)[:3]. Selection is now stable and predictable across runs. Behaviour is unchanged for 3 or fewer issues. Makes the tests added in the previous commit pass. Co-Authored-By: Claude --- pr_agent/tools/ticket_pr_compliance_check.py | 21 ++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pr_agent/tools/ticket_pr_compliance_check.py b/pr_agent/tools/ticket_pr_compliance_check.py index 6d25d76b19..b323986b12 100644 --- a/pr_agent/tools/ticket_pr_compliance_check.py +++ b/pr_agent/tools/ticket_pr_compliance_check.py @@ -39,31 +39,40 @@ def extract_ticket_links_from_pr_description(pr_description, repo_path, base_url """ Extract all ticket links from PR description """ - github_tickets = set() + # Preserve first-seen order while de-duplicating, so the cap below selects a + # deterministic subset (a plain set would slice an arbitrary, run-varying one). + seen = set() + github_tickets = [] + + def _add(url): + if url not in seen: + seen.add(url) + github_tickets.append(url) + try: # Use the updated pattern to find matches matches = GITHUB_TICKET_PATTERN.findall(pr_description) for match in matches: if match[0]: # Full URL match - github_tickets.add(match[0]) + _add(match[0]) elif match[1]: # Shorthand notation match: owner/repo#issue_number owner, repo, issue_number = match[2], match[3], match[4] - github_tickets.add(f'{base_url_html.strip("/")}/{owner}/{repo}/issues/{issue_number}') + _add(f"{base_url_html.strip('/')}/{owner}/{repo}/issues/{issue_number}") else: # #123 format issue_number = match[5][1:] # remove # if issue_number.isdigit() and len(issue_number) < 5 and repo_path: - github_tickets.add(f'{base_url_html.strip("/")}/{repo_path}/issues/{issue_number}') + _add(f"{base_url_html.strip('/')}/{repo_path}/issues/{issue_number}") if len(github_tickets) > 3: get_logger().info(f"Too many tickets found in PR description: {len(github_tickets)}") # Limit the number of tickets to 3 - github_tickets = set(list(github_tickets)[:3]) + github_tickets = github_tickets[:3] except Exception as e: get_logger().error(f"Error extracting tickets error= {e}", artifact={"traceback": traceback.format_exc()}) - return list(github_tickets) + return github_tickets def extract_ticket_links_from_branch_name(branch_name, repo_path, base_url_html="https://github.com"): """