From 505e2d28a00bc10a79ba6f583189e5d721391aa9 Mon Sep 17 00:00:00 2001 From: "Vishal Koparde, PhD" Date: Thu, 28 May 2026 14:02:17 -0400 Subject: [PATCH 1/7] fix(metrics): count contributor commits across all branches Update contributor counting to traverse all repository branches and deduplicate by commit SHA so merged commits are not double-counted. Enable all-branch counting in the profile README generation call for TOP contributors. _commit message is ai-generated_ --- profile/README.qmd | 2 +- src/make_readme/get_per_user_commits.py | 118 +++++++++++++++++------- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/profile/README.qmd b/profile/README.qmd index aa2e359..5716386 100644 --- a/profile/README.qmd +++ b/profile/README.qmd @@ -43,7 +43,7 @@ print(get_recent_releases_table(nmonths=3)) ## TOP contributors ```{python} -print(get_per_user_commits()) +print(get_per_user_commits(include_all_branches=True)) ``` ## About Us diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 7666b5e..05c2283 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -86,54 +86,106 @@ def get_members(org_name): return members -def get_commits_count(repo_full_name, eligible_members): - commits_count_by_user = defaultdict( - lambda: {"total": 0, "last_month": 0, "last_6_months": 0} - ) +def get_repo_branches(repo_full_name): + branches = [] page = 1 - today = datetime.utcnow() - one_month_ago = today - timedelta(days=30) - six_months_ago = today - timedelta(days=180) - has_more_pages = True + while has_more_pages: response = requests.get( - f"https://api.github.com/repos/{repo_full_name}/commits?per_page=100&page={page}", + f"https://api.github.com/repos/{repo_full_name}/branches?per_page=100&page={page}", headers=headers, ) if response.status_code != 200: if is_critical_api_error(response): - raise_api_error(response, f"commits for repo '{repo_full_name}'") + raise_api_error(response, f"branches for repo '{repo_full_name}'") log_noncritical_api_error( response, - f"commits for repo '{repo_full_name}'", - "no commits for that repository", + f"branches for repo '{repo_full_name}'", + "default-branch-only commit counting", logger, ) - has_more_pages = False - continue - commits = response.json() - has_more_pages = bool(commits) - - for commit in commits: - author_login = commit["author"]["login"] if commit["author"] else "unknown" - commit_date_str = commit["commit"]["author"]["date"] - commit_date = datetime.strptime(commit_date_str, "%Y-%m-%dT%H:%M:%SZ") - - if author_login != "unknown" and author_login in eligible_members: - commits_count_by_user[author_login]["total"] += 1 - if commit_date >= one_month_ago: - commits_count_by_user[author_login]["last_month"] += 1 - if commit_date >= six_months_ago: - commits_count_by_user[author_login]["last_6_months"] += 1 + return [] + + page_branches = response.json() + has_more_pages = bool(page_branches) + for branch in page_branches: + branch_name = branch.get("name") + if branch_name: + branches.append(branch_name) if has_more_pages: page += 1 + return branches + + +def get_commits_count(repo_full_name, eligible_members, include_all_branches=False): + commits_count_by_user = defaultdict( + lambda: {"total": 0, "last_month": 0, "last_6_months": 0} + ) + today = datetime.utcnow() + one_month_ago = today - timedelta(days=30) + six_months_ago = today - timedelta(days=180) + + branch_refs = [None] + if include_all_branches: + repo_branches = get_repo_branches(repo_full_name) + if repo_branches: + branch_refs = repo_branches + + seen_shas = set() + for branch_ref in branch_refs: + page = 1 + has_more_pages = True + + while has_more_pages: + commits_url = ( + f"https://api.github.com/repos/{repo_full_name}/commits?per_page=100&page={page}" + ) + if branch_ref: + commits_url += f"&sha={branch_ref}" + + response = requests.get(commits_url, headers=headers) + if response.status_code != 200: + if is_critical_api_error(response): + raise_api_error(response, f"commits for repo '{repo_full_name}'") + log_noncritical_api_error( + response, + f"commits for repo '{repo_full_name}'", + "no commits for that repository", + logger, + ) + break + + commits = response.json() + has_more_pages = bool(commits) + + for commit in commits: + sha = commit.get("sha") + if sha in seen_shas: + continue + if sha: + seen_shas.add(sha) + + author_login = commit["author"]["login"] if commit["author"] else "unknown" + commit_date_str = commit["commit"]["author"]["date"] + commit_date = datetime.strptime(commit_date_str, "%Y-%m-%dT%H:%M:%SZ") + + if author_login != "unknown" and author_login in eligible_members: + commits_count_by_user[author_login]["total"] += 1 + if commit_date >= one_month_ago: + commits_count_by_user[author_login]["last_month"] += 1 + if commit_date >= six_months_ago: + commits_count_by_user[author_login]["last_6_months"] += 1 + + if has_more_pages: + page += 1 + return commits_count_by_user -def get_per_user_commits(): +def get_per_user_commits(include_all_branches=False): members = get_members(ORG_NAME) repos = get_repos(ORG_NAME) @@ -144,7 +196,11 @@ def get_per_user_commits(): for repo in repos: repo_full_name = repo["full_name"] # print(f"Processing repository: {repo_full_name}") - commits_count_by_user = get_commits_count(repo_full_name, members) + commits_count_by_user = get_commits_count( + repo_full_name, + members, + include_all_branches=include_all_branches, + ) for user, counts in commits_count_by_user.items(): user_commits[user]["total"] += counts["total"] user_commits[user]["last_month"] += counts["last_month"] @@ -189,7 +245,7 @@ def get_per_user_commits(): def main(): - print(get_per_user_commits()) + print(get_per_user_commits(include_all_branches=True)) if __name__ == "__main__": From 967bd99615fdd461adebb64ef0f25376f22d43d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 May 2026 20:50:14 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/make_readme/get_per_user_commits.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 05c2283..05b8ab6 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -140,9 +140,7 @@ def get_commits_count(repo_full_name, eligible_members, include_all_branches=Fal has_more_pages = True while has_more_pages: - commits_url = ( - f"https://api.github.com/repos/{repo_full_name}/commits?per_page=100&page={page}" - ) + commits_url = f"https://api.github.com/repos/{repo_full_name}/commits?per_page=100&page={page}" if branch_ref: commits_url += f"&sha={branch_ref}" @@ -168,7 +166,9 @@ def get_commits_count(repo_full_name, eligible_members, include_all_branches=Fal if sha: seen_shas.add(sha) - author_login = commit["author"]["login"] if commit["author"] else "unknown" + author_login = ( + commit["author"]["login"] if commit["author"] else "unknown" + ) commit_date_str = commit["commit"]["author"]["date"] commit_date = datetime.strptime(commit_date_str, "%Y-%m-%dT%H:%M:%SZ") From f51f7401537007bedf9e1c3708c203ee381b7a9c Mon Sep 17 00:00:00 2001 From: "Vishal Koparde, PhD" Date: Thu, 28 May 2026 16:59:35 -0400 Subject: [PATCH 3/7] perf(metrics): add cached incremental all-branch commit counting Implement persistent state-backed contributor counting to reduce API load while keeping counts accurate. Adds branch tip tracking, compare-based delta processing, full-resync fallback for non-fast-forward histories, and configurable archived/fork repository filters. _commit message is ai-generated_ --- src/make_readme/get_per_user_commits.py | 383 +++++++++++++++++++----- 1 file changed, 304 insertions(+), 79 deletions(-) diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 05b8ab6..884b915 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -1,9 +1,12 @@ -import requests -import os -import pandas as pd +import json import logging +import os from collections import defaultdict from datetime import datetime, timedelta +from pathlib import Path + +import pandas as pd +import requests try: from .github_api import ( @@ -34,6 +37,95 @@ logger = logging.getLogger(__name__) +STATE_SCHEMA_VERSION = 1 +STATE_FILE_PATH = ( + Path(__file__).resolve().parents[2] + / "profile" + / "activity_data" + / "per_user_commits_state.json" +) + + +def initialize_state(): + return { + "schema_version": STATE_SCHEMA_VERSION, + "updated_at": None, + "repos": {}, + "commit_metadata": {}, + } + + +def load_state(state_path=STATE_FILE_PATH): + if not state_path.exists(): + return initialize_state() + + try: + with state_path.open("r", encoding="utf-8") as fh: + state = json.load(fh) + except (OSError, json.JSONDecodeError) as error: + logger.warning( + "Failed to read state file %s (%s). Rebuilding state from scratch.", + state_path, + error, + ) + return initialize_state() + + if state.get("schema_version") != STATE_SCHEMA_VERSION: + logger.warning( + "State file schema mismatch in %s. Rebuilding state from scratch.", + state_path, + ) + return initialize_state() + + if not isinstance(state.get("repos"), dict) or not isinstance( + state.get("commit_metadata"), dict + ): + logger.warning( + "State file %s is missing required keys. Rebuilding state from scratch.", + state_path, + ) + return initialize_state() + + return state + + +def save_state(state, state_path=STATE_FILE_PATH): + state_path.parent.mkdir(parents=True, exist_ok=True) + state["updated_at"] = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + temp_path = state_path.with_suffix(".tmp") + with temp_path.open("w", encoding="utf-8") as fh: + json.dump(state, fh, indent=2, sort_keys=True) + temp_path.replace(state_path) + + +def parse_commit_date(commit_date_str): + return datetime.strptime(commit_date_str, "%Y-%m-%dT%H:%M:%SZ") + + +def get_commit_author_login(commit): + if commit.get("author") and commit["author"].get("login"): + return commit["author"]["login"] + return None + + +def get_commit_date(commit): + commit_author = commit.get("commit", {}).get("author", {}) + return commit_author.get("date") + + +def record_eligible_commit_metadata(commit, eligible_members, commit_metadata): + sha = commit.get("sha") + author_login = get_commit_author_login(commit) + commit_date = get_commit_date(commit) + + if not sha or not author_login or not commit_date: + return None + if author_login not in eligible_members: + return None + + commit_metadata[sha] = {"author": author_login, "date": commit_date} + return sha + def get_repos(org_name): repos = [] @@ -86,8 +178,8 @@ def get_members(org_name): return members -def get_repo_branches(repo_full_name): - branches = [] +def get_repo_branch_tips(repo_full_name): + branch_tips = {} page = 1 has_more_pages = True @@ -102,109 +194,235 @@ def get_repo_branches(repo_full_name): log_noncritical_api_error( response, f"branches for repo '{repo_full_name}'", - "default-branch-only commit counting", + "cached branch data for that repository", logger, ) - return [] + return None page_branches = response.json() has_more_pages = bool(page_branches) for branch in page_branches: branch_name = branch.get("name") - if branch_name: - branches.append(branch_name) + branch_sha = branch.get("commit", {}).get("sha") + if branch_name and branch_sha: + branch_tips[branch_name] = branch_sha if has_more_pages: page += 1 - return branches + return branch_tips -def get_commits_count(repo_full_name, eligible_members, include_all_branches=False): - commits_count_by_user = defaultdict( - lambda: {"total": 0, "last_month": 0, "last_6_months": 0} - ) - today = datetime.utcnow() - one_month_ago = today - timedelta(days=30) - six_months_ago = today - timedelta(days=180) +def get_branch_commits_full(repo_full_name, branch_name, eligible_members, commit_metadata): + branch_shas = set() + page = 1 + has_more_pages = True - branch_refs = [None] - if include_all_branches: - repo_branches = get_repo_branches(repo_full_name) - if repo_branches: - branch_refs = repo_branches - - seen_shas = set() - for branch_ref in branch_refs: - page = 1 - has_more_pages = True - - while has_more_pages: - commits_url = f"https://api.github.com/repos/{repo_full_name}/commits?per_page=100&page={page}" - if branch_ref: - commits_url += f"&sha={branch_ref}" - - response = requests.get(commits_url, headers=headers) - if response.status_code != 200: - if is_critical_api_error(response): - raise_api_error(response, f"commits for repo '{repo_full_name}'") - log_noncritical_api_error( + while has_more_pages: + response = requests.get( + f"https://api.github.com/repos/{repo_full_name}/commits" + f"?sha={branch_name}&per_page=100&page={page}", + headers=headers, + ) + if response.status_code != 200: + if is_critical_api_error(response): + raise_api_error( response, - f"commits for repo '{repo_full_name}'", - "no commits for that repository", - logger, + f"commits for repo '{repo_full_name}' branch '{branch_name}'", ) - break - - commits = response.json() - has_more_pages = bool(commits) + log_noncritical_api_error( + response, + f"commits for repo '{repo_full_name}' branch '{branch_name}'", + "cached branch commit data", + logger, + ) + return None - for commit in commits: - sha = commit.get("sha") - if sha in seen_shas: - continue - if sha: - seen_shas.add(sha) + commits = response.json() + has_more_pages = bool(commits) + for commit in commits: + sha = record_eligible_commit_metadata(commit, eligible_members, commit_metadata) + if sha: + branch_shas.add(sha) - author_login = ( - commit["author"]["login"] if commit["author"] else "unknown" - ) - commit_date_str = commit["commit"]["author"]["date"] - commit_date = datetime.strptime(commit_date_str, "%Y-%m-%dT%H:%M:%SZ") + if has_more_pages: + page += 1 - if author_login != "unknown" and author_login in eligible_members: - commits_count_by_user[author_login]["total"] += 1 - if commit_date >= one_month_ago: - commits_count_by_user[author_login]["last_month"] += 1 - if commit_date >= six_months_ago: - commits_count_by_user[author_login]["last_6_months"] += 1 + return branch_shas - if has_more_pages: - page += 1 - return commits_count_by_user +def get_compare_data(repo_full_name, base_sha, head_sha): + response = requests.get( + f"https://api.github.com/repos/{repo_full_name}/compare/{base_sha}...{head_sha}", + headers=headers, + ) + if response.status_code != 200: + if is_critical_api_error(response): + raise_api_error( + response, + f"compare for repo '{repo_full_name}' ({base_sha}...{head_sha})", + ) + log_noncritical_api_error( + response, + f"compare for repo '{repo_full_name}' ({base_sha}...{head_sha})", + "full branch resync", + logger, + ) + return None + return response.json() -def get_per_user_commits(include_all_branches=False): - members = get_members(ORG_NAME) - repos = get_repos(ORG_NAME) +def aggregate_user_counts(active_shas, commit_metadata, eligible_members): user_commits = defaultdict( lambda: {"total": 0, "last_month": 0, "last_6_months": 0} ) + today = datetime.utcnow() + one_month_ago = today - timedelta(days=30) + six_months_ago = today - timedelta(days=180) + + for sha in active_shas: + metadata = commit_metadata.get(sha) + if not metadata: + continue + author = metadata.get("author") + commit_date_str = metadata.get("date") + if not author or not commit_date_str: + continue + if author not in eligible_members: + continue + + commit_date = parse_commit_date(commit_date_str) + user_commits[author]["total"] += 1 + if commit_date >= one_month_ago: + user_commits[author]["last_month"] += 1 + if commit_date >= six_months_ago: + user_commits[author]["last_6_months"] += 1 + + return user_commits + + +def get_per_user_commits( + include_all_branches=False, + include_archived=True, + include_forks=True, + use_cache=True, +): + members = get_members(ORG_NAME) + repos = get_repos(ORG_NAME) + + state = load_state() if use_cache else initialize_state() + commit_metadata = dict(state.get("commit_metadata", {})) + + next_repos_state = {} + for repo in repos: + if not include_archived and repo.get("archived", False): + continue + if not include_forks and repo.get("fork", False): + continue + repo_full_name = repo["full_name"] - # print(f"Processing repository: {repo_full_name}") - commits_count_by_user = get_commits_count( - repo_full_name, - members, - include_all_branches=include_all_branches, - ) - for user, counts in commits_count_by_user.items(): - user_commits[user]["total"] += counts["total"] - user_commits[user]["last_month"] += counts["last_month"] - user_commits[user]["last_6_months"] += counts["last_6_months"] + cached_repo_state = state.get("repos", {}).get(repo_full_name, {}) + cached_branches = cached_repo_state.get("branches", {}) + + branch_tips = get_repo_branch_tips(repo_full_name) + if branch_tips is None: + if cached_repo_state: + next_repos_state[repo_full_name] = cached_repo_state + continue + + if include_all_branches: + target_branch_tips = branch_tips + else: + default_branch = repo.get("default_branch") + if default_branch and default_branch in branch_tips: + target_branch_tips = {default_branch: branch_tips[default_branch]} + else: + target_branch_tips = {} + + next_branch_state = {} + for branch_name, branch_tip_sha in target_branch_tips.items(): + cached_branch_state = cached_branches.get(branch_name, {}) + cached_tip_sha = cached_branch_state.get("tip_sha") + cached_shas = set(cached_branch_state.get("commit_shas", [])) + + # Default to cached values when branch tip has not changed. + branch_shas = cached_shas + branch_tip_to_store = branch_tip_sha + + if not cached_tip_sha or cached_tip_sha != branch_tip_sha: + branch_shas = None + + if cached_tip_sha: + compare_data = get_compare_data( + repo_full_name, + cached_tip_sha, + branch_tip_sha, + ) + if compare_data is not None: + compare_status = compare_data.get("status") + commits = compare_data.get("commits", []) + total_commits = compare_data.get("total_commits", 0) + compare_is_truncated = total_commits > len(commits) + + if compare_status in {"ahead", "identical"} and not compare_is_truncated: + branch_shas = set(cached_shas) + for commit in commits: + sha = record_eligible_commit_metadata( + commit, + members, + commit_metadata, + ) + if sha: + branch_shas.add(sha) + + if branch_shas is None: + full_scan_shas = get_branch_commits_full( + repo_full_name, + branch_name, + members, + commit_metadata, + ) + if full_scan_shas is None: + if cached_branch_state: + branch_shas = set(cached_shas) + branch_tip_to_store = cached_tip_sha + else: + branch_shas = set() + else: + branch_shas = full_scan_shas + + next_branch_state[branch_name] = { + "tip_sha": branch_tip_to_store, + "commit_shas": sorted(branch_shas), + } + + next_repos_state[repo_full_name] = { + "archived": repo.get("archived", False), + "fork": repo.get("fork", False), + "branches": next_branch_state, + } + + active_commit_shas = set() + for repo_state in next_repos_state.values(): + for branch_state in repo_state.get("branches", {}).values(): + active_commit_shas.update(branch_state.get("commit_shas", [])) + + # Keep metadata only for commits currently reachable by counted branches. + commit_metadata = { + sha: metadata + for sha, metadata in commit_metadata.items() + if sha in active_commit_shas + } + + user_commits = aggregate_user_counts(active_commit_shas, commit_metadata, members) + + if use_cache: + state["repos"] = next_repos_state + state["commit_metadata"] = commit_metadata + save_state(state) # Convert to a DataFrame data = [] @@ -245,7 +463,14 @@ def get_per_user_commits(include_all_branches=False): def main(): - print(get_per_user_commits(include_all_branches=True)) + print( + get_per_user_commits( + include_all_branches=True, + include_archived=True, + include_forks=True, + use_cache=True, + ) + ) if __name__ == "__main__": From 0f185e0d02c83516c209e949d79c2b2ea4d1e581 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 May 2026 21:00:31 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/make_readme/get_per_user_commits.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 884b915..6211d02 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -213,7 +213,9 @@ def get_repo_branch_tips(repo_full_name): return branch_tips -def get_branch_commits_full(repo_full_name, branch_name, eligible_members, commit_metadata): +def get_branch_commits_full( + repo_full_name, branch_name, eligible_members, commit_metadata +): branch_shas = set() page = 1 has_more_pages = True @@ -241,7 +243,9 @@ def get_branch_commits_full(repo_full_name, branch_name, eligible_members, commi commits = response.json() has_more_pages = bool(commits) for commit in commits: - sha = record_eligible_commit_metadata(commit, eligible_members, commit_metadata) + sha = record_eligible_commit_metadata( + commit, eligible_members, commit_metadata + ) if sha: branch_shas.add(sha) @@ -367,7 +371,10 @@ def get_per_user_commits( total_commits = compare_data.get("total_commits", 0) compare_is_truncated = total_commits > len(commits) - if compare_status in {"ahead", "identical"} and not compare_is_truncated: + if ( + compare_status in {"ahead", "identical"} + and not compare_is_truncated + ): branch_shas = set(cached_shas) for commit in commits: sha = record_eligible_commit_metadata( From ff9ecf4b233a58ab4e57d5a018a4a3f9d32f8bb4 Mon Sep 17 00:00:00 2001 From: "Vishal Koparde, PhD" Date: Thu, 28 May 2026 17:02:21 -0400 Subject: [PATCH 5/7] chore(metrics): log cache-hit and resync run summary Add per-run summary logging for contributor counting so CI output shows cache hits, compare deltas, and full resync paths. This makes API-pressure behavior observable after enabling all-branch counting. _commit message is ai-generated_ --- src/make_readme/get_per_user_commits.py | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 6211d02..09b2db9 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -312,10 +312,30 @@ def get_per_user_commits( include_archived=True, include_forks=True, use_cache=True, + log_run_summary=True, ): members = get_members(ORG_NAME) repos = get_repos(ORG_NAME) + run_stats = { + "repos_total": len(repos), + "repos_processed": 0, + "repos_skipped_archived": 0, + "repos_skipped_forks": 0, + "repos_branch_tip_failures": 0, + "repos_reused_cached_on_tip_failure": 0, + "branches_seen": 0, + "branch_cache_hits": 0, + "branch_compare_attempts": 0, + "branch_compare_delta_applied": 0, + "branch_compare_resync_required": 0, + "branch_full_resync_attempts": 0, + "branch_full_resync_success": 0, + "branch_full_resync_fallback_cached": 0, + "branch_full_resync_empty": 0, + "delta_commits_added": 0, + } + state = load_state() if use_cache else initialize_state() commit_metadata = dict(state.get("commit_metadata", {})) @@ -323,18 +343,24 @@ def get_per_user_commits( for repo in repos: if not include_archived and repo.get("archived", False): + run_stats["repos_skipped_archived"] += 1 continue if not include_forks and repo.get("fork", False): + run_stats["repos_skipped_forks"] += 1 continue + run_stats["repos_processed"] += 1 + repo_full_name = repo["full_name"] cached_repo_state = state.get("repos", {}).get(repo_full_name, {}) cached_branches = cached_repo_state.get("branches", {}) branch_tips = get_repo_branch_tips(repo_full_name) if branch_tips is None: + run_stats["repos_branch_tip_failures"] += 1 if cached_repo_state: next_repos_state[repo_full_name] = cached_repo_state + run_stats["repos_reused_cached_on_tip_failure"] += 1 continue if include_all_branches: @@ -348,6 +374,7 @@ def get_per_user_commits( next_branch_state = {} for branch_name, branch_tip_sha in target_branch_tips.items(): + run_stats["branches_seen"] += 1 cached_branch_state = cached_branches.get(branch_name, {}) cached_tip_sha = cached_branch_state.get("tip_sha") cached_shas = set(cached_branch_state.get("commit_shas", [])) @@ -360,6 +387,7 @@ def get_per_user_commits( branch_shas = None if cached_tip_sha: + run_stats["branch_compare_attempts"] += 1 compare_data = get_compare_data( repo_full_name, cached_tip_sha, @@ -376,6 +404,7 @@ def get_per_user_commits( and not compare_is_truncated ): branch_shas = set(cached_shas) + added_from_delta = 0 for commit in commits: sha = record_eligible_commit_metadata( commit, @@ -383,9 +412,18 @@ def get_per_user_commits( commit_metadata, ) if sha: + if sha not in branch_shas: + added_from_delta += 1 branch_shas.add(sha) + run_stats["branch_compare_delta_applied"] += 1 + run_stats["delta_commits_added"] += added_from_delta + else: + run_stats["branch_compare_resync_required"] += 1 + else: + run_stats["branch_compare_resync_required"] += 1 if branch_shas is None: + run_stats["branch_full_resync_attempts"] += 1 full_scan_shas = get_branch_commits_full( repo_full_name, branch_name, @@ -396,10 +434,15 @@ def get_per_user_commits( if cached_branch_state: branch_shas = set(cached_shas) branch_tip_to_store = cached_tip_sha + run_stats["branch_full_resync_fallback_cached"] += 1 else: branch_shas = set() + run_stats["branch_full_resync_empty"] += 1 else: branch_shas = full_scan_shas + run_stats["branch_full_resync_success"] += 1 + else: + run_stats["branch_cache_hits"] += 1 next_branch_state[branch_name] = { "tip_sha": branch_tip_to_store, @@ -431,6 +474,32 @@ def get_per_user_commits( state["commit_metadata"] = commit_metadata save_state(state) + if log_run_summary: + logger.info( + "per_user_commits summary: repos=%s processed=%s " + "skipped_archived=%s skipped_forks=%s tip_failures=%s tip_failures_with_cache=%s " + "branches=%s cache_hits=%s compare_attempts=%s compare_deltas=%s compare_resyncs=%s " + "full_resync_attempts=%s full_resync_success=%s full_resync_cached=%s full_resync_empty=%s " + "delta_commits_added=%s active_unique_shas=%s", + run_stats["repos_total"], + run_stats["repos_processed"], + run_stats["repos_skipped_archived"], + run_stats["repos_skipped_forks"], + run_stats["repos_branch_tip_failures"], + run_stats["repos_reused_cached_on_tip_failure"], + run_stats["branches_seen"], + run_stats["branch_cache_hits"], + run_stats["branch_compare_attempts"], + run_stats["branch_compare_delta_applied"], + run_stats["branch_compare_resync_required"], + run_stats["branch_full_resync_attempts"], + run_stats["branch_full_resync_success"], + run_stats["branch_full_resync_fallback_cached"], + run_stats["branch_full_resync_empty"], + run_stats["delta_commits_added"], + len(active_commit_shas), + ) + # Convert to a DataFrame data = [] for user, counts in user_commits.items(): From e5f06ed2f5726e6b084bfbb5c27932ac27d9d1af Mon Sep 17 00:00:00 2001 From: "Vishal Koparde, PhD" Date: Thu, 28 May 2026 17:47:08 -0400 Subject: [PATCH 6/7] fix(metrics): address Copilot review findings Move cache state outside published profile artifacts, store full commit metadata for membership-change correctness, harden cached date parsing, and use URL-safe query params for branch commit requests. _commit message is ai-generated_ --- src/make_readme/get_per_user_commits.py | 57 +++++++++++++++---------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 09b2db9..20b4331 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -1,6 +1,7 @@ import json import logging import os +import tempfile from collections import defaultdict from datetime import datetime, timedelta from pathlib import Path @@ -37,13 +38,24 @@ logger = logging.getLogger(__name__) -STATE_SCHEMA_VERSION = 1 -STATE_FILE_PATH = ( - Path(__file__).resolve().parents[2] - / "profile" - / "activity_data" - / "per_user_commits_state.json" -) +STATE_SCHEMA_VERSION = 2 + + +def get_default_state_file_path(): + state_path_override = os.getenv("PER_USER_COMMITS_STATE_PATH") + if state_path_override: + return Path(state_path_override).expanduser().resolve() + + cache_root = os.getenv("XDG_CACHE_HOME") + if cache_root: + cache_base = Path(cache_root).expanduser() + else: + cache_base = Path(tempfile.gettempdir()) / "ccbr_cache" + + return cache_base / "per_user_commits_state.json" + + +STATE_FILE_PATH = get_default_state_file_path() def initialize_state(): @@ -113,15 +125,13 @@ def get_commit_date(commit): return commit_author.get("date") -def record_eligible_commit_metadata(commit, eligible_members, commit_metadata): +def record_commit_metadata(commit, commit_metadata): sha = commit.get("sha") author_login = get_commit_author_login(commit) commit_date = get_commit_date(commit) if not sha or not author_login or not commit_date: return None - if author_login not in eligible_members: - return None commit_metadata[sha] = {"author": author_login, "date": commit_date} return sha @@ -214,7 +224,7 @@ def get_repo_branch_tips(repo_full_name): def get_branch_commits_full( - repo_full_name, branch_name, eligible_members, commit_metadata + repo_full_name, branch_name, commit_metadata ): branch_shas = set() page = 1 @@ -222,8 +232,8 @@ def get_branch_commits_full( while has_more_pages: response = requests.get( - f"https://api.github.com/repos/{repo_full_name}/commits" - f"?sha={branch_name}&per_page=100&page={page}", + f"https://api.github.com/repos/{repo_full_name}/commits", + params={"sha": branch_name, "per_page": 100, "page": page}, headers=headers, ) if response.status_code != 200: @@ -243,9 +253,7 @@ def get_branch_commits_full( commits = response.json() has_more_pages = bool(commits) for commit in commits: - sha = record_eligible_commit_metadata( - commit, eligible_members, commit_metadata - ) + sha = record_commit_metadata(commit, commit_metadata) if sha: branch_shas.add(sha) @@ -297,7 +305,15 @@ def aggregate_user_counts(active_shas, commit_metadata, eligible_members): if author not in eligible_members: continue - commit_date = parse_commit_date(commit_date_str) + try: + commit_date = parse_commit_date(commit_date_str) + except ValueError: + logger.warning( + "Skipping commit %s due to invalid cached date format: %s", + sha, + commit_date_str, + ) + continue user_commits[author]["total"] += 1 if commit_date >= one_month_ago: user_commits[author]["last_month"] += 1 @@ -406,11 +422,7 @@ def get_per_user_commits( branch_shas = set(cached_shas) added_from_delta = 0 for commit in commits: - sha = record_eligible_commit_metadata( - commit, - members, - commit_metadata, - ) + sha = record_commit_metadata(commit, commit_metadata) if sha: if sha not in branch_shas: added_from_delta += 1 @@ -427,7 +439,6 @@ def get_per_user_commits( full_scan_shas = get_branch_commits_full( repo_full_name, branch_name, - members, commit_metadata, ) if full_scan_shas is None: From ca3a0282112b3835c76162665fd81878c804e04f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 May 2026 22:07:44 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/make_readme/get_per_user_commits.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/make_readme/get_per_user_commits.py b/src/make_readme/get_per_user_commits.py index 20b4331..d0f3371 100644 --- a/src/make_readme/get_per_user_commits.py +++ b/src/make_readme/get_per_user_commits.py @@ -223,9 +223,7 @@ def get_repo_branch_tips(repo_full_name): return branch_tips -def get_branch_commits_full( - repo_full_name, branch_name, commit_metadata -): +def get_branch_commits_full(repo_full_name, branch_name, commit_metadata): branch_shas = set() page = 1 has_more_pages = True