diff --git a/src/cli.ts b/src/cli.ts index 92e028f..f4864ea 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -113,17 +113,28 @@ async function validateGraphqlSearchToken(token: string): Promise { Authorization: `Bearer ${token}` }, body: JSON.stringify({ - query: `query OssrankTokenCheck($query: String!) { + query: `query OssrankTokenCheck($query: String!, $login: String!, $from: DateTime!, $to: DateTime!) { search(type: USER, query: $query, first: 1) { userCount } + user(login: $login) { + contributionsCollection(from: $from, to: $to) { + totalCommitContributions + totalPullRequestContributions + } + } rateLimit { remaining cost } }`, - variables: { query: 'type:user repos:>0' } + variables: { + query: 'type:user repos:>0', + login: 'rogerchappel', + from: new Date(Date.now() - 365 * 24 * 60 * 60 * 1000).toISOString(), + to: new Date().toISOString() + } }) }); const body = await response.json() as { errors?: Array<{ message: string }>; data?: unknown }; if (!response.ok || body.errors?.length) { const message = body.errors?.map((error) => error.message).join('; ') ?? response.statusText; - throw new Error(`GitHub token cannot run OSSRank GraphQL search: ${message}`); + throw new Error(`GitHub token cannot run OSSRank GraphQL search/contribution queries: ${message}`); } } @@ -144,7 +155,7 @@ async function main(): Promise { if (options.requireGraphqlSearch) { await validateGraphqlSearchToken(token); } - await emit({ ok: true, provider: 'github', mode: options.requireGraphqlSearch ? 'graphql-search-access' : 'token-present' }, options); + await emit({ ok: true, provider: 'github', mode: options.requireGraphqlSearch ? 'graphql-search-and-contribution-access' : 'token-present' }, options); return; } diff --git a/src/lib/github.ts b/src/lib/github.ts index c3776f9..b6712ea 100644 --- a/src/lib/github.ts +++ b/src/lib/github.ts @@ -65,6 +65,25 @@ interface GitHubUserSearchResponse { rateLimit?: { remaining?: number; cost?: number }; } +interface GitHubUserActivityBatchResponse { + [alias: string]: unknown; + rateLimit?: { remaining?: number; cost?: number }; +} + +interface GitHubUserActivityNode { + login: string; + name: string | null; + url: string; + repositories: { totalCount: number }; + gists: { totalCount: number }; + followers: { totalCount: number }; + location: string | null; + contributionsCollection: { + totalCommitContributions: number; + totalPullRequestContributions: number; + }; +} + interface CandidateQueryStat { query: string; total: number; @@ -366,63 +385,72 @@ async function fetchWithRetry(url: string, init: RequestInit, attempts = 3): Pro } // --------------------------------------------------------------------------- -// REST user profile + public activity +// Batched GraphQL user profile + contribution activity // --------------------------------------------------------------------------- -function isUnsearchableUserError(error: unknown): boolean { - if (!(error instanceof Error)) return false; - return error.message.includes('GitHub 422 for /search/') - && error.message.includes('The listed users cannot be searched'); +function chunks(items: T[], size: number): T[][] { + const result: T[][] = []; + for (let index = 0; index < items.length; index += size) result.push(items.slice(index, index + size)); + return result; } -async function safeSearchTotal(client: GitHubClient, path: string, login: string, label: 'commits' | 'pull requests'): Promise { - try { - return (await client.search(path)).total_count; - } catch (error) { - if (isUnsearchableUserError(error)) { - process.stderr.write(`[warn] Skipping ${label} activity for ${login}: GitHub search says the user cannot be searched.\n`); - return 0; +function toUserCandidate(node: GitHubUserActivityNode): { user: GitHubUserDetail; activity: { commits: number; pullRequests: number } } { + return { + user: { + login: node.login, + name: node.name, + html_url: node.url, + public_repos: node.repositories.totalCount, + public_gists: node.gists.totalCount, + followers: node.followers.totalCount, + location: node.location + }, + activity: { + commits: node.contributionsCollection.totalCommitContributions, + pullRequests: node.contributionsCollection.totalPullRequestContributions } - throw error; - } + }; } -async function publicActivityViaSearch( +async function userProfilesWithActivityBatch( client: GitHubClient, - login: string, + logins: string[], generatedAt: string -): Promise<{ commits: number; pullRequests: number }> { - const fromDate = yyyyMmDd(daysAgoIso(generatedAt, 365)); - const toDate = yyyyMmDd(generatedAt); - const commitQuery = `author:${login} committer-date:${fromDate}..${toDate}`; - const pullRequestQuery = `type:pr author:${login} created:${fromDate}..${toDate}`; - - const commits = await safeSearchTotal(client, `/search/commits?${encodeQuery(commitQuery, 1, 'committer-date')}`, login, 'commits'); - const pullRequests = await safeSearchTotal(client, `/search/issues?${encodeQuery(pullRequestQuery, 1, 'created')}`, login, 'pull requests'); +): Promise> { + const from = daysAgoIso(generatedAt, 365); + const to = generatedAt; + const results: Array<{ user: GitHubUserDetail; activity: { commits: number; pullRequests: number } }> = []; + + for (const batch of chunks(logins, 5)) { + const variableDefinitions = batch.map((_, index) => `$login${index}: String!`).join(', '); + const aliases = batch.map((_, index) => ` + u${index}: user(login: $login${index}) { + login + name + url + repositories { totalCount } + gists { totalCount } + followers { totalCount } + location + contributionsCollection(from: $from, to: $to) { + totalCommitContributions + totalPullRequestContributions + } + }`).join(''); + const variables: Record = { from, to }; + batch.forEach((login, index) => { variables[`login${index}`] = login; }); - return { commits, pullRequests }; -} + const data = await client.graphql(`query OssrankUserActivityBatch($from: DateTime!, $to: DateTime!, ${variableDefinitions}) {${aliases} + rateLimit { remaining cost } + }`, variables); -async function userProfileWithActivity( - client: GitHubClient, - login: string, - generatedAt: string -): Promise<{ user: GitHubUserDetail; activity: { commits: number; pullRequests: number } } | null> { - const profile = await client.get(`/users/${login}`); - if (!profile?.login) return null; + for (let index = 0; index < batch.length; index += 1) { + const node = data[`u${index}`] as GitHubUserActivityNode | null | undefined; + if (node?.login) results.push(toUserCandidate(node)); + } + } - return { - user: { - login: profile.login, - name: profile.name, - html_url: profile.html_url, - public_repos: profile.public_repos, - public_gists: profile.public_gists, - followers: profile.followers, - location: profile.location - }, - activity: await publicActivityViaSearch(client, profile.login, generatedAt) - }; + return results; } // --------------------------------------------------------------------------- @@ -477,12 +505,11 @@ async function searchUsers(client: GitHubClient, query: string, limit: number): } // --------------------------------------------------------------------------- -// Collect users — now uses REST for profiles (separate rate limit bucket) +// Collect users — batched GraphQL profiles and contribution activity // --------------------------------------------------------------------------- -async function collectUsers(client: GitHubClient, queries: string | string[], limit: number, generatedAt: string, throttler: SmartThrottler, locationTerms?: string[], countryName?: string, candidateLimit = Math.max(50, limit * 5)): Promise<{ total: number; users: RankedContributor[]; queryStats: CandidateQueryStat[] }> { +async function collectUsers(client: GitHubClient, queries: string | string[], limit: number, generatedAt: string, throttler: SmartThrottler, locationTerms?: string[], countryName?: string, candidateLimit = Math.max(25, limit * 3)): Promise<{ total: number; users: RankedContributor[]; queryStats: CandidateQueryStat[] }> { const searchQueries = Array.isArray(queries) ? queries : [queries]; - const concurrency = throttler.getConcurrency(); const details = new Map(); const queryStats: CandidateQueryStat[] = []; let total = 0; @@ -493,9 +520,9 @@ async function collectUsers(client: GitHubClient, queries: string | string[], li total += search.total; const before = details.size; const unseen = search.items.filter((item) => item.type !== 'Organization' && !details.has(item.login.toLowerCase())); - const fetched = await mapLimit(unseen, concurrency, async (item) => userProfileWithActivity(client, item.login, generatedAt)); + const fetched = await userProfilesWithActivityBatch(client, unseen.map((item) => item.login), generatedAt); for (const detail of fetched) { - if (detail && (!locationTerms || matchesLocation(detail.user.location, locationTerms))) { + if (!locationTerms || matchesLocation(detail.user.location, locationTerms)) { details.set(detail.user.login.toLowerCase(), { ...detail, discoveredByQuery: userQuery(query) }); } } @@ -648,11 +675,11 @@ async function saveCountrySnapshot( ): Promise { const { config, total, users, queryStats } = result; const snapshot: RankingSnapshot = { - ...snapshotBase('country', config.slug, config.name, `Top observed GitHub contributors in ${config.name}`, generatedAt, 'fresh', 'github-rest-search-one-year-public-activity'), + ...snapshotBase('country', config.slug, config.name, `Top observed GitHub contributors in ${config.name}`, generatedAt, 'fresh', 'github-graphql-one-year-contribution-activity'), code: config.code, candidate_count: total, caveats: [ - 'Live data uses GitHub REST search plus public profile fields; it is an observed sample, not a complete census.', + 'Live data uses GitHub GraphQL contribution and public profile fields; it is an observed sample, not a complete census.', 'Location matching uses free-text GitHub profile locations and must not be treated as verified nationality or residence.' ], discovery_queries: config.queries.map(userQuery), @@ -747,9 +774,9 @@ export async function collectLiveSnapshots(options: GitHubCollectorOptions): Pro const openclaw = await collectRepos(client, ['openclaw archived:false', 'topic:openclaw archived:false', 'openclaw agent archived:false'], limit, generatedAt, throttler, Math.max(40, limit * 3)); const contributorCaveats = [ - 'Live data uses GitHub REST search plus public profile fields; it is an observed sample, not a complete census.', + 'Live data uses GitHub GraphQL contribution and public profile fields; it is an observed sample, not a complete census.', 'Location matching uses free-text GitHub profile locations and must not be treated as verified nationality or residence.', - 'Contributor pages expose public repository counts plus one-year public commit and pull request activity from GitHub REST search. These are not all-time totals and may differ from private/authenticated GitHub profile contribution graphs.', + 'Contributor pages expose public repository counts plus one-year GitHub contribution activity from GraphQL. These are not all-time totals and may differ from private contribution graphs.', 'The OSSRank score is retained only as a combined proxy; raw commits, pull requests, and repository tables are preferred for review and SEO pages.' ]; const projectCaveats = [ @@ -769,13 +796,13 @@ export async function collectLiveSnapshots(options: GitHubCollectorOptions): Pro const globalEntries = rankContributors([...new Map(globalContributorPool.map((user) => [user.login.toLowerCase(), user])).values()]).slice(0, limit); const derivedGlobalStat = { query: 'derived from current country, language, and global contributor snapshots', total: globalContributorPool.length, accepted: Math.max(0, globalEntries.length - global.users.length) }; const globalContributors: RankingSnapshot = { - ...snapshotBase('global', 'contributors', 'Global', 'Top observed GitHub contributors globally', generatedAt, 'fresh', 'github-rest-search-one-year-public-activity'), + ...snapshotBase('global', 'contributors', 'Global', 'Top observed GitHub contributors globally', generatedAt, 'fresh', 'github-graphql-one-year-contribution-activity'), candidate_count: global.total, caveats: contributorCaveats, discovery_queries: ['followers:>1000 repos:>20 type:user', 'repos:>100 followers:>500 type:user', 'derived from current country and language contributor snapshots'], candidate_count_by_query: [...global.queryStats, derivedGlobalStat], history: { weeks: [generatedAt.slice(0, 10)], ranked_items: [globalEntries.length], top_10_signal: [globalEntries.slice(0, 10).reduce((sum, user) => sum + user.public_contributions, 0)] }, entries: globalEntries }; const refreshedCountries: Array> = countryResults.map(({ config, total, users }) => ({ - ...snapshotBase('country', config.slug, config.name, `Top observed GitHub contributors in ${config.name}`, generatedAt, 'fresh', 'github-rest-search-one-year-public-activity'), + ...snapshotBase('country', config.slug, config.name, `Top observed GitHub contributors in ${config.name}`, generatedAt, 'fresh', 'github-graphql-one-year-contribution-activity'), code: config.code, candidate_count: total, caveats: contributorCaveats, discovery_queries: config.queries.map(userQuery), candidate_count_by_query: (countryResults.find((r) => r.config.slug === config.slug)?.queryStats ?? []), history: { weeks: [generatedAt.slice(0, 10)], ranked_items: [users.length], top_10_signal: [users.slice(0, 10).reduce((sum, user) => sum + user.public_contributions, 0)] }, entries: users @@ -840,7 +867,7 @@ export async function collectLiveSnapshots(options: GitHubCollectorOptions): Pro const uniqueUsers = [...new Map(allUsers.map((user) => [user.login.toLowerCase(), user])).values()]; const risingUsers = rankRisingContributors(uniqueUsers).slice(0, limit); const rising: RankingSnapshot = { - ...snapshotBase('rising', 'contributors', 'Rising Contributors', 'High-signal observed GitHub contributors with strong activity relative to audience size', generatedAt, 'fresh', 'derived-github-rest-search-one-year-public-activity'), + ...snapshotBase('rising', 'contributors', 'Rising Contributors', 'High-signal observed GitHub contributors with strong activity relative to audience size', generatedAt, 'fresh', 'derived-github-graphql-one-year-contribution-activity'), candidate_count: uniqueUsers.length, caveats: contributorCaveats, discovery_queries: ['derived from current contributor snapshots'], candidate_count_by_query: [], history: { weeks: [generatedAt.slice(0, 10)], ranked_items: [risingUsers.length], top_10_signal: [risingUsers.slice(0, 10).reduce((sum, user) => sum + user.public_contributions, 0)] }, entries: risingUsers