From 043895f061083bf23cf430a5f554bb4196d6ff4b Mon Sep 17 00:00:00 2001 From: Seungpyo1007 Date: Tue, 23 Jun 2026 19:47:09 +0900 Subject: [PATCH] fix(site): show full dataset history, not just the last 7 commits The homepage History section called the GitHub commits API live (per_page=8, sliced to 7) and re-fetched the dump at each SHA, so older syncs were invisible and the unauthenticated rate limit (60/h) made it flaky. Precompute the whole timeline at deploy time instead: build-history.mjs walks the full git history of site/public/v1/index.json and writes site/public/v1/history.json (a build-only, gitignored artifact). The page reads that one static file and only falls back to the GitHub API for local `astro dev`. deploy-pages now checks out at fetch-depth: 0 so the generator can see old commits. Refs #1 --- .github/workflows/deploy-pages.yml | 4 + .gitignore | 5 +- site/package.json | 2 + site/scripts/build-history.mjs | 113 +++++++++++++++++++++++++++++ site/src/pages/index.astro | 2 +- site/src/scripts/techapi.js | 43 +++++++++-- 6 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 site/scripts/build-history.mjs diff --git a/.github/workflows/deploy-pages.yml b/.github/workflows/deploy-pages.yml index 383f1828b52..a694a91dd6d 100644 --- a/.github/workflows/deploy-pages.yml +++ b/.github/workflows/deploy-pages.yml @@ -24,6 +24,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + # build-history.mjs walks the full git history of the public dump to + # rebuild the homepage growth timeline; a shallow clone hides old syncs. + fetch-depth: 0 - uses: actions/setup-node@v4 with: diff --git a/.gitignore b/.gitignore index 6b890c3926a..63b0dd0c73c 100644 --- a/.gitignore +++ b/.gitignore @@ -31,9 +31,8 @@ env/ # Note: data/_staging/ (raw collected candidate pool) is intentionally tracked — # comprehensive data collection is a purpose of this repo. 
-# Verification layer caches: full Tier 0 scores + network caches are cheap to -# recompute. Only data/_verify/ledger.jsonl (the promotion audit trail) is tracked. -data/_verify/state/ +# Build-only: regenerated from full git history on every Pages deploy (site/scripts/build-history.mjs) +site/public/v1/history.json # Testing / coverage .pytest_cache/ diff --git a/site/package.json b/site/package.json index e05ae60ee5d..45c7e9e020b 100644 --- a/site/package.json +++ b/site/package.json @@ -5,6 +5,8 @@ "private": true, "scripts": { "dev": "astro dev", + "build:history": "node scripts/build-history.mjs", + "prebuild": "node scripts/build-history.mjs", "build": "astro build", "preview": "astro preview" }, diff --git a/site/scripts/build-history.mjs b/site/scripts/build-history.mjs new file mode 100644 index 00000000000..bc242f783d2 --- /dev/null +++ b/site/scripts/build-history.mjs @@ -0,0 +1,113 @@ +// Precompute the dataset growth timeline from git history at build time. +// +// The homepage History section used to call the GitHub API live and could only +// show the last ~7 commits (per_page=8 + slice(7)), so older syncs were invisible +// and the unauthenticated API rate limit (60/h) made it fragile. Instead we walk +// the full git history of `site/public/v1/index.json` once during the deploy build +// and emit `site/public/v1/history.json` — a single static file the page reads. +// +// Build-only artifact (gitignored): regenerated on every Pages deploy, so it is +// always complete and never churns data PRs. Requires a full-depth checkout +// (fetch-depth: 0) to see old commits; a shallow clone yields a truncated timeline, and a git failure an empty one. 
+ +import { execFileSync } from "node:child_process"; +import { mkdirSync, writeFileSync } from "node:fs"; +import { dirname, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; + +const SITE_DIR = resolve(dirname(fileURLToPath(import.meta.url)), ".."); +const REPO_ROOT = resolve(SITE_DIR, ".."); +const TRACKED = "site/public/v1/index.json"; +const OUT = resolve(SITE_DIR, "public/v1/history.json"); +const REPO_URL = "https://github.com/GetTechAPI/TechAPI"; + +// Categories we sum into the record total (matches the public dump manifest). +const ORDER = ["smartphones", "tablets", "watches", "pdas", "socs", "gpus", "cpus", "brands"]; +// Keep the chart legible if the history ever grows large: downsample to at most +// MAX points, always preserving the first (baseline) and last (latest) commits. +const MAX = 40; + +function git(args) { + return execFileSync("git", ["-C", REPO_ROOT, ...args], { + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024, + }); +} + +function countsOf(manifest) { + const counts = {}; + let total = 0; + for (const key of ORDER) { + const n = manifest?.collections?.[key]?.count; + if (typeof n === "number") { + counts[key] = n; + total += n; + } + } + return { counts, total }; +} + +function downsample(points) { + if (points.length <= MAX) return points; + const step = (points.length - 1) / (MAX - 1); + const picked = []; + const seen = new Set(); + for (let i = 0; i < MAX; i++) { + const idx = Math.round(i * step); + if (!seen.has(idx)) { + seen.add(idx); + picked.push(points[idx]); + } + } + return picked; +} + +function buildPoints() { + // %H sha, %cI committer ISO date, %s subject — 0x1f-separated, one line/commit. 
+ const raw = git(["log", "--format=%H%x1f%cI%x1f%s", "--", TRACKED]).trim(); + if (!raw) return []; + const commits = raw.split("\n").map((line) => { + const [sha, date, ...rest] = line.split("\x1f"); + return { sha, date, title: rest.join("\x1f") }; + }); + // git log is newest-first; the timeline reads oldest-first. + commits.reverse(); + + const points = []; + for (const c of commits) { + let manifest; + try { + manifest = JSON.parse(git(["show", `${c.sha}:${TRACKED}`])); + } catch { + continue; // file absent/unparseable at this commit — skip it + } + const { counts, total } = countsOf(manifest); + if (!total) continue; + points.push({ + sha: c.sha.slice(0, 7), + date: c.date, + title: (c.title || "Dataset sync").trim(), + url: `${REPO_URL}/commit/${c.sha}`, + total, + counts, + }); + } + return downsample(points); +} + +function main() { + let points = []; + try { + points = buildPoints(); + } catch (err) { + console.warn(`[build-history] git history unavailable: ${err.message}`); + } + mkdirSync(dirname(OUT), { recursive: true }); + writeFileSync( + OUT, + JSON.stringify({ generated_at: new Date().toISOString(), schema: 1, points }, null, 2), + ); + console.log(`[build-history] wrote ${points.length} point(s) -> ${OUT}`); +} + +main(); diff --git a/site/src/pages/index.astro b/site/src/pages/index.astro index 55dafee5f25..e926020086f 100644 --- a/site/src/pages/index.astro +++ b/site/src/pages/index.astro @@ -116,7 +116,7 @@ const endpoints = [
00 - History

Dataset growth over time

-

Recent dump commits are replayed into a small growth chart, showing how many records each sync added.

+

Every dump commit is replayed into a growth chart — from the first snapshot to the latest sync — showing how many records each one added.

v1/index.json diff --git a/site/src/scripts/techapi.js b/site/src/scripts/techapi.js index 8a39affc6e9..681250c792d 100644 --- a/site/src/scripts/techapi.js +++ b/site/src/scripts/techapi.js @@ -288,12 +288,41 @@ function countUp(node, target) { }).join(""); } - async function loadCommitHistory(currentManifest) { - const commitsUrl = `https://api.github.com/repos/GetTechAPI/TechAPI/commits?path=${encodeURIComponent(dumpPath)}&per_page=8`; + const fmtWhen = (date) => date + ? date.toLocaleDateString(undefined, { month: "short", day: "numeric", year: "numeric" }) + : "recent"; + const rowsFromCounts = (counts) => order + .map((key) => ({ key, count: counts?.[key] })) + .filter((row) => row.count != null); + + // Preferred path: a single prebuilt manifest (build-history.mjs) holding the + // FULL dump timeline. No GitHub API, no per-commit fetches, no rate limit, and + // old commits stay visible (the live API path only ever showed the last 7). + async function pointsFromStaticHistory() { + const data = await getJSON("v1/history.json"); + const points = (data.points || []).map((p) => { + const date = p.date ? new Date(p.date) : null; + return { + sha: String(p.sha || "").slice(0, 7), + dateValue: date ? date.getTime() : 0, + when: fmtWhen(date), + title: String(p.title || "Dataset sync").split("\n")[0], + url: p.url || "https://github.com/GetTechAPI/TechAPI", + rows: rowsFromCounts(p.counts), + total: p.total != null ? p.total : rowsFromCounts(p.counts).reduce((s, r) => s + r.count, 0), + }; + }).filter((p) => p.total > 0); + return points.sort((a, b) => a.dateValue - b.dateValue); + } + + // Fallback (e.g. local `astro dev` with no prebuilt history.json): the old live + // GitHub API replay, capped at the most recent commits. 
+ async function pointsFromGitHubApi() { + const commitsUrl = `https://api.github.com/repos/GetTechAPI/TechAPI/commits?path=${encodeURIComponent(dumpPath)}&per_page=10`; const response = await fetch(commitsUrl); if (!response.ok) throw new Error(response.statusText); const commits = await response.json(); - const items = Array.isArray(commits) ? commits.slice(0, 7) : []; + const items = Array.isArray(commits) ? commits.slice(0, 10) : []; const snapshots = await Promise.all(items.map(async (item) => { const sha = String(item.sha || ""); const rawUrl = `https://raw.githubusercontent.com/GetTechAPI/TechAPI/${sha}/${dumpPath}`; @@ -304,15 +333,19 @@ function countUp(node, target) { return { sha: sha.slice(0, 7), dateValue: date ? date.getTime() : 0, - when: date ? date.toLocaleDateString(undefined, { month: "short", day: "numeric", year: "numeric" }) : "recent", + when: fmtWhen(date), title: (item.commit?.message || "Dataset sync").split("\n")[0], url: item.html_url || "https://github.com/GetTechAPI/TechAPI", rows: countRows(manifest), total: totalRecords(manifest), }; })); + return snapshots.filter(Boolean).sort((a, b) => a.dateValue - b.dateValue); + } - const points = snapshots.filter(Boolean).sort((a, b) => a.dateValue - b.dateValue); + async function loadCommitHistory(currentManifest) { + let points = await pointsFromStaticHistory().catch(() => null); + if (!points || !points.length) points = await pointsFromGitHubApi(); if (!points.length) throw new Error("empty history"); const currentTotal = totalRecords(currentManifest);