# Captured from the GitHub Actions web UI: workflow file for run "weekly-ingest #8".

name: weekly-ingest
# Weekly: scrape upstream catalogs, draft any missing SKUs into a TechAPI
# worktree, and open a PR for curator review.
on:
  # Scheduled runs carry no inputs; the job-level env supplies the fallbacks
  # (category=cpu, limit=50, include_drafts=false).
  schedule:
    - cron: "29 6 * * 1"  # Mondays 06:29 UTC (weekly to keep curator review load sane)
  # Manual runs can target another category, change the per-source cap, or
  # opt into writing incomplete records.
  workflow_dispatch:
    inputs:
      category:
        description: "Category to ingest"
        type: choice
        options: [cpu, gpu, smartphone]
        default: cpu
      limit:
        description: "Max candidates per source"
        # Kept as a string; it is forwarded as the value of --limit.
        type: string
        default: "50"
      include_drafts:
        description: "Write incomplete records too (PR marked as draft)"
        type: boolean
        default: false
# Default GITHUB_TOKEN stays read-only. Pushing the ingest branch and opening
# the PR on TechAPI rely on the separate TECHAPI_TOKEN secret instead.
permissions:
  contents: read
jobs:
  # Single job: check out this repo and TechAPI side by side, run the ingest
  # (plus the CPU-only benchmark enrichment), publish the summaries, and open
  # a PR against TechAPI when the data tree changed.
  ingest:
    runs-on: ubuntu-latest
    env:
      # Scheduled runs have no inputs, so each value falls back to its default.
      CATEGORY: ${{ inputs.category || 'cpu' }}
      LIMIT: ${{ inputs.limit || '50' }}
      INCLUDE_DRAFTS: ${{ inputs.include_drafts || 'false' }}
      # NOTE(review): job-level scope exposes the PAT to every step, including
      # `pip install` and the scrapers. Narrow it to the steps that need it
      # once it is confirmed that app.ingest does not read TECHAPI_TOKEN.
      TECHAPI_TOKEN: ${{ secrets.TECHAPI_TOKEN }}
    steps:
      - uses: actions/checkout@v4
      # Use the PAT when present so we can push to TechAPI later;
      # fall back to the default token for read-only test runs.
      - uses: actions/checkout@v4
        with:
          repository: GetTechAPI/TechAPI
          path: TechAPI
          token: ${{ secrets.TECHAPI_TOKEN || secrets.GITHUB_TOKEN }}
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip
      - name: Install
        run: pip install -e .
      - name: Run ingest
        env:
          TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data
        run: |
          # Optional flags live in an array: empty expands to nothing, and a
          # set flag is passed as exactly one argument (no word splitting).
          extra_args=()
          if [ "$INCLUDE_DRAFTS" = "true" ]; then
            extra_args+=(--include-drafts)
          fi
          python -m app.ingest \
            --category "$CATEGORY" \
            --limit "$LIMIT" \
            --data-root TechAPI/data \
            --summary ingest-summary.md \
            "${extra_args[@]}"
      # Variant-safe benchmark backfill on existing CPU records (PassMark).
      # CPU-only; never overwrites, only fills nulls on exact heading matches.
      # Non-fatal: a scrape hiccup must not sink the weekly ingest PR.
      - name: Enrich benchmarks (PassMark, cpu only)
        if: env.CATEGORY == 'cpu'
        continue-on-error: true
        env:
          TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data
        run: |
          python -m app.ingest.enrich \
            --data-root TechAPI/data \
            --limit "$LIMIT" \
            --min-year 2008 \
            --sleep 0.5 \
            --summary enrich-summary.md
      - name: Combine summaries for PR body
        run: |
          # The ingest summary is always the head of the PR body; the enrich
          # summary is appended only when that step ran and wrote its file.
          cp ingest-summary.md pr-body.md
          if [ -f enrich-summary.md ]; then
            printf '\n\n---\n\n' >> pr-body.md
            cat enrich-summary.md >> pr-body.md
          fi
      - name: Upload summary artifact
        uses: actions/upload-artifact@v4
        with:
          name: ingest-summary
          path: |
            ingest-summary.md
            enrich-summary.md
            pr-body.md
      - name: Check whether ingest produced any additions
        id: changes
        run: |
          cd TechAPI
          # Look only at data/, because that is all the PR step stages. A
          # stray change elsewhere in the checkout would otherwise report
          # has_changes=true and then fail `git commit` with nothing staged.
          if [ -n "$(git status --porcelain -- data)" ]; then
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          else
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Open PR against TechAPI
        if: steps.changes.outputs.has_changes == 'true'
        env:
          GH_TOKEN: ${{ secrets.TECHAPI_TOKEN }}
        run: |
          set -euo pipefail
          # No PAT means a read-only run: warn and exit successfully instead
          # of failing the job.
          if [ -z "${GH_TOKEN:-}" ]; then
            echo "::warning::Ingest produced additions but TECHAPI_TOKEN is unset; skipping PR. Summary attached as artifact."
            exit 0
          fi
          cd TechAPI
          # Timestamped branch name so reruns never collide with an earlier
          # ingest branch.
          BRANCH="ingest/${CATEGORY}-$(date -u +%Y%m%d-%H%M%S)"
          # Commit subject and PR title are intentionally the same string.
          TITLE="feat(data/${CATEGORY}): weekly ingest"
          git config user.name "TechEngineBot"
          git config user.email "289859915+TechEngineBot@users.noreply.github.com"
          git checkout -b "$BRANCH"
          git add data/
          git commit -m "$TITLE"
          git push -u origin "$BRANCH"
          # Runs that include incomplete records open the PR as a draft.
          pr_args=()
          if [ "$INCLUDE_DRAFTS" = "true" ]; then
            pr_args+=(--draft)
          fi
          gh pr create \
            --title "$TITLE" \
            --body-file ../pr-body.md \
            --base main \
            --head "$BRANCH" \
            "${pr_args[@]}"