diff --git a/.github/workflows/scrape-pack.yml b/.github/workflows/scrape-pack.yml new file mode 100644 index 0000000..00bf1c2 --- /dev/null +++ b/.github/workflows/scrape-pack.yml @@ -0,0 +1,281 @@ +# scrape-pack — batch scrape the registry on GH-hosted runners, consolidate +# into a single deadzone.db, optionally publish via `deadzone dbrelease`. +# +# Decisions live in docs/research/batch-scrape-actions.md and #126: +# - workflow_dispatch only (no cron until #47) +# - per-lib artifact cache IS the freshness shim across runs +# - inter-JOB transport in a SINGLE run uses Pattern B (upload-artifact +# staging) because Pattern C (REST cache API) is not implementable — +# the cache archive download URL is a runtime-signed Azure Blob link +# that only @actions/cache's Node runtime token can negotiate, not +# reachable from `gh api`. The cross-run freshness story still runs +# entirely through the cache layer; upload-artifact is scratch +# transport scoped to one dispatch. +# - dbrelease fires only when inputs.tag is non-empty; empty tag stops +# at the consolidated-db cache (no side effects on the releases page) + +name: scrape-pack + +on: + workflow_dispatch: + inputs: + lib: + description: "Filter base lib_id (e.g. /hashicorp/terraform). Empty = every resolved lib." + required: false + type: string + default: "" + tag: + description: "Release tag (e.g. v0.1.0). Non-empty chains `deadzone dbrelease`. Empty stops at consolidated-db cache." + required: false + type: string + default: "" + +permissions: + contents: write # dbrelease writes GH Release assets when inputs.tag != "" + +concurrency: + # Queue serially: parallel dispatches would fight over the same cache + # keys, and dbrelease --clobbers the same asset names, so we let one + # finish before the next starts. + group: scrape-pack + cancel-in-progress: false + +env: + # Mirror ci.yml verbatim — same pinned tokenizer static archive, same + # model + ORT cache roots. 
Bumping TOKENIZERS_VERSION in ci.yml + # invalidates the composite action's cache key via its own hashFiles + # rule; this workflow piggybacks on that invalidation. + TOKENIZERS_VERSION: v1.26.0 + DEADZONE_HUGOT_CACHE: ${{ github.workspace }}/.deadzone-cache/models + DEADZONE_ORT_CACHE: ${{ github.workspace }}/.deadzone-cache/ort + CGO_ENABLED: "1" + CGO_LDFLAGS: -L/tmp/deadzone-deps/tokenizers + +jobs: + expand-libs: + name: expand-libs + runs-on: ubuntu-latest + outputs: + libs: ${{ steps.list.outputs.libs }} + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version-file: go.mod + - name: Install native deps + uses: ./.github/actions/install-native-deps + - name: Emit resolved libs as JSON + id: list + shell: bash + run: | + set -euo pipefail + args=(scrape -list -config libraries_sources.yaml) + if [ -n "${{ inputs.lib }}" ]; then + args+=(-lib "${{ inputs.lib }}") + fi + libs="$(go run -tags ORT ./cmd/deadzone "${args[@]}")" + echo "resolved: $libs" + echo "libs=$libs" >> "$GITHUB_OUTPUT" + + scrape: + name: scrape (${{ matrix.entry.slug }}) + needs: expand-libs + runs-on: ubuntu-latest + strategy: + # Siblings must keep running when one lib breaks — continue-on-error + # semantics at the matrix level (#93 mirrors this inside a single + # scrape invocation). The consolidate summary surfaces per-slot + # failures so a broken slot is visible, not silently dropped. + fail-fast: false + max-parallel: 20 + matrix: + entry: ${{ fromJSON(needs.expand-libs.outputs.libs) }} + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version-file: go.mod + - name: Install native deps + uses: ./.github/actions/install-native-deps + - name: Cache embedding model + # Key verbatim from .github/workflows/ci.yml — bumping the embedder + # (internal/embed/hugot.go) invalidates both caches in lockstep, + # which is exactly the freshness invariant for the artifact cache + # below. 
+ uses: actions/cache@v5 + with: + path: ${{ env.DEADZONE_HUGOT_CACHE }} + key: hugot-model-${{ runner.os }}-${{ hashFiles('internal/embed/hugot.go') }} + restore-keys: | + hugot-model-${{ runner.os }}- + - name: Cache ONNX Runtime library + # Key verbatim from .github/workflows/ci.yml — see comment above. + uses: actions/cache@v5 + with: + path: ${{ env.DEADZONE_ORT_CACHE }} + key: ort-lib-${{ runner.os }}-${{ hashFiles('internal/ort/ort.go') }} + restore-keys: | + ort-lib-${{ runner.os }}- + - name: Cache per-lib artifact + id: artifact-cache + uses: actions/cache@v5 + with: + # libraries_sources.yaml hash gates every resolved entry, so a + # URL edit anywhere in the registry invalidates every lib's + # cache — intentional over-invalidation vs per-section hashing + # that would be fragile to YAML reordering. Embedder hash is + # included so a vector-space change forces a rescrape in + # lockstep with the embedding model cache above. + path: artifacts/${{ matrix.entry.slug }} + key: artifact-${{ matrix.entry.slug }}-${{ matrix.entry.version }}-${{ hashFiles('libraries_sources.yaml') }}-${{ hashFiles('internal/embed/hugot.go') }} + - name: Scrape (cache miss) + if: steps.artifact-cache.outputs.cache-hit != 'true' + shell: bash + run: | + set -euo pipefail + args=(scrape -artifacts ./artifacts -lib "${{ matrix.entry.lib_id }}") + if [ -n "${{ matrix.entry.version }}" ]; then + args+=(-version "${{ matrix.entry.version }}") + fi + go run -tags ORT ./cmd/deadzone "${args[@]}" + - name: Record slot status + anchor upload LCA + # Two jobs in one shell because they're both "prep for the + # upload-artifact step below": + # + # 1. .run_status marks the slot as scraped-or-cached so + # consolidate's summary can render the run table. Runs on + # both cache-hit (scrape step skipped) and cache-miss + # (scrape step succeeded). 
A failed scrape skips this step + #         entirely via the default "prior step failed" gate; + #         consolidate's summary falls back to "failed" on a + #         missing .run_status file. + # + #      2. artifacts/.pack-root anchors upload-artifact@v4's LCA + #         calculation at `artifacts/`. Without it, matching only + #         `artifacts/<slug>/...` would collapse the LCA to + #         `artifacts/<slug>/`, stripping the slug prefix from the + #         archive and causing every slot's artifact.db to collide + #         at the same root after download-artifact merge-multiple. + #         The sentinel is a file (not a dir), so + #         db.Consolidate's `artifacts/*/artifact.db` glob skips it. + shell: bash + run: | + set -euo pipefail + mkdir -p "artifacts/${{ matrix.entry.slug }}" + : > artifacts/.pack-root + if [ "${{ steps.artifact-cache.outputs.cache-hit }}" = "true" ]; then + echo cached > "artifacts/${{ matrix.entry.slug }}/.run_status" + else + echo scraped > "artifacts/${{ matrix.entry.slug }}/.run_status" + fi + - name: Stage artifact for consolidate + uses: actions/upload-artifact@v4 + with: + # Pattern B fan-in staging — see research doc §4 for why + # Pattern C (REST cache API) is not buildable today. Retention + # is pinned to 1 day because this is strictly inter-job scratch + # inside a single dispatch; the long-lived transport is the + # artifact cache restored above. + # + # path is `artifacts/` (not `artifacts/<slug>/`) on purpose — + # see the LCA note on the "Record slot status + anchor upload + # LCA" step above. Each matrix slot restored only its own + # slug into `artifacts/<slug>/`, so the archive carries + # exactly one lib's payload under `<slug>/` plus the LCA + # sentinel .pack-root. 
name: scrape-pack-${{ matrix.entry.slug }} + path: artifacts/ + include-hidden-files: true + if-no-files-found: error + retention-days: 1 + + consolidate: + name: consolidate + needs: [expand-libs, scrape] + # Run even when individual scrape slots failed — fail-fast: false on + # the matrix is pointless if the downstream job refuses to render a + # partial DB + summary. expand-libs must succeed though, because + # without its JSON there is no summary to write. + if: always() && needs.expand-libs.result == 'success' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version-file: go.mod + - name: Install native deps + uses: ./.github/actions/install-native-deps + - name: Restore hugot model cache + uses: actions/cache@v5 + with: + path: ${{ env.DEADZONE_HUGOT_CACHE }} + key: hugot-model-${{ runner.os }}-${{ hashFiles('internal/embed/hugot.go') }} + restore-keys: | + hugot-model-${{ runner.os }}- + - name: Restore ORT cache + uses: actions/cache@v5 + with: + path: ${{ env.DEADZONE_ORT_CACHE }} + key: ort-lib-${{ runner.os }}-${{ hashFiles('internal/ort/ort.go') }} + restore-keys: | + ort-lib-${{ runner.os }}- + - name: Fetch staged artifacts + uses: actions/download-artifact@v4 + with: + # Each scrape slot uploaded with LCA = artifacts/, so every + # archive carries `<slug>/...` at its root. Extracting under + # `path: artifacts/` with merge-multiple therefore rebuilds + # `artifacts/<slug>/...` per slot without collisions — the + # shape `deadzone consolidate -artifacts ./artifacts` expects. 
+ pattern: scrape-pack-* + merge-multiple: true + path: artifacts/ + - name: Consolidate + run: go run -tags ORT ./cmd/deadzone consolidate -db deadzone.db -artifacts ./artifacts + - name: Release (if tag) + if: inputs.tag != '' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: go run -tags ORT ./cmd/deadzone dbrelease -db deadzone.db -tag "${{ inputs.tag }}" + - name: Summary + if: always() + shell: bash + env: + LIBS_JSON: ${{ needs.expand-libs.outputs.libs }} + RELEASE_TAG: ${{ inputs.tag }} + run: | + set -euo pipefail + scraped=0; cached=0; failed=0 + { + echo "## scrape-pack summary" + echo "" + if [ -n "$RELEASE_TAG" ]; then + echo "Release tag: \`$RELEASE_TAG\`" + else + echo "Release tag: _(none — consolidated cache only)_" + fi + echo "" + echo "| lib | version | status |" + echo "| --- | --- | --- |" + } >> "$GITHUB_STEP_SUMMARY" + # jq pre-substitutes "-" for empty versions so bash's IFS=$'\t' + # read does not collapse consecutive tabs on single-version + # slots (whitespace IFS semantics would eat the empty field). 
while IFS=$'\t' read -r lib_id version slug; do + status_path="artifacts/${slug}/.run_status" + if [ -f "$status_path" ]; then + status="$(tr -d '[:space:]' < "$status_path")" + else + status="failed" + fi + case "$status" in + scraped) scraped=$((scraped + 1)) ;; + cached) cached=$((cached + 1)) ;; + *) status="failed"; failed=$((failed + 1)) ;; + esac + echo "| \`$lib_id\` | \`$version\` | $status |" >> "$GITHUB_STEP_SUMMARY" + done < <(printf '%s' "$LIBS_JSON" | jq -r '.[] | [.lib_id, (if .version == "" then "-" else .version end), .slug] | @tsv') + { + echo "" + echo "**Totals:** $scraped scraped, $cached cached, $failed failed" + } >> "$GITHUB_STEP_SUMMARY" diff --git a/README.md b/README.md index 682b95a..fe8d0ac 100644 --- a/README.md +++ b/README.md @@ -247,6 +247,8 @@ The per-lib artifact folders under `./artifacts/<slug>/` (each containing `artif > **Note.** The per-artifact GitHub Release distribution flow (`deadzone packs {upload,download,list}`) is paused as of [#101](https://github.com/laradji/deadzone/issues/101) — contributors who want a working DB run `just scrape && just consolidate` locally. Releases carry `deadzone.db` as a single consolidated asset; per-artifact distribution will return when CI takes over at scale. +The full registry can also be scraped from GitHub Actions via the `scrape-pack` workflow (see [`.github/workflows/scrape-pack.yml`](.github/workflows/scrape-pack.yml)) — `gh workflow run scrape-pack.yml -f tag=<tag>` scrapes every resolved lib in parallel, consolidates, and publishes `deadzone.db` to the tagged release; omit `-f tag=…` to stop at a consolidated-db cache. + Run `just` (no args) to list every recipe. Override the DB path with positional args: `just consolidate foo.db` / `just serve foo.db`. If you'd rather call `go` directly, prefix every command with `mise exec --` so you pick up the pinned toolchain. 
### Building release binaries diff --git a/cmd/deadzone/scrape.go b/cmd/deadzone/scrape.go index 69b04d5..bdd3eb5 100644 --- a/cmd/deadzone/scrape.go +++ b/cmd/deadzone/scrape.go @@ -21,6 +21,7 @@ package main import ( "context" + "encoding/json" "errors" "flag" "fmt" @@ -87,6 +88,12 @@ func runScrape(args []string) error { parallelScrapeViaAgent := fs.Int("parallel-scrape-via-agent", envIntOr(EnvParallelScrapeViaAgent, defaultParallelScrapeViaAgent), "max concurrent scrape-via-agent libs (env: "+EnvParallelScrapeViaAgent+"; flag wins over env)") + // -list short-circuits before embedder/agent setup and emits the + // resolved (lib_id, version, slug) matrix to stdout as JSON. Consumed + // by .github/workflows/scrape-pack.yml's expand-libs job (see #126); + // intentionally the only side-effect-free flag on this subcommand so + // a CI runner can list libs without a model cache or network. + listOnly := fs.Bool("list", false, "emit JSON array of {lib_id, version, slug} resolved from -config and exit; skips the embedder and all I/O") if err := fs.Parse(args); err != nil { return err } @@ -130,6 +137,10 @@ func runScrape(args []string) error { } } + if *listOnly { + return emitResolvedList(sources) + } + // One artifacts/ dir per scrape run; created on demand so the first // invocation on a fresh checkout doesn't require an extra `mkdir -p` // step in the README. @@ -683,6 +694,33 @@ func setupAgent(ctx context.Context, sources []scraper.ResolvedSource) (*scraper return agent, nil } +// emitResolvedList writes the resolved (lib_id, version, slug) matrix +// to stdout as a JSON array, one object per ResolvedSource. Consumed by +// .github/workflows/scrape-pack.yml's expand-libs step, which pipes the +// value into a `matrix:` via `fromJSON`. slug matches packs.Slug so the +// cache-key path in each scrape matrix slot is trivially reconstructible +// from the JSON alone. 
+func emitResolvedList(sources []scraper.ResolvedSource) error { + type libEntry struct { + LibID string `json:"lib_id"` + Version string `json:"version"` + Slug string `json:"slug"` + } + entries := make([]libEntry, 0, len(sources)) + for _, s := range sources { + entries = append(entries, libEntry{ + LibID: s.LibID, + Version: s.Version, + Slug: packs.Slug(s.LibID, s.Version), + }) + } + enc := json.NewEncoder(os.Stdout) + // Single-line output: GitHub Actions' `$GITHUB_OUTPUT` protocol + // breaks on embedded newlines unless the multi-line heredoc form is + // used, and the expand-libs job uses the single-line form. + return enc.Encode(entries) +} + // envIntOr reads an integer from env var name, falling back to def if // the var is unset, empty, or unparseable. Used to make the // -parallel-* flag defaults env-overridable without needing a separate diff --git a/justfile b/justfile index 6266b6c..553906a 100644 --- a/justfile +++ b/justfile @@ -110,10 +110,10 @@ vet: tidy: mise exec -- go mod tidy -# Run the scraper, writing one artifact per lib to ./artifacts/ (pass lib=/org/project to refresh only that entry) -scrape lib="": +# Run the scraper, writing one artifact per lib to ./artifacts/ (pass lib=/org/project to refresh only that entry; pass version=X to pin to one expanded version) +scrape lib="" version="": CGO_ENABLED=1 CGO_LDFLAGS="-L${DEADZONE_TOKENIZERS_LIB:-./lib}" \ - mise exec -- go run -tags ORT ./cmd/deadzone scrape -artifacts ./artifacts {{ if lib != "" { "-lib " + lib } else { "" } }} + mise exec -- go run -tags ORT ./cmd/deadzone scrape -artifacts ./artifacts {{ if lib != "" { "-lib " + lib } else { "" } }} {{ if version != "" { "-version " + version } else { "" } }} # Merge per-lib artifacts in ./artifacts/ into the main deadzone DB consolidate db="deadzone.db":