Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 281 additions & 0 deletions .github/workflows/scrape-pack.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
# scrape-pack — batch scrape the registry on GH-hosted runners, consolidate
# into a single deadzone.db, optionally publish via `deadzone dbrelease`.
#
# Decisions live in docs/research/batch-scrape-actions.md and #126:
# - workflow_dispatch only (no cron until #47)
# - per-lib artifact cache IS the freshness shim across runs
# - inter-JOB transport in a SINGLE run uses Pattern B (upload-artifact
#   staging) because Pattern C (REST cache API) is not implementable —
#   the cache archive download URL is a runtime-signed Azure Blob link
#   that only @actions/cache's Node runtime token can negotiate, not
#   reachable from `gh api`. The cross-run freshness story still runs
#   entirely through the cache layer; upload-artifact is scratch
#   transport scoped to one dispatch.
# - dbrelease fires only when inputs.tag is non-empty; empty tag stops
#   at the consolidated-db cache (no side effects on the releases page)

name: scrape-pack

on:
  workflow_dispatch:
    inputs:
      lib:
        description: "Filter base lib_id (e.g. /hashicorp/terraform). Empty = every resolved lib."
        required: false
        type: string
        default: ""
      tag:
        description: "Release tag (e.g. v0.1.0). Non-empty chains `deadzone dbrelease`. Empty stops at consolidated-db cache."
        required: false
        type: string
        default: ""

permissions:
  contents: write # dbrelease writes GH Release assets when inputs.tag != ""

concurrency:
  # Queue serially: parallel dispatches would fight over the same cache
  # keys, and dbrelease --clobbers the same asset names, so we let one
  # finish before the next starts.
  group: scrape-pack
  cancel-in-progress: false

env:
  # Mirror ci.yml verbatim — same pinned tokenizer static archive, same
  # model + ORT cache roots. Bumping TOKENIZERS_VERSION in ci.yml
  # invalidates the composite action's cache key via its own hashFiles
  # rule; this workflow piggybacks on that invalidation.
  TOKENIZERS_VERSION: v1.26.0
  DEADZONE_HUGOT_CACHE: ${{ github.workspace }}/.deadzone-cache/models
  DEADZONE_ORT_CACHE: ${{ github.workspace }}/.deadzone-cache/ort
  # CGO_ENABLED quoted so the value stays the string "1" rather than an int.
  CGO_ENABLED: "1"
  CGO_LDFLAGS: -L/tmp/deadzone-deps/tokenizers

jobs:
  # Resolve the registry into a JSON matrix of {lib_id, version, slug}
  # entries that the scrape job fans out over.
  expand-libs:
    name: expand-libs
    runs-on: ubuntu-latest
    outputs:
      libs: ${{ steps.list.outputs.libs }}
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-go@v6
        with:
          go-version-file: go.mod
      - name: Install native deps
        uses: ./.github/actions/install-native-deps
      - name: Emit resolved libs as JSON
        id: list
        shell: bash
        env:
          # Route the user-supplied dispatch input through env instead of
          # expanding ${{ inputs.lib }} inline in the script body — inline
          # expansion is the GitHub Actions script-injection vector (a lib
          # value containing shell metacharacters would be substituted
          # into bash before it runs).
          INPUT_LIB: ${{ inputs.lib }}
        run: |
          set -euo pipefail
          args=(scrape -list -config libraries_sources.yaml)
          if [ -n "$INPUT_LIB" ]; then
            args+=(-lib "$INPUT_LIB")
          fi
          libs="$(go run -tags ORT ./cmd/deadzone "${args[@]}")"
          echo "resolved: $libs"
          echo "libs=$libs" >> "$GITHUB_OUTPUT"

scrape:
name: scrape (${{ matrix.entry.slug }})
needs: expand-libs
runs-on: ubuntu-latest
strategy:
# Siblings must keep running when one lib breaks — continue-on-error
# semantics at the matrix level (#93 mirrors this inside a single
# scrape invocation). The consolidate summary surfaces per-slot
# failures so a broken slot is visible, not silently dropped.
fail-fast: false
max-parallel: 20
matrix:
entry: ${{ fromJSON(needs.expand-libs.outputs.libs) }}
steps:
- uses: actions/checkout@v6
- uses: actions/setup-go@v6
with:
go-version-file: go.mod
- name: Install native deps
uses: ./.github/actions/install-native-deps
- name: Cache embedding model
# Key verbatim from .github/workflows/ci.yml — bumping the embedder
# (internal/embed/hugot.go) invalidates both caches in lockstep,
# which is exactly the freshness invariant for the artifact cache
# below.
uses: actions/cache@v5
with:
path: ${{ env.DEADZONE_HUGOT_CACHE }}
key: hugot-model-${{ runner.os }}-${{ hashFiles('internal/embed/hugot.go') }}
restore-keys: |
hugot-model-${{ runner.os }}-
- name: Cache ONNX Runtime library
# Key verbatim from .github/workflows/ci.yml — see comment above.
uses: actions/cache@v5
with:
path: ${{ env.DEADZONE_ORT_CACHE }}
key: ort-lib-${{ runner.os }}-${{ hashFiles('internal/ort/ort.go') }}
restore-keys: |
ort-lib-${{ runner.os }}-
- name: Cache per-lib artifact
id: artifact-cache
uses: actions/cache@v5
with:
# libraries_sources.yaml hash gates every resolved entry, so a
# URL edit anywhere in the registry invalidates every lib's
# cache — intentional over-invalidation vs per-section hashing
# that would be fragile to YAML reordering. Embedder hash is
# included so a vector-space change forces a rescrape in
# lockstep with the embedding model cache above.
path: artifacts/${{ matrix.entry.slug }}
key: artifact-${{ matrix.entry.slug }}-${{ matrix.entry.version }}-${{ hashFiles('libraries_sources.yaml') }}-${{ hashFiles('internal/embed/hugot.go') }}
- name: Scrape (cache miss)
if: steps.artifact-cache.outputs.cache-hit != 'true'
shell: bash
run: |
set -euo pipefail
args=(scrape -artifacts ./artifacts -lib "${{ matrix.entry.lib_id }}")
if [ -n "${{ matrix.entry.version }}" ]; then
args+=(-version "${{ matrix.entry.version }}")
fi
go run -tags ORT ./cmd/deadzone "${args[@]}"
- name: Record slot status + anchor upload LCA
# Two jobs in one shell because they're both "prep for the
# upload-artifact step below":
#
# 1. .run_status marks the slot as scraped-or-cached so
# consolidate's summary can render the run table. Runs on
# both cache-hit (scrape step skipped) and cache-miss
# (scrape step succeeded). A failed scrape skips this step
# entirely via the default "prior step failed" gate;
# consolidate's summary falls back to "failed" on a
# missing .run_status file.
#
# 2. artifacts/.pack-root anchors upload-artifact@v4's LCA
# calculation at `artifacts/`. Without it, matching only
# `artifacts/<slug>/...` would collapse the LCA to
# `artifacts/<slug>/`, stripping the slug prefix from the
# archive and causing every slot's artifact.db to collide
# at the same root after download-artifact merge-multiple.
# The sentinel is a file (not a dir), so
# db.Consolidate's `<dir>/*/artifact.db` glob skips it.
shell: bash
run: |
set -euo pipefail
mkdir -p "artifacts/${{ matrix.entry.slug }}"
: > artifacts/.pack-root
if [ "${{ steps.artifact-cache.outputs.cache-hit }}" = "true" ]; then
echo cached > "artifacts/${{ matrix.entry.slug }}/.run_status"
else
echo scraped > "artifacts/${{ matrix.entry.slug }}/.run_status"
fi
- name: Stage artifact for consolidate
uses: actions/upload-artifact@v4
with:
# Pattern B fan-in staging — see research doc §4 for why
# Pattern C (REST cache API) is not buildable today. Retention
# is pinned to 1 day because this is strictly inter-job scratch
# inside a single dispatch; the long-lived transport is the
# artifact cache restored above.
#
# path is `artifacts/` (not `artifacts/<slug>`) on purpose —
# see the LCA note on the "Record slot status + anchor upload
# LCA" step above. Each matrix slot restored only its own
# slug into `artifacts/<slug>/`, so the archive carries
# exactly one lib's payload under `<slug>/` plus the LCA
# sentinel .pack-root.
name: scrape-pack-${{ matrix.entry.slug }}
path: artifacts/
include-hidden-files: true
if-no-files-found: error
retention-days: 1

consolidate:
name: consolidate
needs: [expand-libs, scrape]
# Run even when individual scrape slots failed — fail-fast: false on
# the matrix is pointless if the downstream job refuses to render a
# partial DB + summary. expand-libs must succeed though, because
# without its JSON there is no summary to write.
if: always() && needs.expand-libs.result == 'success'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: actions/setup-go@v6
with:
go-version-file: go.mod
- name: Install native deps
uses: ./.github/actions/install-native-deps
- name: Restore hugot model cache
uses: actions/cache@v5
with:
path: ${{ env.DEADZONE_HUGOT_CACHE }}
key: hugot-model-${{ runner.os }}-${{ hashFiles('internal/embed/hugot.go') }}
restore-keys: |
hugot-model-${{ runner.os }}-
- name: Restore ORT cache
uses: actions/cache@v5
with:
path: ${{ env.DEADZONE_ORT_CACHE }}
key: ort-lib-${{ runner.os }}-${{ hashFiles('internal/ort/ort.go') }}
restore-keys: |
ort-lib-${{ runner.os }}-
- name: Fetch staged artifacts
uses: actions/download-artifact@v4
with:
# Each scrape slot uploaded with LCA = artifacts/, so every
# archive carries `<slug>/...` at its root. Extracting under
# `path: artifacts/` with merge-multiple therefore rebuilds
# `artifacts/<slug>/...` per slot without collisions — the
# shape `deadzone consolidate -artifacts ./artifacts` expects.
pattern: scrape-pack-*
merge-multiple: true
path: artifacts/
- name: Consolidate
run: go run -tags ORT ./cmd/deadzone consolidate -db deadzone.db -artifacts ./artifacts
- name: Release (if tag)
if: inputs.tag != ''
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: go run -tags ORT ./cmd/deadzone dbrelease -db deadzone.db -tag "${{ inputs.tag }}"
- name: Summary
if: always()
shell: bash
env:
LIBS_JSON: ${{ needs.expand-libs.outputs.libs }}
RELEASE_TAG: ${{ inputs.tag }}
run: |
set -euo pipefail
scraped=0; cached=0; failed=0
{
echo "## scrape-pack summary"
echo ""
if [ -n "$RELEASE_TAG" ]; then
echo "Release tag: \`$RELEASE_TAG\`"
else
echo "Release tag: _(none — consolidated cache only)_"
fi
echo ""
echo "| lib | version | status |"
echo "| --- | --- | --- |"
} >> "$GITHUB_STEP_SUMMARY"
# jq pre-substitutes "-" for empty versions so bash's IFS=$'\t'
# read does not collapse consecutive tabs on single-version
# slots (whitespace IFS semantics would eat the empty field).
while IFS=$'\t' read -r lib_id version slug; do
status_path="artifacts/${slug}/.run_status"
if [ -f "$status_path" ]; then
status="$(tr -d '[:space:]' < "$status_path")"
else
status="failed"
fi
case "$status" in
scraped) scraped=$((scraped + 1)) ;;
cached) cached=$((cached + 1)) ;;
*) status="failed"; failed=$((failed + 1)) ;;
esac
echo "| \`$lib_id\` | \`$version\` | $status |" >> "$GITHUB_STEP_SUMMARY"
done < <(printf '%s' "$LIBS_JSON" | jq -r '.[] | [.lib_id, (if .version == "" then "-" else .version end), .slug] | @tsv')
{
echo ""
echo "**Totals:** $scraped scraped, $cached cached, $failed failed"
} >> "$GITHUB_STEP_SUMMARY"
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ The per-lib artifact folders under `./artifacts/<slug>/` (each containing `artif

> **Note.** The per-artifact GitHub Release distribution flow (`deadzone packs {upload,download,list}`) is paused as of [#101](https://github.com/laradji/deadzone/issues/101) — contributors who want a working DB run `just scrape && just consolidate` locally. Releases carry `deadzone.db` as a single consolidated asset; per-artifact distribution will return when CI takes over at scale.

The full registry can also be scraped from GitHub Actions via the `scrape-pack` workflow (see [`.github/workflows/scrape-pack.yml`](.github/workflows/scrape-pack.yml)) — `gh workflow run scrape-pack.yml -f tag=<tag>` scrapes every resolved lib in parallel, consolidates, and publishes `deadzone.db` to the tagged release; omit `-f tag=…` to stop at a consolidated-db cache.

Run `just` (no args) to list every recipe. Override the DB path with positional args: `just consolidate foo.db` / `just serve foo.db`. If you'd rather call `go` directly, prefix every command with `mise exec --` so you pick up the pinned toolchain.

### Building release binaries
Expand Down
38 changes: 38 additions & 0 deletions cmd/deadzone/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package main

import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
Expand Down Expand Up @@ -87,6 +88,12 @@ func runScrape(args []string) error {
parallelScrapeViaAgent := fs.Int("parallel-scrape-via-agent",
envIntOr(EnvParallelScrapeViaAgent, defaultParallelScrapeViaAgent),
"max concurrent scrape-via-agent libs (env: "+EnvParallelScrapeViaAgent+"; flag wins over env)")
// -list short-circuits before embedder/agent setup and emits the
// resolved (lib_id, version, slug) matrix to stdout as JSON. Consumed
// by .github/workflows/scrape-pack.yml's expand-libs job (see #126);
// intentionally the only side-effect-free flag on this subcommand so
// a CI runner can list libs without a model cache or network.
listOnly := fs.Bool("list", false, "emit JSON array of {lib_id, version, slug} resolved from -config and exit; skips the embedder and all I/O")
if err := fs.Parse(args); err != nil {
return err
}
Expand Down Expand Up @@ -130,6 +137,10 @@ func runScrape(args []string) error {
}
}

if *listOnly {
return emitResolvedList(sources)
}

// One artifacts/ dir per scrape run; created on demand so the first
// invocation on a fresh checkout doesn't require an extra `mkdir -p`
// step in the README.
Expand Down Expand Up @@ -683,6 +694,33 @@ func setupAgent(ctx context.Context, sources []scraper.ResolvedSource) (*scraper
return agent, nil
}

// emitResolvedList prints the resolved (lib_id, version, slug) matrix to
// stdout as a JSON array with one object per ResolvedSource. Consumed by
// .github/workflows/scrape-pack.yml's expand-libs step, which feeds the
// value into a `matrix:` via `fromJSON`. slug matches packs.Slug so each
// scrape matrix slot can rebuild its cache-key path from the JSON alone.
func emitResolvedList(sources []scraper.ResolvedSource) error {
	// JSON field names are part of the workflow contract — keep them
	// stable even if the Go-side names change.
	type libEntry struct {
		LibID   string `json:"lib_id"`
		Version string `json:"version"`
		Slug    string `json:"slug"`
	}
	out := make([]libEntry, len(sources))
	for i, src := range sources {
		out[i] = libEntry{
			LibID:   src.LibID,
			Version: src.Version,
			Slug:    packs.Slug(src.LibID, src.Version),
		}
	}
	// json.Encoder emits a single line plus trailing newline — required
	// because expand-libs writes the value with the single-line
	// `$GITHUB_OUTPUT` form, which breaks on embedded newlines.
	return json.NewEncoder(os.Stdout).Encode(out)
}

// envIntOr reads an integer from env var name, falling back to def if
// the var is unset, empty, or unparseable. Used to make the
// -parallel-* flag defaults env-overridable without needing a separate
Expand Down
6 changes: 3 additions & 3 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@ vet:
tidy:
mise exec -- go mod tidy

# Run the scraper, writing one artifact per lib to ./artifacts/ (pass lib=/org/project to refresh only that entry; pass version=X to pin to one expanded version)
scrape lib="" version="":
    CGO_ENABLED=1 CGO_LDFLAGS="-L${DEADZONE_TOKENIZERS_LIB:-./lib}" \
    mise exec -- go run -tags ORT ./cmd/deadzone scrape -artifacts ./artifacts {{ if lib != "" { "-lib " + lib } else { "" } }} {{ if version != "" { "-version " + version } else { "" } }}

# Merge per-lib artifacts in ./artifacts/ into the main deadzone DB
consolidate db="deadzone.db":
Expand Down