# Skip to content
#
# feat: Add Papers With Code and Hugging Face datasets #1
#
# feat: Add Papers With Code and Hugging Face datasets
#
# feat: Add Papers With Code and Hugging Face datasets #1

# Validates the data-source JSON files changed by a pull request.
name: Validate Data Sources

# Run only for pull requests that touch a source JSON file or the schema file.
on:
  pull_request:
    paths:
      - "firstdata/sources/**/*.json"
      - "firstdata/schemas/datasource-schema.json"

# Least privilege: the jobs in this workflow only read the checked-out
# repository, so the token is restricted to read-only contents access.
permissions:
  contents: read
jobs:
  # Fails the pull request if it changes the JSON schema file.
  protect-schema:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Fetch full history so the base branch ref can be diffed against.
          fetch-depth: 0
      - name: Block schema modifications
        env:
          # Pass the base branch name through the environment instead of
          # interpolating the expression into the script text.
          BASE_REF: ${{ github.base_ref }}
          SCHEMA_PATH: firstdata/schemas/datasource-schema.json
        run: |
          # Capture the diff in a variable rather than piping into grep:
          # under `bash -e` a failing `git diff` then aborts the step instead
          # of being masked by grep's "no match" exit status. The pathspec
          # limits the diff to exactly the schema file (no regex matching).
          changed="$(git diff --name-only "origin/${BASE_REF}...HEAD" -- "${SCHEMA_PATH}")"
          if [ -n "${changed}" ]; then
            echo "❌ PRs must not modify firstdata/schemas/datasource-schema.json"
            echo "Schema changes require direct commit to main by a maintainer."
            exit 1
          fi

  # Validates every source JSON file against the schema, then checks that
  # the "id" values are unique across files. Runs only after protect-schema
  # has succeeded.
  validate:
    needs: protect-schema
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install dependencies
        run: uv sync
      - name: Validate all source JSON files
        run: |
          # NUL-delimited file names so paths containing spaces or quotes
          # are passed to check-jsonschema intact.
          find firstdata/sources -type f -name "*.json" -print0 \
            | xargs -0 uv run check-jsonschema \
                --schemafile firstdata/schemas/datasource-schema.json
      - name: Check for duplicate IDs
        run: |
          uv run python - <<'EOF'
          import json, sys
          from pathlib import Path
          # Maps each "id" value to the first file in which it was seen.
          seen = {}
          errors = []
          # Sorted traversal keeps the report order deterministic.
          for path in sorted(Path("firstdata/sources").rglob("*.json")):
              # NOTE(review): assumes each file holds a JSON object; presumably
              # guaranteed by the schema validation step above - confirm.
              data = json.loads(path.read_text(encoding="utf-8"))
              id_ = data.get("id")
              if id_ in seen:
                  errors.append(f"Duplicate id '{id_}' in:\n {seen[id_]}\n {path}")
              else:
                  seen[id_] = path
          if errors:
              print("❌ Duplicate IDs found:")
              for e in errors:
                  print(e)
              sys.exit(1)
          print(f"✅ All {len(seen)} IDs are unique.")
          EOF