Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/example-data-seeder.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Build and push image
uses: docker/build-push-action@v7
with:
context: ./example-data
context: ./collection-seeding
tags: ${{ steps.dockerMetadata.outputs.tags }}
cache-from: type=gha,scope=example-data-seeder-${{ github.ref }}
cache-to: type=gha,mode=max,scope=example-data-seeder-${{ github.ref }}
Expand Down
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,10 @@ logs
node_modules/

.env

# Python
__pycache__/
*.pyc

# pixi
.pixi/
14 changes: 14 additions & 0 deletions collection-seeding/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Stage 1: use pixi to resolve and install dependencies
FROM ghcr.io/prefix-dev/pixi:0.58.0 AS builder
WORKDIR /app
# Copy only the manifest and lockfile so the install layer below is reused
# from cache until the dependencies actually change.
# With multiple sources the destination must be a directory ending in "/".
COPY pixi.toml pixi.lock ./
# --frozen installs from pixi.lock as-is instead of re-resolving.
RUN pixi install --frozen

# Stage 2: slim runtime image — copy only the installed site-packages
FROM python:3.13-slim AS final
WORKDIR /app
COPY --from=builder /app/.pixi/envs/default/lib/python3.13/site-packages \
     /usr/local/lib/python3.13/site-packages
# models.py must ship with the image: backend.py imports it at module load
# ("from models import Collection, ExistingCollection"), so omitting it makes
# the container fail with ModuleNotFoundError on start-up.
COPY seed.py backend.py models.py ./
COPY sources/ sources/
CMD ["python", "seed.py"]
43 changes: 43 additions & 0 deletions collection-seeding/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# collection-seeding

Seeds the backend with example collections:

- **covid-resistance-mutations** — resistance mutation data for 3CLpro, RdRp, and Spike mAb
- **covid-pango-lineages** — one collection per pango lineage, with nucleotide substitutions as variants

The script is idempotent — re-running it creates any collections that do not exist yet and updates the ones that already exist (matched by name) instead of duplicating them.

Collections are seeded under the [genspectrum-bot](https://github.com/genspectrum-bot) account (GitHub ID `218605180`), which is upserted automatically via `POST /users/sync` before seeding.

## Via Docker Compose

The seeder runs automatically as part of Docker Compose:

```bash
BACKEND_TAG=latest WEBSITE_TAG=latest SEEDER_TAG=latest docker compose up
```

## Running locally

Requires [pixi](https://pixi.sh). Install dependencies once:

```bash
pixi install
```

Then use the provided tasks:

```bash
pixi run seed # all sources (resistance mutations + first 10 lineages)
pixi run seed-resistance # resistance mutations only
pixi run seed-lineages # pango lineages (first 10)
pixi run seed-all-lineages # all ~4976 pango lineages
```

To target a different backend:

```bash
pixi run seed --url http://localhost:9021
```

Run `pixi run seed --help` or `pixi run seed <source> --help` for all options.
67 changes: 67 additions & 0 deletions collection-seeding/backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""Shared backend API client for collection seeders."""

import sys
import time

import requests

from models import Collection, ExistingCollection

# Default polling budget for BackendClient.wait_for_backend: the number of
# user-sync attempts made before giving up.
RETRY_ATTEMPTS = 30
# Default pause between two polling attempts, in seconds (passed to time.sleep).
RETRY_DELAY_S = 2


# Default identity sent to POST /users/sync by BackendClient.sync_user.
SYNC_GITHUB_ID = "218605180"  # https://github.com/genspectrum-bot
SYNC_NAME = "GenSpectrum Team"


class BackendClient:
    """Small HTTP client for the backend endpoints used by the collection seeders.

    The client first syncs (upserts) the seed user via ``POST /users/sync`` and
    remembers the returned internal user id; that id is then sent as the
    ``userId`` query parameter on every ``/collections`` request.
    """

    def __init__(self, base_url: str, timeout: float = 10):
        """Create a client for the backend at *base_url*.

        Args:
            base_url: Root URL of the backend; a trailing ``/`` is stripped.
            timeout: Per-request timeout in seconds handed to ``requests``.
                Defaults to 10, the value previously hard-coded on every call.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        # Set by sync_user(); stays None until the first successful sync.
        self.user_id: int | None = None
        self._collections_url = f"{self.base_url}/collections"

    def sync_user(
        self,
        github_id: str = SYNC_GITHUB_ID,
        name: str = SYNC_NAME,
        email: str | None = None,
    ) -> int:
        """Upsert the seed user and store the returned internal id.

        Returns:
            The ``id`` field of the JSON response (also stored in ``self.user_id``).

        Raises:
            RuntimeError: If the backend answers with a non-2xx/3xx status.
            requests.RequestException: If the request itself fails.
        """
        body = {"githubId": github_id, "name": name, "email": email}
        r = requests.post(f"{self.base_url}/users/sync", json=body, timeout=self.timeout)
        if not r.ok:
            raise RuntimeError(f"POST /users/sync failed: {r.status_code} {r.text}")
        self.user_id = r.json()["id"]
        return self.user_id

    def wait_for_backend(self, attempts: int = RETRY_ATTEMPTS, delay: float = RETRY_DELAY_S):
        """Poll until the backend is ready by repeatedly attempting user sync.

        Returns as soon as one ``sync_user()`` call succeeds. If every attempt
        fails, prints a diagnostic (including the last error seen) to stderr
        and terminates the process with exit code 1.

        Args:
            attempts: Maximum number of sync attempts.
            delay: Seconds to wait between two attempts.
        """
        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                self.sync_user()
                return
            except (requests.RequestException, RuntimeError) as err:
                # Keep the cause so the final failure message can explain
                # why the backend was considered not ready.
                last_error = err
            print(f"Waiting for backend... (attempt {attempt}/{attempts})")
            # No point in sleeping after the final attempt: we exit right away.
            if attempt < attempts:
                time.sleep(delay)

        message = f"Backend at {self.base_url} did not become ready after {attempts} attempts."
        if last_error is not None:
            message += f" Last error: {last_error}"
        print(message, file=sys.stderr)
        sys.exit(1)

    def fetch_existing_collections(self, organism: str) -> list[ExistingCollection]:
        """Return the synced user's collections for *organism* (decoded JSON body).

        Raises:
            RuntimeError: If ``GET /collections`` does not succeed.
        """
        params = {"userId": self.user_id, "organism": organism}
        r = requests.get(self._collections_url, params=params, timeout=self.timeout)
        if not r.ok:
            raise RuntimeError(f"GET /collections failed: {r.status_code} {r.text}")
        return r.json()

    def create_collection(self, collection: Collection) -> int:
        """Create *collection* and return the ``id`` from the response body.

        Raises:
            RuntimeError: If the backend does not answer with ``201``.
        """
        params = {"userId": self.user_id}
        r = requests.post(
            self._collections_url, params=params, json=collection, timeout=self.timeout
        )
        # Only an explicit 201 counts as success for creation.
        if r.status_code != 201:
            raise RuntimeError(f"POST /collections failed: {r.status_code} {r.text}")
        return r.json()["id"]

    def update_collection(self, collection_id: int, collection: Collection) -> None:
        """Replace the collection with id *collection_id* by *collection*.

        Raises:
            RuntimeError: If ``PUT /collections/{collection_id}`` does not succeed.
        """
        params = {"userId": self.user_id}
        r = requests.put(
            f"{self._collections_url}/{collection_id}",
            params=params,
            json=collection,
            timeout=self.timeout,
        )
        if not r.ok:
            raise RuntimeError(f"PUT /collections/{collection_id} failed: {r.status_code} {r.text}")
27 changes: 27 additions & 0 deletions collection-seeding/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Shared type definitions for collection seeding."""

from typing import TypedDict


class FilterObject(TypedDict, total=False):
    """Filter attached to a variant; ``total=False`` makes every key optional."""

    # Both keys hold lists of strings.
    # NOTE(review): presumably mutation identifiers — confirm the exact string format.
    aminoAcidMutations: list[str]
    nucleotideMutations: list[str]


class Variant(TypedDict):
    """One entry of a collection's ``variants`` list; all three keys are required."""

    # NOTE(review): the set of allowed ``type`` values is not visible here — confirm against the backend.
    type: str
    name: str
    filterObject: FilterObject


class Collection(TypedDict):
    """A collection definition: a named, described list of variants for one organism.

    All keys are required.
    """

    name: str
    organism: str
    description: str
    variants: list[Variant]


class ExistingCollection(TypedDict):
    """A collection as returned by the backend (includes the assigned id)."""

    # Only the keys declared here are typed.
    # NOTE(review): the backend response may carry more fields — confirm.
    id: int
    name: str
Loading
Loading