From f6b4b4cb2e2c5656a301405543996019199ea4fc Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 09:13:46 -0400 Subject: [PATCH 01/19] ignore pycache --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 21d0b89..a230a78 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .venv/ +__pycache__/ From 5f70e5761071aa88df167153cc42a36221f28a23 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 09:37:31 -0400 Subject: [PATCH 02/19] new sqlite based download tracking --- DEPLOYMENT.md | 212 +++++++++++++++++ README.md | 142 ++++++++++-- download_db.py | 293 ++++++++++++++++++++++++ image_install_db.py | 546 ++++++++++++++++++++++++++++++++++++++++++++ image_install_db.sh | 17 ++ init_download_db.py | 267 ++++++++++++++++++++++ init_download_db.sh | 21 ++ status_report.py | 157 +++++++++++++ 8 files changed, 1630 insertions(+), 25 deletions(-) create mode 100644 DEPLOYMENT.md create mode 100644 download_db.py create mode 100644 image_install_db.py create mode 100755 image_install_db.sh create mode 100644 init_download_db.py create mode 100755 init_download_db.sh create mode 100644 status_report.py diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000..e487545 --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,212 @@ +# Deployment Guide — Image Downloader with SQLite Status Tracking + +This guide covers deploying the GBIF herbarium image downloader after its switch +from flat-file checkpoints (`processed_ids.txt` / `failed_ids.txt`) to a +queryable SQLite status database. + +There are two phases: **build the database once**, then **run (and re-run) the +downloader**. All commands assume the SCC and the `spring-2026-pyt` conda +environment. 
+ +--- + +## What changed + +| Before | After | +|---|---| +| Progress in `processed_ids.txt` / `failed_ids.txt` (ID only, no reason) | Progress in `download_status.db` — every URL's outcome and *why* it failed | +| `multimedia.txt` re-read and re-grouped with pandas every run | Ingested into the DB once; later runs query the work queue | +| Failed IDs all retried blindly (or skipped) | Only transient failures retried (timeout/rate-limit/5xx/dropped connection), capped at 4 attempts | +| `analyze_image_progress.py` (slow, loads ~180 MB of text) | `status_report.py` (instant SQL queries) | +| ~1.4 GB run logs, ~134 MB warning spam | `WARNING`-level log only; warning spam suppressed | + +The database lives **outside this git repo**, in the data directory, so it is +never committed: + +- `download_status.db` (+ `-wal`, `-shm` companions) at + `/projectnb/herbdl/data/GBIF-F25h/download_status.db` +- Estimated size after ingest: **~10–15 GB** + +--- + +## Files + +| File | Role | +|---|---| +| `init_download_db.py` / `init_download_db.sh` | One-time database builder (+ qsub wrapper) | +| `image_install_db.py` / `image_install_db.sh` | The downloader (+ qsub wrapper) | +| `status_report.py` | Progress reporting | +| `download_db.py` | Shared schema + DB helpers (imported, not run) | + +> The original flat-file downloader is preserved as `image_install_parallel.py` +> (run via `image_install.sh`). It is independent of the database workflow +> described here and is kept only for reference / fallback. + +--- + +## Phase 1 — Build the status database (once) + +This step ingests `multimedia.txt`, imports already-completed downloads from +`processed_ids.txt`, and renames legacy `<gbifID>.jpg` files to `<gbifID>-00.jpg`. + +It is heavy — it reads the ~59M-row `multimedia.txt` with pandas and renames up +to ~13.5M files.
**Run it as a batch job, not on a login node.** + +```bash +qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl \ + -m beas -M your_email@bu.edu init_download_db.sh +``` + +`init_download_db.sh` runs: + +```bash +python init_download_db.py \ + --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt +``` + +> **Important:** the production `processed_ids.txt` (~13.5M IDs) lives in +> ljhao's working directory, not in this repo. The wrapper already points there. +> If you build the DB by hand, pass that `--processed-file` path explicitly, or +> the legacy progress will not be imported. + +**Options:** + +| Command | Effect | +|---|---| +| `python init_download_db.py` | Build DB + import legacy progress | +| `python init_download_db.py --skip-legacy` | Build DB only (everything starts `pending`) | +| `python init_download_db.py --reset` | Delete an existing DB and rebuild from scratch | + +**Expected output** — a status breakdown, e.g.: + +``` +Final gbifID status counts: + done 13,200,000 + partial 320,000 + pending 36,900,000 +``` + +- `done` — every image for the gbifID is present +- `partial` — has an image already (legacy first image) but more to fetch +- `pending` — never attempted + +Re-running the builder is safe: file renames are idempotent (already-renamed +files are detected and reused). If a build fails partway, re-run with `--reset`. + +--- + +## Phase 2 — Run the downloader + +The downloader has no separate "resume" mode — every run reads the work queue +(`pending` + `partial` gbifIDs) from the database. Submit it as many times as +needed; each run continues where the last left off. + +```bash +qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl \ + -m beas -M your_email@bu.edu image_install_db.sh +``` + +If the job hits its `h_rt` wall-clock limit, just submit it again — progress is +committed to the database continuously, and host cooldown / circuit-breaker +state is persisted between runs. 
+ +When the work queue is empty the script prints +`Nothing to download` and exits. + +To point at a non-default database, pass `--db PATH` (edit `image_install_db.sh`). + +--- + +## Phase 3 — Monitor progress + +Run any time — it is read-only and returns in seconds: + +```bash +python status_report.py +``` + +It prints (and writes `summary_YYYYMMDDHHMM.txt`): gbifID and per-image +progress, failures broken down by type, retry-attempt distribution, the worst +hosts, and circuit-breaker state. + +The run log (`WARNING` and above) is at +`/projectnb/herbdl/logs/image_install_*.log`. + +Ad hoc queries: + +```bash +sqlite3 /projectnb/herbdl/data/GBIF-F25h/download_status.db +``` +```sql +-- count each kind of failure +SELECT error_type, COUNT(*) FROM images +WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; + +-- URLs still worth retrying +SELECT gbif_id, url FROM images WHERE status='failed_transient' LIMIT 50; + +-- hosts currently in cooldown +SELECT host, datetime(blocked_until,'unixepoch') FROM hosts +WHERE blocked_until > strftime('%s','now'); +``` + +--- + +## How retries work + +Each failure is classified into an `error_type`: + +- **Permanent** — `http_404`, `http_401`, `http_403`, `http_410`, + `invalid_content_type`, `not_an_image`, … → never retried. +- **Transient** — `timeout`, `rate_limited`, `server_error`, + `connection_broken`, `truncated`, `manifest_error`, `other` → retried on later runs, + up to **4 attempts** (`MAX_ATTEMPTS` in `download_db.py`), then they count + toward the gbifID's `failed` status. + +A gbifID leaves the work queue only when it is `done` (all images succeeded) or +`failed` (all images terminal, no retries left).
To re-open exhausted transient +failures for another pass, raise `MAX_ATTEMPTS` or reset rows manually, e.g.: + +```sql +UPDATE images SET status='pending', attempts=0 +WHERE status='failed_transient'; +UPDATE gbif_ids SET status='partial' WHERE status='failed'; +``` + +--- + +## Caveats + +- **Legacy first-image index is approximate.** For gbifIDs imported from + `processed_ids.txt` that have more than one image, the existing file is + assumed to be image index 0 and marked `error_type='legacy_unverified_index'`. + The old downloader shuffled URLs, so the exact source URL is unknown. This is + exact for the ~87% of gbifIDs that have only one image; for the rest it + affects only metadata, not the image files. +- **Database size.** Expect ~10–15 GB. It sits in the data directory, not the + repo. Ensure the `herbdl` project has the space. +- **Single job at a time.** SQLite (WAL mode) is fine for one job with 5 worker + threads. Do not run multiple `image_install_db.sh` jobs against the same + database concurrently. + +--- + +## Rollback + +The previous flat-file downloader still exists in +`/projectnb/herbdl/workspaces/ljhao/herbdl/utils/` and is unaffected by this +work. To revert this repo, use git (`git log` / `git revert`). The status +database is independent — deleting `download_status.db*` simply means Phase 1 +must be re-run. + +--- + +## Troubleshooting + +| Symptom | Fix | +|---|---| +| `Status database not found` | Run Phase 1 first (`init_download_db.sh`). | +| `Database already exists` from the builder | Intended guard — pass `--reset` to rebuild. | +| `database is locked` | Another process is using the DB; ensure only one downloader job runs. The code already sets a 120 s busy timeout. | +| Builder runs out of memory | `multimedia.txt` is large; request more memory (e.g. a larger `-pe omp` slot count). | +| Legacy progress not imported | `--processed-file` was not pointed at ljhao's `processed_ids.txt`. 
| diff --git a/README.md b/README.md index 9a7e11c..2736324 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,56 @@ This directory contains utility scripts for managing herbarium specimen images, including downloading, processing, organizing, and labeling datasets. +> **Deploying the image downloader?** See [DEPLOYMENT.md](DEPLOYMENT.md) for the +> step-by-step procedure (build the status database once, then run/resume the +> download job). + ## Scripts Overview ### Image Download & Installation -#### `image_install_parallel.py` -**Purpose**: Primary script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. +#### `image_install_db.py` +**Purpose**: Current script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. Downloads **all** images per gbifID, each saved as `<gbifID>-NN.jpg`, with status tracked in a SQLite database. See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure. + +**Key Features**: +- Parallel downloading with ThreadPoolExecutor (5 workers) +- Host-based rate limiting and circuit breaker pattern +- IIIF (International Image Interoperability Framework) manifest support — one file saved per source URL, highest resolution first +- Automatic image resizing to 1024px max dimension +- Atomic downloads (stream to `.tmp`, length-check, then rename) so a dropped connection never leaves a corrupt file +- SQLite status database for resumable downloads and queryable, classified error tracking — see [`download_db.py`](download_db.py) +- Hierarchical directory organization (3-digit prefix structure) + +**Prerequisite**: build the status database once with `python init_download_db.py` before the first run.
+ +**Usage**: +```bash +python image_install_db.py [--db PATH] +``` + +**Configuration**: +- Input: `/projectnb/herbdl/data/GBIF-F25/multimedia.txt` (ingested once into the database) +- Output: `/projectnb/herbdl/data/GBIF-F25h/` +- Logs: `/projectnb/herbdl/logs/image_install_*.log` (WARNING level and above only — routine successes are recorded in the database, not the log) +- Status: `download_status.db` (default `/projectnb/herbdl/data/GBIF-F25h/download_status.db`) + +**Advanced Features**: +- Host cooldown on rate limiting (429 errors): 30 minutes default +- Host cooldown on timeouts: 60 minutes +- Circuit breaker: skips hosts after 500+ errors; state persists across runs in the `hosts` table +- Failures are classified (404, 401, timeout, rate-limited, dropped connection, …); only transient failures are retried, capped at 4 attempts +- Retry strategy with backoff for 500-level errors + +#### `image_install_db.sh` +**Purpose**: SCC job submission wrapper for `image_install_db.py`. + +**Usage**: +```bash +qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu image_install_db.sh +``` + +#### `image_install_parallel.py` (original) +**Purpose**: The original downloader, kept for reference and fallback — superseded by `image_install_db.py`. Downloads **one** image per gbifID (stops at the first URL that succeeds) and tracks progress in flat `processed_ids.txt` / `failed_ids.txt` files. 
**Key Features**: - Parallel downloading with ThreadPoolExecutor (5 workers) @@ -30,21 +74,59 @@ python image_install_parallel.py [-c COUNTRY_CODE] - Logs: `/projectnb/herbdl/logs/image_install_*.log` - Checkpoints: `processed_ids.txt`, `failed_ids.txt` -**Advanced Features**: -- Host cooldown on rate limiting (429 errors): 30 minutes default -- Host cooldown on timeouts: 60 minutes -- Circuit breaker: Permanently blocks hosts after 50+ errors -- Multiple URL fallback per GBIF ID -- Retry strategy with backoff for 500-level errors - #### `image_install.sh` -**Purpose**: SCC job submission wrapper for `image_install_parallel.py`. +**Purpose**: SCC job submission wrapper for the original `image_install_parallel.py`. **Usage**: ```bash qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu image_install.sh ``` +#### `download_db.py` +**Purpose**: SQLite-backed download-status tracking. Imported by the other download scripts — not run directly. + +**Why it exists**: replaces the flat `processed_ids.txt` / `failed_ids.txt` files, which recorded only an ID with no reason for failure. The database records, per image URL, whether it succeeded or failed and *why*, so failures are queryable and only transient ones get retried. + +**Tables**: +- `images` — one row per source image URL: `status`, `http_status`, `error_type`, `error_detail`, `file_path`, `file_size`, `attempts` +- `gbif_ids` — one row per gbifID; the resumable work queue (`pending` / `partial` / `done` / `failed`) +- `hosts` — per-host error tally and cooldown, so circuit-breaker state survives a restart + +#### `init_download_db.py` +**Purpose**: One-time builder for the status database. + +**What it does**: +1. Creates the schema +2. Ingests `multimedia.txt` into `images` + `gbif_ids` (so later runs never re-read the 59M-row file) +3. Imports `processed_ids.txt`: renames legacy `<gbifID>.jpg` files to `<gbifID>-00.jpg` for a consistent naming scheme and marks them done.
Multi-image gbifIDs are left `partial` so the downloader fetches their remaining images. (`failed_ids.txt` is **not** imported — those IDs get a fresh, tracked retry.) + +**Usage**: +```bash +python init_download_db.py # build DB + import legacy progress +python init_download_db.py --skip-legacy # build DB only +python init_download_db.py --reset # rebuild from scratch +``` + +#### `status_report.py` +**Purpose**: Report download progress directly from the database — replaces `analyze_image_progress.py`. Every figure is a single indexed SQL query, so it returns in seconds instead of loading ~180 MB of text and re-grouping `multimedia.txt`. + +**Reports**: gbifID and per-image progress, failures broken down by type (permanent vs retryable), retry-attempt distribution, worst hosts, and circuit-breaker state. Writes a timestamped `summary_YYYYMMDDHHMM.txt`. + +**Usage**: +```bash +python status_report.py [--db PATH] [--output-dir DIR] +``` + +Ad hoc queries against the database, e.g.: +```sql +-- count each kind of failure +SELECT error_type, COUNT(*) FROM images +WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; + +-- every URL still worth retrying +SELECT gbif_id, url FROM images WHERE status='failed_transient'; +``` + ### Image Processing #### `image_utils.py` @@ -205,17 +287,24 @@ from notifications import send_notification send_notification("Image Installation", "Downloaded 50,000 images") ``` -**Integration**: Used by `image_install_parallel.py` to send progress updates every 50,000 images. +**Integration**: Used by the image download scripts (`image_install_db.py` and `image_install_parallel.py`) to send progress updates every 50,000 images. ## Common Workflows ### 1. Download GBIF Images + +See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure. 
In brief: + ```bash -# Submit parallel download job -qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl image_install.sh +# One-time: build the status database (ingest multimedia.txt + import progress) +qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl init_download_db.sh -# Monitor progress in logs -tail -f /projectnb/herbdl/logs/image_install_*.log +# Submit the download job (re-run any time to resume — it reads the work +# queue from the database) +qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl image_install_db.sh + +# Check progress at any time +python status_report.py ``` ### 2. Organize Downloaded Images @@ -245,21 +334,23 @@ python link_check.py ## Directory Structures ### Hierarchical Image Storage -Images are organized by GBIF ID prefix for efficient filesystem access: +Images are organized by GBIF ID prefix for efficient filesystem access. Each +image for a gbifID is saved with a zero-padded index suffix (`-00`, `-01`, ...): ``` /projectnb/herbdl/data/GBIF-F25h/ -├── 000/ -│ ├── 000/ -│ │ ├── 000000.jpg -│ │ ├── 000001.jpg -│ ├── 001/ -│ │ ├── 000001000.jpg -├── 001/ +├── 105/ +│ ├── 716/ +│ │ ├── 1057161997-00.jpg +│ │ ├── 1057161997-01.jpg +│ ├── 717/ +│ │ ├── 1057170001-00.jpg +├── 106/ │ ├── 000/ │ ├── 001/ ``` -This structure prevents issues with directories containing millions of files. +`prefix1` is the first 3 digits of the gbifID, `prefix2` digits 4–6. This +structure prevents issues with directories containing millions of files. ## Dependencies @@ -278,5 +369,6 @@ This structure prevents issues with directories containing millions of files. 
- All scripts are designed for use on Boston University's Shared Computing Cluster (SCC) - Many scripts use parallel processing for performance -- Checkpoint files enable resumable operations after interruptions +- The `download_status.db` SQLite database enables resumable downloads and queryable error tracking; re-running the job simply continues the work queue +- `analyze_image_progress.py` and the `processed_ids.txt` / `failed_ids.txt` files are superseded by the database (`status_report.py`); kept only for historical reference - Always verify paths before running scripts to avoid data loss diff --git a/download_db.py b/download_db.py new file mode 100644 index 0000000..3540a93 --- /dev/null +++ b/download_db.py @@ -0,0 +1,293 @@ +""" +SQLite-backed download-status tracking for image_install_db.py. + +Replaces the flat processed_ids.txt / failed_ids.txt checkpoint files with a +queryable database that records, for every image URL, whether it succeeded or +failed and *why*. That makes it possible to: + * resume a run without re-reading and re-grouping the 59M-row multimedia.txt, + * retry only transient failures (timeouts, rate limits, 5xx, dropped + connections) while leaving permanent ones (404/410/etc.) alone, + * answer questions like "how many 404s?" or "which hosts fail most?" with a + single SQL query (see status_report.py). + +Tables +------ +images one row per source image URL (a GBIF "identifier"). +gbif_ids one row per gbifID; doubles as the resumable work queue. +hosts per-host error tally + cooldown timestamp, so circuit-breaker and + rate-limit state survive a job restart. + +A gbifID is 'done' only when every one of its images has status 'success'. +""" + +import os +import time +import sqlite3 +import threading + +DEFAULT_DB_PATH = "/projectnb/herbdl/data/GBIF-F25h/download_status.db" + +# Retry budget: a transient failure is retried until this many attempts. 
+MAX_ATTEMPTS = 4 + +# ---- images.status ----------------------------------------------------------- +ST_PENDING = "pending" # never attempted +ST_SUCCESS = "success" # downloaded (and resized) OK +ST_FAILED_PERMANENT = "failed_permanent" # retrying will not help +ST_FAILED_TRANSIENT = "failed_transient" # may succeed on a later run + +# ---- gbif_ids.status --------------------------------------------------------- +G_PENDING = "pending" # no image attempted yet +G_PARTIAL = "partial" # some work still possible (in the work queue) +G_DONE = "done" # every image succeeded +G_FAILED = "failed" # all images terminal, not all succeeded + +# ---- error_type values ------------------------------------------------------- +ERR_RATE_LIMITED = "rate_limited" # HTTP 429 +ERR_TIMEOUT = "timeout" # connect/read timeout, HTTP 408 +ERR_SERVER = "server_error" # HTTP 5xx +ERR_CONNECTION = "connection_broken" # dropped connection / IncompleteRead +ERR_TRUNCATED = "truncated" # download shorter than Content-Length +ERR_MANIFEST = "manifest_error" # IIIF manifest could not be parsed +ERR_INVALID_CONTENT = "invalid_content_type" # server returned HTML/XML/text +ERR_NOT_IMAGE = "not_an_image" # bytes downloaded but not decodable +ERR_NO_URL = "no_url" # no usable URL for this identifier +ERR_OTHER = "other" # anything uncategorised +ERR_LEGACY = "legacy_unverified_index" # marker on imported processed_ids.txt + +# Everything not in this set is treated as permanent (e.g. any "http_4xx"). 
+TRANSIENT_ERRORS = { + ERR_RATE_LIMITED, ERR_TIMEOUT, ERR_SERVER, + ERR_CONNECTION, ERR_TRUNCATED, ERR_MANIFEST, ERR_OTHER, +} + + +def http_error_type(code): + """Map an HTTP status code to an error_type string.""" + if code == 429: + return ERR_RATE_LIMITED + if code == 408: + return ERR_TIMEOUT + if 500 <= code <= 599: + return ERR_SERVER + return f"http_{code}" + + +def is_permanent(error_type): + """True if a failure of this type is not worth retrying.""" + return error_type not in TRANSIENT_ERRORS + + +def status_for_error(error_type): + """Pick the images.status value implied by an error_type.""" + return ST_FAILED_PERMANENT if is_permanent(error_type) else ST_FAILED_TRANSIENT + + +# ---- schema ------------------------------------------------------------------ + +_TABLES = [ + """CREATE TABLE IF NOT EXISTS images ( + gbif_id INTEGER NOT NULL, + img_index INTEGER NOT NULL, -- position in this ID's URL list + url TEXT NOT NULL, + host TEXT, + status TEXT NOT NULL DEFAULT 'pending', + http_status INTEGER, + error_type TEXT, + error_detail TEXT, -- truncated message, for debugging + file_path TEXT, + file_size INTEGER, -- bytes on disk after resize + attempts INTEGER NOT NULL DEFAULT 0, + last_attempt_at TEXT, + PRIMARY KEY (gbif_id, img_index) + )""", + """CREATE TABLE IF NOT EXISTS gbif_ids ( + gbif_id INTEGER PRIMARY KEY, + n_images INTEGER NOT NULL DEFAULT 0, + n_success INTEGER NOT NULL DEFAULT 0, + status TEXT NOT NULL DEFAULT 'pending', + completed_at TEXT + )""", + """CREATE TABLE IF NOT EXISTS hosts ( + host TEXT PRIMARY KEY, + error_count INTEGER NOT NULL DEFAULT 0, + blocked_until REAL -- epoch seconds; NULL when not blocked + )""", +] + +_INDEXES = [ + "CREATE INDEX IF NOT EXISTS idx_images_status ON images(status)", + "CREATE INDEX IF NOT EXISTS idx_images_host ON images(host)", + "CREATE INDEX IF NOT EXISTS idx_images_error ON images(error_type) " + "WHERE error_type IS NOT NULL", + "CREATE INDEX IF NOT EXISTS idx_gbif_status ON 
gbif_ids(status)", +] + + +def create_tables(conn): + for sql in _TABLES: + conn.execute(sql) + conn.commit() + + +def create_indexes(conn): + for sql in _INDEXES: + conn.execute(sql) + conn.commit() + + +def apply_schema(conn): + """Create tables and indexes if they do not already exist.""" + create_tables(conn) + create_indexes(conn) + + +# ---- runtime handle ---------------------------------------------------------- + +class DownloadDB: + """ + Thread-safe handle used by image_install_db.py during a run. + + One SQLite connection is shared by all worker threads and guarded by a + single lock. The downloads themselves take seconds each, so lock contention + on these short statements is negligible. WAL mode keeps writes durable + without blocking the occasional reader. + """ + + def __init__(self, db_path=DEFAULT_DB_PATH, max_attempts=MAX_ATTEMPTS): + self.path = db_path + self.max_attempts = max_attempts + self.conn = sqlite3.connect(db_path, check_same_thread=False, timeout=120) + self.conn.execute("PRAGMA journal_mode=WAL") + self.conn.execute("PRAGMA synchronous=NORMAL") + self.conn.execute("PRAGMA busy_timeout=120000") + apply_schema(self.conn) + self.lock = threading.Lock() + + def close(self): + with self.lock: + self.conn.commit() + self.conn.close() + + # -- work queue ------------------------------------------------------------ + + def get_work_gbif_ids(self): + """Return every gbifID that still has work to do, in ascending order.""" + with self.lock: + cur = self.conn.execute( + "SELECT gbif_id FROM gbif_ids WHERE status IN (?, ?) ORDER BY gbif_id", + (G_PENDING, G_PARTIAL), + ) + return [row[0] for row in cur.fetchall()] + + def get_images_for(self, gbif_id): + """Return (img_index, url, host, status, attempts) rows for one gbifID.""" + with self.lock: + cur = self.conn.execute( + "SELECT img_index, url, host, status, attempts " + "FROM images WHERE gbif_id=? 
ORDER BY img_index", + (gbif_id,), + ) + return cur.fetchall() + + # -- recording results ----------------------------------------------------- + + def record_image_result(self, gbif_id, img_index, status, *, host=None, + http_status=None, error_type=None, error_detail=None, + file_path=None, file_size=None, + increment_attempts=True): + """Write the outcome of one image attempt into the images table.""" + detail = (error_detail or "")[:500] or None + delta = 1 if increment_attempts else 0 + with self.lock: + self.conn.execute( + "UPDATE images SET " + " status=?, host=COALESCE(?, host), http_status=?, " + " error_type=?, error_detail=?, file_path=?, file_size=?, " + " attempts=attempts+?, last_attempt_at=datetime('now') " + "WHERE gbif_id=? AND img_index=?", + (status, host, http_status, error_type, detail, file_path, + file_size, delta, gbif_id, img_index), + ) + self.conn.commit() + + def finalize_gbif_id(self, gbif_id): + """ + Recompute and store a gbifID's rolled-up status from its image rows. + Returns the new status string. + """ + with self.lock: + rows = self.conn.execute( + "SELECT status, attempts FROM images WHERE gbif_id=?", + (gbif_id,), + ).fetchall() + if not rows: + return None + + n_success = sum(1 for s, _ in rows if s == ST_SUCCESS) + + def retryable(status, attempts): + if status == ST_PENDING: + return True + if status == ST_FAILED_TRANSIENT and attempts < self.max_attempts: + return True + return False + + if n_success == len(rows): + status = G_DONE + elif any(retryable(s, a) for s, a in rows): + status = G_PARTIAL + else: + status = G_FAILED + + self.conn.execute( + "UPDATE gbif_ids SET n_success=?, status=?, " + "completed_at=CASE WHEN ? IN (?, ?) 
THEN datetime('now') " + " ELSE completed_at END " + "WHERE gbif_id=?", + (n_success, status, status, G_DONE, G_FAILED, gbif_id), + ) + self.conn.commit() + return status + + # -- host circuit-breaker state ------------------------------------------- + + def load_host_state(self): + """Return (error_counts, blocked_until) dicts to seed the in-memory state.""" + now = time.time() + with self.lock: + cur = self.conn.execute( + "SELECT host, error_count, blocked_until FROM hosts" + ) + error_counts, blocked_until = {}, {} + for host, count, until in cur.fetchall(): + if count: + error_counts[host] = count + if until and until > now: + blocked_until[host] = until + return error_counts, blocked_until + + def save_host_state(self, error_counts, blocked_until): + """Persist the in-memory circuit-breaker dicts so they survive a restart.""" + hosts = set(error_counts) | set(blocked_until) + rows = [(h, error_counts.get(h, 0), blocked_until.get(h)) for h in hosts] + if not rows: + return + with self.lock: + self.conn.executemany( + "INSERT INTO hosts(host, error_count, blocked_until) VALUES(?,?,?) " + "ON CONFLICT(host) DO UPDATE SET " + " error_count=excluded.error_count, " + " blocked_until=excluded.blocked_until", + rows, + ) + self.conn.commit() + + # -- reporting helpers ----------------------------------------------------- + + def gbif_status_counts(self): + """Return {status: count} over the gbif_ids table.""" + with self.lock: + return dict(self.conn.execute( + "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status" + ).fetchall()) diff --git a/image_install_db.py b/image_install_db.py new file mode 100644 index 0000000..503a0a5 --- /dev/null +++ b/image_install_db.py @@ -0,0 +1,546 @@ +""" +Image install script: download herbarium specimen images from a GBIF +multimedia.txt file. + +Downloads ALL images for each gbifID. Each source URL (a GBIF "identifier") is +saved as one file with an index suffix: -00.jpg, -01.jpg, ... 
+A gbifID is marked 'done' only once every one of its images has succeeded. + +Status tracking +--------------- +Per-image and per-gbifID status lives in a SQLite database (download_status.db, +see download_db.py) instead of the old processed_ids.txt / failed_ids.txt flat +files. Build the database once with init_download_db.py before the first run. + +The database lets the script: + * resume without re-reading the 59M-row multimedia.txt every run, + * retry only transient failures (timeout / rate-limit / 5xx / dropped + connection), capped at MAX_ATTEMPTS, and never re-hammer permanent 404s, + * record *why* each download failed so failures are queryable afterwards + (see status_report.py). + +Accurate as of May 2026. +""" + +import os +import time +import random +import logging +import threading +import datetime as dt +from argparse import ArgumentParser +from urllib.parse import urlparse +from concurrent.futures import ThreadPoolExecutor, as_completed + +import urllib3 +import requests as req +from requests.exceptions import ConnectTimeout, ReadTimeout, Timeout, ConnectionError +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +from PIL import UnidentifiedImageError + +from notifications import send_notification +from image_utils import get_file_size_in_mb, resize_with_aspect_ratio +import download_db as ddb +from download_db import DownloadDB + +# verify=False is needed because many herbarium hosts have broken TLS certs. +# Suppress the resulting per-request warning so it does not flood the .e log +# (it previously produced ~134 MB of InsecureRequestWarning spam per run). 
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +# ---- configuration ----------------------------------------------------------- + +INSTALL_PATH = "/projectnb/herbdl/data/GBIF-F25h" +LOG_DIR = "/projectnb/herbdl/logs" + +MAX_WORKERS = 5 +WORK_CHUNK = 20_000 # gbifIDs submitted to the pool at a time +MIN_IMAGE_MB = 0.01 # files smaller than this are treated as invalid + +HOST_COOLDOWN_DEFAULT = 30 * 60 +HOST_COOLDOWN_TIMEOUT = 60 * 60 +HOST_ERROR_THRESHOLD = 500 # circuit breaker: skip a host after this many errors + +# ---- in-memory host circuit-breaker state (seeded from / saved to the DB) ---- + +host_block_until = {} +host_error_counts = {} +host_lock = threading.Lock() +circuit_breaker_lock = threading.Lock() +counter_lock = threading.Lock() + +n_installed = 0 + +user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0", +] + +session = req.Session() +retry_strategy = Retry( + total=2, + backoff_factor=1, + status_forcelist=[500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "OPTIONS"], +) +adapter = HTTPAdapter(max_retries=retry_strategy) +session.mount("http://", adapter) +session.mount("https://", adapter) + +logger = logging.getLogger(__name__) + + +# ---- paths ------------------------------------------------------------------- + +def get_hierarchical_path(base_dir, gbif_id, suffix, ext=".jpg"): + """ + Build a hierarchical storage path to avoid millions of files in one dir. + suffix: image index suffix, e.g. '-00', '-01'. 
+ Example: gbifID=1057161997, suffix='-00' -> /105/716/1057161997-00.jpg + """ + stem = str(gbif_id) + prefix1 = stem[:3] if len(stem) >= 3 else stem + prefix2 = stem[3:6] if len(stem) >= 6 else "000" + dest_dir = os.path.join(base_dir, prefix1, prefix2) + os.makedirs(dest_dir, exist_ok=True) + return os.path.join(dest_dir, f"{stem}{suffix}{ext}") + + +# ---- host circuit breaker / cooldown ----------------------------------------- + +def _host_from_url(url): + return urlparse(url).netloc.split(":")[0] + + +def is_host_blocked(url): + host = _host_from_url(url) + now = time.time() + with host_lock: + until = host_block_until.get(host) + if until and now < until: + return True + if until and now >= until: + del host_block_until[host] + return False + + +def is_host_circuit_broken(url): + host = _host_from_url(url) + with circuit_breaker_lock: + return host_error_counts.get(host, 0) >= HOST_ERROR_THRESHOLD + + +def increment_host_errors(url, is_rate_limit=False): + # Rate limiting is handled by a timed cooldown, not the permanent breaker. 
+ if is_rate_limit: + return + host = _host_from_url(url) + with circuit_breaker_lock: + host_error_counts[host] = host_error_counts.get(host, 0) + 1 + count = host_error_counts[host] + if count == HOST_ERROR_THRESHOLD: + logger.error(f"CIRCUIT BREAKER: host '{host}' reached " + f"{HOST_ERROR_THRESHOLD} errors; skipping it from now on.") + + +def block_host(url, retry_after=None, timeout_issue=False): + host = _host_from_url(url) + now = time.time() + seconds = HOST_COOLDOWN_TIMEOUT if timeout_issue else HOST_COOLDOWN_DEFAULT + if retry_after and not timeout_issue: + try: + seconds = int(retry_after) + except (TypeError, ValueError): + try: + from email.utils import parsedate_to_datetime + dt_retry = parsedate_to_datetime(retry_after) + seconds = max(0, (dt_retry - dt.datetime.now(dt.timezone.utc)) + .total_seconds()) + except Exception: + seconds = HOST_COOLDOWN_DEFAULT + with host_lock: + host_block_until[host] = now + seconds + reason = "timeout issues" if timeout_issue else "rate limiting" + logger.warning(f"Blocking host '{host}' due to {reason} for ~{int(seconds)}s.") + + +# ---- IIIF manifests ---------------------------------------------------------- + +def extract_image_from_iiif_manifest(manifest_url, gbif_id): + """ + Fetch a IIIF manifest and return (image_urls, error_type). + + image_urls is an ordered list of direct image URLs (highest resolution + first). On failure image_urls is empty and error_type explains why, so the + caller can decide whether the manifest is worth retrying. 
+ """ + try: + response = session.get( + manifest_url, + headers={"User-Agent": random.choice(user_agents), + "Accept": "application/json"}, + timeout=120, + ) + if response.status_code != 200: + logger.warning(f"IIIF manifest {gbif_id}: HTTP {response.status_code}") + return [], ddb.http_error_type(response.status_code) + + manifest = response.json() + image_urls = [] + for item in manifest.get("items", []): + if item.get("type") != "Canvas": + continue + for anno_page in item.get("items", []): + if anno_page.get("type") != "AnnotationPage": + continue + for anno in anno_page.get("items", []): + body = anno.get("body") + if not isinstance(body, dict): + continue + for service in body.get("service", []): + base_url = service.get("id") + if base_url: + # Highest resolution first; caller stops at the + # first that succeeds, so only one file is saved. + image_urls.append(f"{base_url}/full/1600,/0/default.jpg") + image_urls.append(f"{base_url}/full/1200,/0/default.jpg") + image_urls.append(f"{base_url}/full/800,/0/default.jpg") + + if not image_urls: + return [], ddb.ERR_MANIFEST + return image_urls, None + + except (ConnectTimeout, ReadTimeout, Timeout) as e: + logger.warning(f"IIIF manifest {gbif_id}: timeout {e}") + return [], ddb.ERR_TIMEOUT + except Exception as e: + logger.warning(f"IIIF manifest {gbif_id}: parse error {e}") + return [], ddb.ERR_MANIFEST + + +# ---- downloading ------------------------------------------------------------- + +def _rm(path): + try: + os.remove(path) + except OSError: + pass + + +def download_one_url(gbif_id, image_url, local_path): + """ + Download a single URL to local_path, atomically. + + Bytes are streamed to a .tmp file, length-checked against Content-Length, + then renamed into place -- so a dropped connection never leaves a corrupt + file behind. Returns a result dict with keys: ok, size, http_status, + error_type, error_detail, host. 
+ """ + host = _host_from_url(image_url) + tmp_path = local_path + ".tmp" + + def fail(error_type, detail, http_status=None): + return {"ok": False, "size": None, "http_status": http_status, + "error_type": error_type, "error_detail": detail, "host": host} + + try: + time.sleep(random.uniform(0.2, 0.8)) + with session.get( + image_url, + stream=True, + verify=False, + headers={ + "User-Agent": random.choice(user_agents), + "Connection": "keep-alive", + "Referer": "https://scc-ondemand1.bu.edu/", + }, + timeout=180, + ) as resp: + status = resp.status_code + + if status == 429: + increment_host_errors(image_url, is_rate_limit=True) + block_host(image_url, resp.headers.get("Retry-After")) + return fail(ddb.ERR_RATE_LIMITED, "HTTP 429", status) + + if status != 200: + increment_host_errors(image_url) + return fail(ddb.http_error_type(status), f"HTTP {status}", status) + + ctype = (resp.headers.get("Content-Type") or "").lower() + if ctype and any(bad in ctype for bad in + ("text/html", "text/plain", "application/xml")): + increment_host_errors(image_url) + return fail(ddb.ERR_INVALID_CONTENT, f"Content-Type: {ctype}", status) + + expected = resp.headers.get("Content-Length") + written = 0 + with open(tmp_path, "wb") as out: + for chunk in resp.iter_content(chunk_size=65536): + if chunk: + out.write(chunk) + written += len(chunk) + + if expected is not None: + try: + if int(expected) != written: + _rm(tmp_path) + return fail(ddb.ERR_TRUNCATED, + f"expected {expected} bytes, got {written}", + status) + except ValueError: + pass + + if written < 1024: + _rm(tmp_path) + return fail(ddb.ERR_TRUNCATED, f"only {written} bytes", status) + + os.replace(tmp_path, local_path) + return {"ok": True, "size": written, "http_status": status, + "error_type": None, "error_detail": None, "host": host} + + except (ConnectTimeout, ReadTimeout, Timeout) as e: + _rm(tmp_path) + block_host(image_url, timeout_issue=True) + return fail(ddb.ERR_TIMEOUT, str(e)) + except (ConnectionError, 
req.exceptions.ChunkedEncodingError) as e: + # ChunkedEncodingError covers IncompleteRead -- a connection dropped + # mid-download, which leaves only a partial .tmp file. + _rm(tmp_path) + increment_host_errors(image_url) + return fail(ddb.ERR_CONNECTION, str(e)) + except Exception as e: + _rm(tmp_path) + increment_host_errors(image_url) + return fail(ddb.ERR_OTHER, str(e)) + + +def resize_image(gbif_id, local_path): + changed, new_size = resize_with_aspect_ratio( + local_path, local_path, max_size=1024, format="JPEG", quality=85) + if changed: + logger.info(f"Resized {gbif_id} to {new_size} at {local_path}") + + +def resolve_and_download(gbif_id, identifier_url, local_path): + """ + Download the image for one source identifier (one img_index) and save it as + exactly one file at local_path. + + For a plain URL there is one candidate. For a IIIF manifest the manifest is + expanded into resolution variants and tried highest-first; the first success + wins, so still only one file is saved per identifier. + + Returns a result dict with keys: outcome ('success' | 'failed' | + 'deferred'), db_status, http_status, error_type, error_detail, host, + file_size. 'deferred' means every candidate host was blocked/circuit-broken, + so the image was not really attempted and should stay 'pending'. + """ + if "/manifest" in identifier_url or identifier_url.endswith(".json"): + candidates, manifest_err = extract_image_from_iiif_manifest( + identifier_url, gbif_id) + if not candidates: + return {"outcome": "failed", + "db_status": ddb.status_for_error(manifest_err), + "http_status": None, "error_type": manifest_err, + "error_detail": "IIIF manifest yielded no image URLs", + "host": _host_from_url(identifier_url), "file_size": None} + else: + candidates = [identifier_url] + + # Deduplicate while preserving the highest-resolution-first order. 
+ seen, ordered = set(), [] + for url in candidates: + if url not in seen: + seen.add(url) + ordered.append(url) + + failures = [] + attempted_any = False + for url in ordered: + if is_host_circuit_broken(url) or is_host_blocked(url): + continue + attempted_any = True + result = download_one_url(gbif_id, url, local_path) + if result["ok"]: + try: + resize_image(gbif_id, local_path) + except (OSError, UnidentifiedImageError) as e: + _rm(local_path) + return {"outcome": "failed", "db_status": ddb.ST_FAILED_PERMANENT, + "http_status": result["http_status"], + "error_type": ddb.ERR_NOT_IMAGE, + "error_detail": str(e), "host": result["host"], + "file_size": None} + try: + size = os.path.getsize(local_path) + except OSError: + size = result["size"] + return {"outcome": "success", "db_status": ddb.ST_SUCCESS, + "http_status": 200, "error_type": None, + "error_detail": None, "host": result["host"], + "file_size": size} + failures.append(result) + + if not attempted_any: + # Every candidate's host was blocked -- leave the image 'pending'. + return {"outcome": "deferred"} + + # Prefer a transient failure as the recorded reason: if any candidate could + # still succeed later, the whole identifier is worth retrying. 
+ transient = [f for f in failures if not ddb.is_permanent(f["error_type"])] + chosen = transient[0] if transient else failures[0] + db_status = ddb.ST_FAILED_TRANSIENT if transient else ddb.ST_FAILED_PERMANENT + return {"outcome": "failed", "db_status": db_status, + "http_status": chosen["http_status"], + "error_type": chosen["error_type"], + "error_detail": chosen["error_detail"], + "host": chosen["host"], "file_size": None} + + +# ---- per-gbifID processing --------------------------------------------------- + +def process_id(db, gbif_id, total_to_install): + """Download every not-yet-done image for one gbifID and update the DB.""" + global n_installed + images = db.get_images_for(gbif_id) + + for img_index, url, _host, status, attempts in images: + # Skip images that are already finished or have exhausted their retries. + if status == ddb.ST_SUCCESS: + continue + if status == ddb.ST_FAILED_PERMANENT: + continue + if status == ddb.ST_FAILED_TRANSIENT and attempts >= db.max_attempts: + continue + + suffix = f"-{img_index:02d}" + local_path = get_hierarchical_path(INSTALL_PATH, gbif_id, suffix) + + # If a valid file is already on disk, record it without downloading. 
+ if os.path.exists(local_path): + try: + size_mb = get_file_size_in_mb(local_path) + except OSError: + size_mb = 0.0 + if size_mb >= MIN_IMAGE_MB: + db.record_image_result( + gbif_id, img_index, ddb.ST_SUCCESS, + host=_host_from_url(url), http_status=200, + file_path=local_path, file_size=int(size_mb * 1024 * 1024), + increment_attempts=False) + continue + + result = resolve_and_download(gbif_id, url, local_path) + if result["outcome"] == "deferred": + continue # host blocked; leave 'pending' for a later run + + db.record_image_result( + gbif_id, img_index, result["db_status"], + host=result.get("host"), http_status=result.get("http_status"), + error_type=result.get("error_type"), + error_detail=result.get("error_detail"), + file_path=local_path if result["outcome"] == "success" else None, + file_size=result.get("file_size")) + + if result["outcome"] == "success": + with counter_lock: + n_installed += 1 + current = n_installed + if current % 50000 == 0: + send_notification( + "Image Installation", + f"Installed {current} images this run " + f"(work queue: {total_to_install} gbifIDs).") + logger.warning(f"Installed {current} images this run.") + + db.finalize_gbif_id(gbif_id) + + +# ---- main -------------------------------------------------------------------- + +def main(): + parser = ArgumentParser(description=__doc__) + parser.add_argument("-c", "--country", dest="country", + help="(Unsupported) country filter -- multimedia.txt " + "has no countryCode column; ignored.") + parser.add_argument("--db", default=ddb.DEFAULT_DB_PATH, + help=f"Status database path (default: {ddb.DEFAULT_DB_PATH})") + args = parser.parse_args() + + if args.country: + print("WARNING: -c/--country is ignored; the work queue comes from the " + "database and multimedia.txt has no countryCode column.") + + today = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + os.makedirs(LOG_DIR, exist_ok=True) + # File log captures WARNING and above only. 
Routine per-image successes are + # recorded in the database, not the log -- this keeps the log from growing + # to the ~1.4 GB seen with INFO-level logging. + logging.basicConfig(filename=f"{LOG_DIR}/image_install_{today}.log", + level=logging.WARNING, filemode="w", + format="%(asctime)s %(levelname)s %(message)s") + + if not os.path.exists(args.db): + raise SystemExit( + f"Status database not found: {args.db}\n" + f"Build it once first: python init_download_db.py") + + db = DownloadDB(args.db) + + # Seed the in-memory circuit breaker from the last run's host stats. + saved_errors, saved_blocks = db.load_host_state() + host_error_counts.update(saved_errors) + host_block_until.update(saved_blocks) + print(f"Loaded host state: {len(saved_errors)} hosts with errors, " + f"{len(saved_blocks)} currently blocked.") + + work = db.get_work_gbif_ids() + total_to_install = len(work) + print(f"gbifIDs with work to do: {total_to_install}") + if total_to_install == 0: + print("Nothing to download. All gbifIDs are 'done' or terminally 'failed'.") + db.close() + return + + send_notification("Image Installation", + f"Starting run: {total_to_install} gbifIDs to process.") + + try: + with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + for start in range(0, total_to_install, WORK_CHUNK): + chunk = work[start:start + WORK_CHUNK] + futures = [executor.submit(process_id, db, gid, total_to_install) + for gid in chunk] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logger.error(f"Worker error: {e}") + # Persist circuit-breaker state periodically so a killed job + # (e.g. qsub h_rt limit) does not lose it. 
+ db.save_host_state(host_error_counts, host_block_until) + print(f" processed {min(start + WORK_CHUNK, total_to_install)}" + f"/{total_to_install} gbifIDs", end="\r") + except KeyboardInterrupt: + logger.warning("Interrupted by user; saving state and exiting.") + finally: + db.save_host_state(host_error_counts, host_block_until) + counts = db.gbif_status_counts() + broken = sum(1 for c in host_error_counts.values() + if c >= HOST_ERROR_THRESHOLD) + logger.warning(f"Run finished. Images installed this run: {n_installed}. " + f"gbifID status: {counts}. Circuit-broken hosts: {broken}.") + for host, count in sorted(host_error_counts.items(), + key=lambda x: x[1], reverse=True)[:10]: + logger.warning(f" host errors: {host}: {count}") + db.close() + + print(f"\nDone. Images installed this run: {n_installed}") + print(f"gbifID status: {counts}") + + +if __name__ == "__main__": + main() diff --git a/image_install_db.sh b/image_install_db.sh new file mode 100755 index 0000000..43b426b --- /dev/null +++ b/image_install_db.sh @@ -0,0 +1,17 @@ +#!/bin/bash -l + +# Run the SQLite-tracked image downloader (image_install_db.py). +# +# Prerequisite: build the status database once with init_download_db.sh. +# This job is resumable -- re-submit it any time and it continues from the +# work queue stored in download_status.db. + +module load miniconda +module load academic-ml/spring-2026 + +conda activate spring-2026-pyt + +python image_install_db.py + +### The command below is used to submit the job to the cluster +### qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu image_install_db.sh diff --git a/init_download_db.py b/init_download_db.py new file mode 100644 index 0000000..65b23f2 --- /dev/null +++ b/init_download_db.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +One-time builder for the image-download status database (download_status.db). + +What it does +------------ +1. Creates the SQLite schema (see download_db.py). +2. 
Reads multimedia.txt once and loads every (gbifID, image URL) pair into the + `images` table and every gbifID into `gbif_ids`. After this, runs of + image_install_db.py no longer need to re-read and re-group the 59M-row + multimedia.txt -- the work queue lives in the database. +3. Imports processed_ids.txt: for each already-finished gbifID it locates the + downloaded file, renames legacy `.jpg` to `-00.jpg` so the dataset + uses one consistent naming scheme, and marks image index 0 as 'success'. + gbifIDs with more than one image are left 'partial' so the multi-image + downloader goes back and fetches their remaining images. + + NOTE: the old one-image-per-ID downloader shuffled candidate URLs, so for a + multi-image gbifID we cannot know which URL the existing file came from. It + is recorded against img_index 0 with error_type 'legacy_unverified_index'. + ~87% of gbifIDs have only one image, where this assignment is exact. + +failed_ids.txt is intentionally NOT imported: those IDs stay 'pending' and get +a fresh, fully-tracked retry. + +This script is destructive-ish (it renames files and can drop an existing DB +with --reset). It does not download anything. Run it once before the first +run of image_install_db.py. 
+ +Usage +----- + python init_download_db.py # build DB + import legacy + python init_download_db.py --skip-legacy # build DB only + python init_download_db.py --reset # rebuild from scratch +""" + +import os +import sys +import time +import sqlite3 +import argparse + +import pandas as pd + +import download_db as ddb + +GBIF_MULTIMEDIA_DATA = "/projectnb/herbdl/data/GBIF-F25/multimedia.txt" +INSTALL_PATH = "/projectnb/herbdl/data/GBIF-F25h" +PROCESSED_FILE = "processed_ids.txt" + +INSERT_BATCH = 200_000 +LEGACY_BATCH = 50_000 + + +def hierarchical_path(base_dir, gbif_id, suffix=""): + """Mirror image_install_db.get_hierarchical_path (without makedirs).""" + stem = str(gbif_id) + prefix1 = stem[:3] if len(stem) >= 3 else stem + prefix2 = stem[3:6] if len(stem) >= 6 else "000" + return os.path.join(base_dir, prefix1, prefix2, f"{stem}{suffix}.jpg") + + +def parse_args(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--db", default=ddb.DEFAULT_DB_PATH, + help=f"Path to the SQLite database (default: {ddb.DEFAULT_DB_PATH})") + p.add_argument("--multimedia", default=GBIF_MULTIMEDIA_DATA, + help="GBIF multimedia.txt to ingest") + p.add_argument("--install-path", default=INSTALL_PATH, + help="Root directory where images are stored") + p.add_argument("--processed-file", default=PROCESSED_FILE, + help="processed_ids.txt to import as already-done gbifIDs") + p.add_argument("--skip-legacy", action="store_true", + help="Do not import processed_ids.txt") + p.add_argument("--reset", action="store_true", + help="Delete an existing database before building") + return p.parse_args() + + +def ingest_multimedia(conn, multimedia_path): + """Load every image URL from multimedia.txt into images + gbif_ids.""" + print(f"Reading {multimedia_path} ...") + df = pd.read_csv( + multimedia_path, + delimiter="\t", + usecols=lambda c: c in ("gbifID", "identifier"), + on_bad_lines="skip", + ) + df = 
df.dropna(subset=["gbifID", "identifier"]) + df["gbifID"] = df["gbifID"].astype("int64") + df["identifier"] = df["identifier"].astype("string") + print(f" {len(df):,} (gbifID, URL) rows") + + # Sort so each gbifID's rows are contiguous, then number them 0,1,2,... + df = df.sort_values("gbifID", kind="stable").reset_index(drop=True) + df["img_index"] = df.groupby("gbifID").cumcount() + df["host"] = ( + df["identifier"].str.extract(r"^[a-zA-Z][a-zA-Z0-9+.-]*://([^/:]+)", + expand=False) + .fillna("") + ) + + print(" Inserting image rows ...") + inserted = 0 + for start in range(0, len(df), INSERT_BATCH): + sub = df.iloc[start:start + INSERT_BATCH] + rows = list(zip( + sub["gbifID"].tolist(), + sub["img_index"].tolist(), + sub["identifier"].tolist(), + sub["host"].tolist(), + )) + conn.executemany( + "INSERT OR IGNORE INTO images(gbif_id, img_index, url, host) " + "VALUES(?,?,?,?)", + rows, + ) + conn.commit() + inserted += len(rows) + print(f" {inserted:,}/{len(df):,} image rows", end="\r") + print(f" {inserted:,} image rows inserted ") + + print(" Inserting gbifID rows ...") + sizes = df.groupby("gbifID").size() + gid_rows = list(zip(sizes.index.tolist(), sizes.tolist())) + for start in range(0, len(gid_rows), INSERT_BATCH): + conn.executemany( + "INSERT OR IGNORE INTO gbif_ids(gbif_id, n_images) VALUES(?,?)", + gid_rows[start:start + INSERT_BATCH], + ) + conn.commit() + print(f" {len(gid_rows):,} gbifIDs inserted") + + +def import_legacy(conn, processed_file, install_path): + """Mark gbifIDs from processed_ids.txt as already having their first image.""" + if not os.path.exists(processed_file): + print(f" {processed_file} not found -- skipping legacy import.") + return + + print(f"Importing already-processed gbifIDs from {processed_file} ...") + renamed = relabeled = missing = 0 + updates = [] + + def flush(batch): + if batch: + conn.executemany( + "UPDATE images SET status='success', " + " error_type=?, file_path=?, file_size=?, " + " 
last_attempt_at=datetime('now') " + "WHERE gbif_id=? AND img_index=0", + batch, + ) + conn.commit() + + with open(processed_file) as fh: + for line in fh: + gid = line.strip() + if not gid or not gid.isdigit(): + continue + + new_path = hierarchical_path(install_path, gid, "-00") + old_path = hierarchical_path(install_path, gid, "") + + if os.path.exists(new_path): + path = new_path + relabeled += 1 + elif os.path.exists(old_path): + try: + os.rename(old_path, new_path) + except OSError: + missing += 1 + continue + path = new_path + renamed += 1 + else: + missing += 1 + continue + + try: + size = os.path.getsize(path) + except OSError: + missing += 1 + continue + + updates.append((ddb.ERR_LEGACY, path, size, int(gid))) + if len(updates) >= LEGACY_BATCH: + flush(updates) + updates = [] + print(f" renamed={renamed:,} relabeled={relabeled:,} " + f"missing={missing:,}", end="\r") + flush(updates) + print(f" renamed={renamed:,} already-suffixed={relabeled:,} " + f"file-missing={missing:,}") + + # Roll the per-image success flags up into gbif_ids statuses in one pass. 
+ print(" Recomputing gbifID statuses ...") + conn.execute( + "UPDATE gbif_ids SET " + " n_success=(SELECT COUNT(*) FROM images i " + " WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success'), " + " status=CASE " + " WHEN n_images>0 AND n_images=(SELECT COUNT(*) FROM images i " + " WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success') " + " THEN 'done' " + " WHEN (SELECT COUNT(*) FROM images i " + " WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success')>0 " + " THEN 'partial' " + " ELSE 'pending' END " + "WHERE gbif_id IN (SELECT DISTINCT gbif_id FROM images " + " WHERE status='success')" + ) + conn.execute( + "UPDATE gbif_ids SET completed_at=datetime('now') " + "WHERE status='done' AND completed_at IS NULL" + ) + conn.commit() + + +def main(): + args = parse_args() + start = time.time() + + if os.path.exists(args.db): + if args.reset: + print(f"Removing existing database {args.db}") + for suffix in ("", "-wal", "-shm"): + try: + os.remove(args.db + suffix) + except FileNotFoundError: + pass + else: + sys.exit(f"Database already exists: {args.db}\n" + f"Pass --reset to rebuild it from scratch.") + + os.makedirs(os.path.dirname(os.path.abspath(args.db)), exist_ok=True) + conn = sqlite3.connect(args.db) + # Fast bulk-load settings; the DB is fully rebuildable, so durability during + # ingest is not needed. image_install_db.py switches it to WAL later. 
+ conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + conn.execute("PRAGMA cache_size=-200000") # ~200 MB page cache + + print("Creating schema ...") + ddb.create_tables(conn) + + ingest_multimedia(conn, args.multimedia) + + print("Building indexes (this takes a few minutes) ...") + ddb.create_indexes(conn) + + if not args.skip_legacy: + import_legacy(conn, args.processed_file, args.install_path) + + print("\nFinal gbifID status counts:") + for status, count in conn.execute( + "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status ORDER BY status" + ): + print(f" {status:10s} {count:,}") + + conn.close() + print(f"\nDone in {time.time() - start:.0f}s. Database: {os.path.abspath(args.db)}") + + +if __name__ == "__main__": + main() diff --git a/init_download_db.sh b/init_download_db.sh new file mode 100755 index 0000000..5c5278a --- /dev/null +++ b/init_download_db.sh @@ -0,0 +1,21 @@ +#!/bin/bash -l + +# One-time build of the image-download status database (download_status.db). +# +# This is heavy: it reads the ~59M-row multimedia.txt with pandas and renames +# up to ~13.5M already-downloaded files. Run it as a batch job, not on a login +# node. It only needs to be run once; after that, just (re-)submit +# image_install_db.sh to download and resume. + +module load miniconda +module load academic-ml/spring-2026 + +conda activate spring-2026-pyt + +# --processed-file points at the production processed_ids.txt, which lives in +# ljhao's working directory, not this repo.
+python init_download_db.py \ + --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt + +### The command below is used to submit the job to the cluster +### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh diff --git a/status_report.py b/status_report.py new file mode 100644 index 0000000..9c8e4b5 --- /dev/null +++ b/status_report.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Report image-download progress straight from the SQLite status database. + +This replaces analyze_image_progress.py: instead of loading ~180 MB of text +checkpoint files and re-grouping the 59M-row multimedia.txt with pandas, every +number here is a single indexed SQL query, so the report returns in seconds. + +The same numbers are available ad hoc -- a few useful queries: + + -- how many of each kind of failure? + SELECT error_type, COUNT(*) FROM images + WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; + + -- every URL still worth retrying + SELECT gbif_id, url FROM images WHERE status='failed_transient'; + + -- worst hosts + SELECT host, COUNT(*) FROM images WHERE status LIKE 'failed%' + GROUP BY host ORDER BY 2 DESC LIMIT 20; + +Usage: + python status_report.py [--db PATH] [--output-dir DIR] +""" + +import os +import sqlite3 +import argparse +from datetime import datetime + +import download_db as ddb + + +def parse_args(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--db", default=ddb.DEFAULT_DB_PATH, + help=f"Status database path (default: {ddb.DEFAULT_DB_PATH})") + p.add_argument("--output-dir", default=os.getcwd(), + help="Directory for the summary_YYYYMMDDHHMM.txt file") + return p.parse_args() + + +def main(): + args = parse_args() + if not os.path.exists(args.db): + raise SystemExit(f"Status database not found: {args.db}") + + conn = sqlite3.connect(args.db) + run_time = datetime.now() + 
output_file = os.path.join( + args.output_dir, f"summary_{run_time:%Y%m%d%H%M}.txt") + os.makedirs(args.output_dir, exist_ok=True) + + with open(output_file, "w") as out: + def write(msg=""): + out.write(msg + "\n") + print(msg) + + def section(title): + write() + write("=" * 70) + write(title) + write("=" * 70) + + write(f"Run date: {run_time:%Y-%m-%d %H:%M:%S}") + write(f"Database: {os.path.abspath(args.db)}") + + # -- gbifID progress -------------------------------------------------- + section("GBIFID PROGRESS") + gbif_counts = dict(conn.execute( + "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status").fetchall()) + total_ids = sum(gbif_counts.values()) + write(f"Total gbifIDs: {total_ids:,}") + for status in (ddb.G_DONE, ddb.G_PARTIAL, ddb.G_PENDING, ddb.G_FAILED): + count = gbif_counts.get(status, 0) + pct = (count / total_ids * 100) if total_ids else 0.0 + write(f" {status:10s} {count:>14,} ({pct:5.2f}%)") + remaining = gbif_counts.get(ddb.G_PENDING, 0) + gbif_counts.get(ddb.G_PARTIAL, 0) + write(f"Still in the work queue: {remaining:,}") + + # -- per-image progress ---------------------------------------------- + section("IMAGE (URL) PROGRESS") + img_counts = dict(conn.execute( + "SELECT status, COUNT(*) FROM images GROUP BY status").fetchall()) + total_imgs = sum(img_counts.values()) + write(f"Total image URLs: {total_imgs:,}") + for status in (ddb.ST_SUCCESS, ddb.ST_PENDING, + ddb.ST_FAILED_TRANSIENT, ddb.ST_FAILED_PERMANENT): + count = img_counts.get(status, 0) + pct = (count / total_imgs * 100) if total_imgs else 0.0 + write(f" {status:18s} {count:>14,} ({pct:5.2f}%)") + + # -- failure breakdown ------------------------------------------------ + section("FAILURES BY TYPE") + write(f"{'error_type':24s} {'count':>14s} {'verdict':s}") + write("-" * 60) + rows = conn.execute( + "SELECT error_type, COUNT(*) FROM images " + "WHERE status LIKE 'failed%' AND error_type IS NOT NULL " + "GROUP BY error_type ORDER BY 2 DESC").fetchall() + for error_type, count in 
rows: + verdict = "permanent" if ddb.is_permanent(error_type) else "retryable" + write(f"{error_type:24s} {count:>14,} {verdict}") + if not rows: + write("(no failures recorded yet)") + + # -- retry attempt distribution -------------------------------------- + section("RETRY ATTEMPTS (failed_transient images)") + rows = conn.execute( + "SELECT attempts, COUNT(*) FROM images " + "WHERE status='failed_transient' GROUP BY attempts ORDER BY attempts" + ).fetchall() + for attempts, count in rows: + note = " <- retry budget exhausted" if attempts >= ddb.MAX_ATTEMPTS else "" + write(f" {attempts} attempt(s): {count:,}{note}") + if not rows: + write("(none)") + + # -- worst hosts ------------------------------------------------------ + section("TOP 20 HOSTS BY FAILED IMAGES") + write(f"{'host':40s} {'failed':>10s} {'success':>10s}") + write("-" * 64) + rows = conn.execute( + "SELECT host, " + " SUM(CASE WHEN status LIKE 'failed%' THEN 1 ELSE 0 END) AS failed, " + " SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) AS ok " + "FROM images WHERE host IS NOT NULL AND host != '' " + "GROUP BY host ORDER BY failed DESC LIMIT 20").fetchall() + for host, failed, ok in rows: + write(f"{host[:40]:40s} {failed or 0:>10,} {ok or 0:>10,}") + + # -- circuit-breaker state ------------------------------------------- + section("CIRCUIT BREAKER / COOLDOWNS") + broken = conn.execute( + "SELECT COUNT(*) FROM hosts WHERE error_count >= 500").fetchone()[0] + blocked = conn.execute( + "SELECT COUNT(*) FROM hosts " + "WHERE blocked_until IS NOT NULL " + "AND blocked_until > strftime('%s','now')").fetchone()[0] + write(f"Hosts past the circuit-breaker threshold (500 errors): {broken:,}") + write(f"Hosts currently in cooldown: {blocked:,}") + + section("NOTES") + write("- 'done' = every image URL for the gbifID succeeded.") + write("- 'partial' = still has retryable work; stays in the queue.") + write("- 'failed' = all images terminal, not all succeeded; no retries left.") + write("- 
failed_transient images are retried until " + f"{ddb.MAX_ATTEMPTS} attempts, then count toward 'failed'.") + write() + write(f"Summary written to: {os.path.abspath(output_file)}") + + conn.close() + + +if __name__ == "__main__": + main() From 0b59c51c677d2333a750ef8d851dcf67e1bfb881 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 09:38:19 -0400 Subject: [PATCH 03/19] ignore generated summary files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a230a78..b85c0e6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ .venv/ __pycache__/ + +# ignore generated summary files +summary*.txt From 8f3e299dc080463267497aa7c27273cb19678ce7 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 13:37:07 -0400 Subject: [PATCH 04/19] ignore batch scheduler log files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index b85c0e6..68a21d2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ __pycache__/ # ignore generated summary files summary*.txt + +# ignore batch scheduler log files +*.e[0-9]* +*.o[0-9]* From 8a9d326ab80fb4f7a396b0217bdc3f27c4d46f88 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 13:39:24 -0400 Subject: [PATCH 05/19] add legacy-only mode to finish reading processed IDs, and allow concurrent read-only access to db --- DEPLOYMENT.md | 13 ++++-- README.md | 1 + init_download_db.py | 104 +++++++++++++++++++++++++++++++++++--------- init_download_db.sh | 4 +- 4 files changed, 96 insertions(+), 26 deletions(-) diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index e487545..d7ad4c8 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -75,6 +75,7 @@ python init_download_db.py \ |---|---| | `python init_download_db.py` | Build DB + import legacy progress | | `python init_download_db.py 
--skip-legacy` | Build DB only (everything starts `pending`) | +| `python init_download_db.py --legacy-only` | Skip the ingest; only (re-)run the legacy import on an existing DB | | `python init_download_db.py --reset` | Delete an existing DB and rebuild from scratch | **Expected output** — a status breakdown, e.g.: @@ -90,8 +91,11 @@ Final gbifID status counts: - `partial` — has an image already (legacy first image) but more to fetch - `pending` — never attempted -Re-running the builder is safe: file renames are idempotent (already-renamed -files are detected and reused). If a build fails partway, re-run with `--reset`. +Re-running is safe: file renames and database updates are idempotent +(already-renamed files are detected and reused). If the **ingest** fails partway, +re-run with `--reset`. If only the **legacy import** fails partway (e.g. it was +interrupted), re-run with `--legacy-only` — that finishes the import without +redoing the hour-long ingest. --- @@ -206,7 +210,8 @@ must be re-run. | Symptom | Fix | |---|---| | `Status database not found` | Run Phase 1 first (`init_download_db.sh`). | -| `Database already exists` from the builder | Intended guard — pass `--reset` to rebuild. | -| `database is locked` | Another process is using the DB; ensure only one downloader job runs. The code already sets a 120 s busy timeout. | +| `Database already exists` from the builder | Intended guard — `--reset` to rebuild, or `--legacy-only` to just (re-)run the legacy import. | +| `database is locked` | The builder now uses WAL mode (readers do not block the writer) and a 120 s busy timeout, so this should not recur. If the legacy import was interrupted by it, finish it with `init_download_db.py --legacy-only`. Still avoid running two writers against one DB. | +| Legacy import interrupted partway | Re-run `init_download_db.py --legacy-only` — it is idempotent and skips the hour-long ingest. 
| | Builder runs out of memory | `multimedia.txt` is large; request more memory (e.g. a larger `-pe omp` slot count). | | Legacy progress not imported | `--processed-file` was not pointed at ljhao's `processed_ids.txt`. | diff --git a/README.md b/README.md index 2736324..930339a 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_emai ```bash python init_download_db.py # build DB + import legacy progress python init_download_db.py --skip-legacy # build DB only +python init_download_db.py --legacy-only # (re-)run only the legacy import python init_download_db.py --reset # rebuild from scratch ``` diff --git a/init_download_db.py b/init_download_db.py index 65b23f2..ca03bf9 100644 --- a/init_download_db.py +++ b/init_download_db.py @@ -27,10 +27,14 @@ with --reset). It does not download anything. Run it once before the first run of image_install_db.py. +The legacy import is idempotent and resumable: if it is interrupted, re-run +with --legacy-only to finish it without redoing the multimedia ingest. 
+ Usage ----- python init_download_db.py # build DB + import legacy python init_download_db.py --skip-legacy # build DB only + python init_download_db.py --legacy-only # (re-)run only the legacy import python init_download_db.py --reset # rebuild from scratch """ @@ -73,6 +77,9 @@ def parse_args(): help="processed_ids.txt to import as already-done gbifIDs") p.add_argument("--skip-legacy", action="store_true", help="Do not import processed_ids.txt") + p.add_argument("--legacy-only", action="store_true", + help="Skip the ingest; only (re-)run the legacy import on an " + "existing database (use this to finish an interrupted import)") p.add_argument("--reset", action="store_true", help="Delete an existing database before building") return p.parse_args() @@ -144,15 +151,28 @@ def import_legacy(conn, processed_file, install_path): updates = [] def flush(batch): - if batch: - conn.executemany( - "UPDATE images SET status='success', " - " error_type=?, file_path=?, file_size=?, " - " last_attempt_at=datetime('now') " - "WHERE gbif_id=? AND img_index=0", - batch, - ) - conn.commit() + if not batch: + return + # WAL mode + the 120 s busy timeout make a lock here very unlikely, but + # retry rather than throw away a long-running import if one occurs. + for attempt in range(1, 4): + try: + conn.executemany( + "UPDATE images SET status='success', " + " error_type=?, file_path=?, file_size=?, " + " last_attempt_at=datetime('now') " + "WHERE gbif_id=? 
AND img_index=0", + batch, + ) + conn.commit() + return + except sqlite3.OperationalError as e: + if "locked" in str(e).lower() and attempt < 3: + print(f"\n database locked; retry {attempt}/3 " + f"in {10 * attempt}s ...") + time.sleep(10 * attempt) + continue + raise with open(processed_file) as fh: for line in fh: @@ -218,10 +238,55 @@ def flush(batch): conn.commit() +def report_status(conn): + """Print the gbifID status breakdown.""" + print("\nFinal gbifID status counts:") + for status, count in conn.execute( + "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status ORDER BY status" + ): + print(f" {status:10s} {count:,}") + + +def connect(db_path, bulk_load): + """ + Open the database with a 120 s busy timeout, so a momentary lock from a + concurrent reader (e.g. status_report.py) makes the write wait rather than + abort the run. + + bulk_load=True -> fastest, no durability (for the rebuildable ingest). + bulk_load=False -> WAL + synchronous=NORMAL: durable, and readers never + block the writer (used for the legacy import). + """ + conn = sqlite3.connect(db_path, timeout=120) + conn.execute("PRAGMA busy_timeout=120000") + conn.execute("PRAGMA cache_size=-200000") # ~200 MB page cache + if bulk_load: + conn.execute("PRAGMA journal_mode=OFF") + conn.execute("PRAGMA synchronous=OFF") + else: + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA synchronous=NORMAL") + return conn + + def main(): args = parse_args() start = time.time() + # --legacy-only: skip the ingest and just (re-)run the legacy import. Use + # this to finish an interrupted import without rebuilding the database. 
+ if args.legacy_only: + if not os.path.exists(args.db): + sys.exit(f"--legacy-only needs an existing database, but none was " + f"found at: {args.db}\nRun the full build first.") + print(f"--legacy-only: (re-)running the legacy import on {args.db}") + conn = connect(args.db, bulk_load=False) + import_legacy(conn, args.processed_file, args.install_path) + report_status(conn) + conn.close() + print(f"\nDone in {time.time() - start:.0f}s.") + return + if os.path.exists(args.db): if args.reset: print(f"Removing existing database {args.db}") @@ -232,15 +297,13 @@ def main(): pass else: sys.exit(f"Database already exists: {args.db}\n" - f"Pass --reset to rebuild it from scratch.") + f" --reset rebuild it from scratch\n" + f" --legacy-only (re-)run just the legacy import on it") os.makedirs(os.path.dirname(os.path.abspath(args.db)), exist_ok=True) - conn = sqlite3.connect(args.db) # Fast bulk-load settings; the DB is fully rebuildable, so durability during - # ingest is not needed. image_install_db.py switches it to WAL later. - conn.execute("PRAGMA journal_mode=OFF") - conn.execute("PRAGMA synchronous=OFF") - conn.execute("PRAGMA cache_size=-200000") # ~200 MB page cache + # the ingest is not needed. + conn = connect(args.db, bulk_load=True) print("Creating schema ...") ddb.create_tables(conn) @@ -251,14 +314,13 @@ def main(): ddb.create_indexes(conn) if not args.skip_legacy: + # Switch to a durable, reader-tolerant mode for the legacy import: it + # renames files on disk, so a crash here is costlier than during ingest. + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA synchronous=NORMAL") import_legacy(conn, args.processed_file, args.install_path) - print("\nFinal gbifID status counts:") - for status, count in conn.execute( - "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status ORDER BY status" - ): - print(f" {status:10s} {count:,}") - + report_status(conn) conn.close() print(f"\nDone in {time.time() - start:.0f}s. 
Database: {os.path.abspath(args.db)}") diff --git a/init_download_db.sh b/init_download_db.sh index 5c5278a..1fde26b 100755 --- a/init_download_db.sh +++ b/init_download_db.sh @@ -14,7 +14,9 @@ conda activate spring-2026-pyt # --processed-file points at the production processed_ids.txt, which lives in # ljhao's working directory, not this repo. -python init_download_db.py \ +# --legacy-only means only process the legacy images without creating the database. +# Run this on subsequent times after db was created. +python init_download_db.py --legacy-only \ --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt ### The command below is used to submit the job to the cluster From 5abbaea9a064f56d818c55be5342777bd9a9f725 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 16:11:34 -0400 Subject: [PATCH 06/19] doc on how to download GBIF metadata --- README.md | 2 +- docs/gbif-metadata-download.md | 71 ++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 docs/gbif-metadata-download.md diff --git a/README.md b/README.md index 930339a..d01c343 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Utils Directory +# GBIF Image Data Download This directory contains utility scripts for managing herbarium specimen images, including downloading, processing, organizing, and labeling datasets. diff --git a/docs/gbif-metadata-download.md b/docs/gbif-metadata-download.md new file mode 100644 index 0000000..003f695 --- /dev/null +++ b/docs/gbif-metadata-download.md @@ -0,0 +1,71 @@ +# GBIF Metadata Download + +To get a complete list of all entries for vascular plants that include herbaria image assets on [GBIF](https://www.gbif.org/), you need to filter by a specific taxonomic phylum, restrict the results to preserved specimens (herbaria), and ensure they contain image media. 
+ +Because this query will yield tens of millions of records, navigating them via the web interface is inefficient. The standard way to get this list is by initiating a filtered asynchronous download. + +Here is how you can do it using both the website interface and the GBIF API: + +### Method 1: Using the GBIF Web Interface + +1. Go to the [GBIF Occurrence Search Page](https://www.gbif.org/occurrence/search). +2. Apply the following filters in the left-hand panel: +* **Scientific Name / Taxon**: Search for and select **`Tracheophyta`** (this is the phylum name for all vascular plants). +* **Basis of Record**: Select **`Preserved specimen`** (this restricts your search to herbarium sheets and physical collections rather than citizen science observations). +* **Media Type**: Select **`Image`** (this ensures every record has an attached digital photo asset). + + +3. Once the filters are applied, click the **Download** button at the top right of the search panel. +4. Choose the **Darwin Core Archive (DwC-A)** format. This format is ideal because it generates a `.zip` package containing: +* `occurrence.txt`: The main list of data entries. +* `multimedia.txt`: A ledger mapping the occurrences directly to their herbarium image URLs. + + + +--- + +### Method 2: Programmatically via the GBIF API + +If you want to automate the request or incorporate it into a script (using Python, R, or `curl`), you can send a `POST` request to the GBIF download API using the exact keys for your filters. 
+ +**Taxon Key for Tracheophyta:** `7707728` + +#### Example API Request Payload + +You can send a JSON object to `https://api.gbif.org/v1/occurrence/download/request` (requires your GBIF account credentials): + +```json +{ + "creator": "your_gbif_username", + "notificationAddresses": [ + "your_email@example.com" + ], + "sendNotification": true, + "format": "DWCA", + "predicate": { + "type": "and", + "predicates": [ + { + "type": "equals", + "key": "TAXON_KEY", + "value": "7707728" + }, + { + "type": "equals", + "key": "BASIS_OF_RECORD", + "value": "PRESERVED_SPECIMEN" + }, + { + "type": "equals", + "key": "MEDIA_TYPE", + "value": "StillImage" + } + ] + } +} + +``` + +### Pro-Tip for Data Handling + +Once your download is processed and unzipped, the `multimedia.txt` file will serve as your master list for image links, which you can link back to the metadata in `occurrence.txt` using the shared `gbifID` column. If you are using Python, you can also use the [`plantnet/gbif-dl`](https://github.com/plantnet/gbif-dl) library specifically designed to parse these queries and download the image assets efficiently. 
\ No newline at end of file From ed94e6007ff0cd62ca5fdd33143240f4ead3d2b2 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 16:11:53 -0400 Subject: [PATCH 07/19] flush printing so it shows up in logs right away --- image_install_db.py | 22 ++++++++++++++++++++-- init_download_db.py | 22 +++++++++++++++++++--- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/image_install_db.py b/image_install_db.py index 503a0a5..a2fe443 100644 --- a/image_install_db.py +++ b/image_install_db.py @@ -23,6 +23,7 @@ """ import os +import sys import time import random import logging @@ -92,6 +93,18 @@ logger = logging.getLogger(__name__) +def progress(msg): + """ + Print a progress line, flushed immediately so a batch job's .o log updates + live instead of only at exit. Overwrites in place on an interactive + terminal; writes one line per update when redirected to a log file. + """ + if sys.stdout.isatty(): + print(f"\r{msg}", end="", flush=True) + else: + print(msg, flush=True) + + # ---- paths ------------------------------------------------------------------- def get_hierarchical_path(base_dir, gbif_id, suffix, ext=".jpg"): @@ -462,6 +475,10 @@ def process_id(db, gbif_id, total_to_install): # ---- main -------------------------------------------------------------------- def main(): + # Line-buffer stdout so progress appears in a batch job's .o log live, + # not only when the job finishes. + sys.stdout.reconfigure(line_buffering=True) + parser = ArgumentParser(description=__doc__) parser.add_argument("-c", "--country", dest="country", help="(Unsupported) country filter -- multimedia.txt " @@ -522,8 +539,9 @@ def main(): # Persist circuit-breaker state periodically so a killed job # (e.g. qsub h_rt limit) does not lose it. 
db.save_host_state(host_error_counts, host_block_until) - print(f" processed {min(start + WORK_CHUNK, total_to_install)}" - f"/{total_to_install} gbifIDs", end="\r") + progress(f" processed " + f"{min(start + WORK_CHUNK, total_to_install)}" + f"/{total_to_install} gbifIDs") except KeyboardInterrupt: logger.warning("Interrupted by user; saving state and exiting.") finally: diff --git a/init_download_db.py b/init_download_db.py index ca03bf9..dac6214 100644 --- a/init_download_db.py +++ b/init_download_db.py @@ -56,6 +56,18 @@ LEGACY_BATCH = 50_000 +def progress(msg): + """ + Print a progress line, flushed immediately so a batch job's .o log updates + live instead of only at exit. Overwrites in place on an interactive + terminal; writes one line per update when redirected to a log file. + """ + if sys.stdout.isatty(): + print(f"\r{msg}", end="", flush=True) + else: + print(msg, flush=True) + + def hierarchical_path(base_dir, gbif_id, suffix=""): """Mirror image_install_db.get_hierarchical_path (without makedirs).""" stem = str(gbif_id) @@ -125,7 +137,7 @@ def ingest_multimedia(conn, multimedia_path): ) conn.commit() inserted += len(rows) - print(f" {inserted:,}/{len(df):,} image rows", end="\r") + progress(f" {inserted:,}/{len(df):,} image rows") print(f" {inserted:,} image rows inserted ") print(" Inserting gbifID rows ...") @@ -208,8 +220,8 @@ def flush(batch): if len(updates) >= LEGACY_BATCH: flush(updates) updates = [] - print(f" renamed={renamed:,} relabeled={relabeled:,} " - f"missing={missing:,}", end="\r") + progress(f" renamed={renamed:,} relabeled={relabeled:,} " + f"missing={missing:,}") flush(updates) print(f" renamed={renamed:,} already-suffixed={relabeled:,} " f"file-missing={missing:,}") @@ -270,6 +282,10 @@ def connect(db_path, bulk_load): def main(): + # Line-buffer stdout so progress appears in a batch job's .o log live, + # not only when the job finishes. 
+ sys.stdout.reconfigure(line_buffering=True) + args = parse_args() start = time.time() From 26c416955ecc5fd24e9e15af493c86bd0f0eb80e Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 21 May 2026 16:50:32 -0400 Subject: [PATCH 08/19] add schema description for occurence.txt and multimedia.txt files --- docs/gbif-occurence-multimedia-txt-schema.md | 69 ++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 docs/gbif-occurence-multimedia-txt-schema.md diff --git a/docs/gbif-occurence-multimedia-txt-schema.md b/docs/gbif-occurence-multimedia-txt-schema.md new file mode 100644 index 0000000..5132e3f --- /dev/null +++ b/docs/gbif-occurence-multimedia-txt-schema.md @@ -0,0 +1,69 @@ +In a Darwin Core Archive (DwC-A) downloaded from GBIF, the relationship between `occurrence.txt` and `multimedia.txt` follows a **star schema** layout. + +`occurrence.txt` acts as the **Core file** (the center of the star), while `multimedia.txt` acts as an **Extension file** linked back to the core. + +--- + +### 1. The Linkage: How They Connect + +The two files are mapped together using a single relational key: **`gbifID`**. + +* **`occurrence.txt`**: Each row represents a unique biodiversity record and has a unique `gbifID`. +* **`multimedia.txt`**: May contain zero, one, or multiple rows for a single `gbifID` (since one physical herbarium sheet might have multiple photos taken of it, or close-ups of its label). + +--- + +### 2. Schema for `occurrence.txt` (The Core Metadata) + +This file contains the biological, geographical, and administrative data for the specimen. While it can feature over 200 columns of standardized Darwin Core terms, the most vital fields for a vascular plant herbaria project include: + +| Column Name | Description | Example Data | +| --- | --- | --- | +| **`gbifID`** | The unique numerical primary key assigned by GBIF. | `402391023` | +| **`basisOfRecord`** | The physical nature of the record. 
For herbaria, this is always filtered to this value. | `PRESERVED_SPECIMEN` | +| **`scientificName`** | The full, three-part or two-part taxon name with authorship. | *Quercus alba L.* | +| **`taxonKey`** / **`speciesKey`** | Unique backbone taxonomic ID numbers used to group species regardless of spelling variations. | `2878688` | +| **`institutionCode`** / **`collectionCode`** | Identifiers for the home museum or herbarium hosting the physical asset. | `NY` (New York Botanical Garden) | +| **`catalogNumber`** | The barcode or physical filing number stamped on the sheet. | `NY00123456` | +| **`recordedBy`** | The name of the original collector who found the plant. | `Asa Gray` | +| **`eventDate`** | The ISO 8601 date the plant was harvested from the wild. | `1874-06-15` | +| **`decimalLatitude`** / **`decimalLongitude`** | GPS/Coordinate mapping of where the specimen originally grew. | `42.3601`, `-71.0589` | + +--- + +### 3. Schema for `multimedia.txt` (The Asset Ledger) + +This file is much narrower and strictly handles the digital representations of the specimen. It breaks down into media-specific fields: + +| Column Name | Description | Example Data | +| --- | --- | --- | +| **`gbifID`** | The foreign key pointing straight back to `occurrence.txt`. | `402391023` | +| **`type`** | The type of media asset. For photos, this standard term is used. | `StillImage` | +| **`format`** | The MIME type indicating the file extension pattern. | `image/jpeg` or `image/tiff` | +| **`identifier`** | **The actual URL** where the high-resolution image asset is publicly hosted by the museum. | `https://sweetgum.nybg.org/images/v2/highres...jpg` | +| **`references`** | A web URL directing to the museum’s interactive webpage for that specimen. | `https://word.nybg.org/detail.php?irn=4920` | +| **`license`** | The text declaration or Creative Commons status of the photograph. 
| `CC BY 4.0` or `CC0` | +| **`creator`** / **`rightsHolder`** | The photographer or the legal institution holding the copyright to the image. | `The New York Botanical Garden` | + +--- + +### Practical Data Layout Example + +If a single white oak specimen (`gbifID: 101`) has a photo of the full sheet and a secondary close-up macro photo of its acorns, your files will structurally parse out like this: + +**`occurrence.txt`** + +```text +gbifID scientificName basisOfRecord institutionCode +101 Quercus alba PRESERVED_SPECIMEN NY + +``` + +**`multimedia.txt`** + +```text +gbifID type format identifier +101 StillImage image/jpeg https://museum.org/specimen101_full.jpg +101 StillImage image/jpeg https://museum.org/specimen101_acorn_zoom.jpg + +``` From 0197d99369d4fc3d2f9253c1a48f12f2472ab2a6 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Fri, 22 May 2026 12:17:21 -0400 Subject: [PATCH 09/19] document the location of the other processed_ids.txt file --- init_download_db.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/init_download_db.sh b/init_download_db.sh index 1fde26b..f42c1bb 100755 --- a/init_download_db.sh +++ b/init_download_db.sh @@ -19,5 +19,8 @@ conda activate spring-2026-pyt python init_download_db.py --legacy-only \ --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt +# The other big initial run is tracked in +# /projectnb/herbdl/workspaces/tsehou26/herbarium_project/utils/processed_ids.txt and .../failed_ids.txt + ### The command below is used to submit the job to the cluster ### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh From 2032781b49b0dca204cd4494d700a8d8097b098f Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Fri, 22 May 2026 12:35:26 -0400 Subject: [PATCH 10/19] dedupe images, capture non-image responses, keep raw DNG files Rework the SQLite download 
tracking to count and download one file per distinct image instead of one per multimedia.txt row. - download_db: images table is now one row per distinct image (image_no, image_key, packed candidate urls). canonical_image_key() collapses a IIIF manifest and the resolution variants of one specimen photo to a single key. - init_download_db: ingest groups multimedia rows into distinct images, ordering candidate URLs highest-resolution-first. - image_install_db: downloads one file per distinct image; detects HTML/text responses (Content-Type header + body sniff) and captures the page text into error_detail for follow-up; keeps undecodable camera-raw DNG as -NN.dng flagged raw_unprocessed instead of discarding it. - status_report: distinct-image counts, non-image-responses-by-host section, raw_unprocessed count. - init_download_db.sh: take the build mode (--reset / --legacy-only) from the qsub command line. The images table schema changed, so this requires a --reset rebuild. Co-Authored-By: Claude Opus 4.7 --- DEPLOYMENT.md | 34 +++- README.md | 20 ++- download_db.py | 101 ++++++++---- image_install_db.py | 377 ++++++++++++++++++++++++++++++-------------- init_download_db.py | 88 ++++++++--- init_download_db.sh | 12 +- status_report.py | 49 +++++- 7 files changed, 482 insertions(+), 199 deletions(-) diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index d7ad4c8..f9fce1c 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -14,9 +14,12 @@ environment. 
| Before | After | |---|---| -| Progress in `processed_ids.txt` / `failed_ids.txt` (ID only, no reason) | Progress in `download_status.db` — every URL's outcome and *why* it failed | +| Progress in `processed_ids.txt` / `failed_ids.txt` (ID only, no reason) | Progress in `download_status.db` — every image's outcome and *why* it failed | | `multimedia.txt` re-read and re-grouped with pandas every run | Ingested into the DB once; later runs query the work queue | +| One file per multimedia row (a manifest + its resolution variants → 3 files) | One file per **distinct image** — IIIF manifest/resolution variants deduplicated | | Failed IDs all retried blindly (or skipped) | Only transient failures retried (timeout/rate-limit/5xx/dropped connection), capped at 4 attempts | +| Camera-raw DNG silently discarded | DNG kept as `-NN.dng`, flagged `raw_unprocessed` for a later conversion pass | +| HTML "download not supported" pages saved as junk `.jpg` | Detected; the page text is captured in `error_detail` for follow-up | | `analyze_image_progress.py` (slow, loads ~180 MB of text) | `status_report.py` (instant SQL queries) | | ~1.4 GB run logs, ~134 MB warning spam | `WARNING`-level log only; warning spam suppressed | @@ -49,6 +52,11 @@ never committed: This step ingests `multimedia.txt`, imports already-completed downloads from `processed_ids.txt`, and renames legacy `.jpg` files to `-00.jpg`. +> **If a `download_status.db` already exists from before the distinct-image +> change**, it has the old schema and must be rebuilt — run the builder with +> `--reset`. Files already on disk are detected and re-used, so this re-discovers +> existing progress; it does not re-download anything. + It is heavy — it reads the ~59M-row `multimedia.txt` with pandas and renames up to ~13.5M files. 
**Run it as a batch job, not on a login node.** @@ -146,8 +154,17 @@ sqlite3 /projectnb/herbdl/data/GBIF-F25h/download_status.db SELECT error_type, COUNT(*) FROM images WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; --- URLs still worth retrying -SELECT gbif_id, url FROM images WHERE status='failed_transient' LIMIT 50; +-- images still worth retrying +SELECT gbif_id, image_no, urls FROM images +WHERE status='failed_transient' LIMIT 50; + +-- URLs that returned an HTML/text page, with the captured message +SELECT host, error_detail FROM images +WHERE error_type='invalid_content_type' GROUP BY host; + +-- raw files (DNG etc.) kept for a later conversion pass +SELECT gbif_id, image_no, file_path FROM images +WHERE error_type='raw_unprocessed'; -- hosts currently in cooldown SELECT host, datetime(blocked_until,'unixepoch') FROM hosts @@ -187,6 +204,17 @@ UPDATE gbif_ids SET status='partial' WHERE status='failed'; The old downloader shuffled URLs, so the exact source URL is unknown. This is exact for the ~87% of gbifIDs that have only one image; for the rest it affects only metadata, not the image files. +- **Non-IIIF duplicates are not deduplicated.** Two distinct non-IIIF URLs on + one gbifID stay separate images — metadata cannot tell whether they are the + same photo at different sizes. Only content hashing could, and that is not + done here. +- **Raw DNG files are kept, not converted.** A camera-raw DNG is saved as + `-NN.dng` with `error_type='raw_unprocessed'` (the row still counts as + `success`). Converting them to JPEG-1024 is a later pass and needs a raw + decoder (`rawpy`) added to the environment. Query them with + `WHERE error_type='raw_unprocessed'`. +- **HTML/non-image responses** are recorded `invalid_content_type` with the page + text captured in `error_detail`; `status_report.py` lists them by host. - **Database size.** Expect ~10–15 GB. It sits in the data directory, not the repo. Ensure the `herbdl` project has the space. 
- **Single job at a time.** SQLite (WAL mode) is fine for one job with 5 worker diff --git a/README.md b/README.md index d01c343..fbee679 100644 --- a/README.md +++ b/README.md @@ -11,13 +11,15 @@ This directory contains utility scripts for managing herbarium specimen images, ### Image Download & Installation #### `image_install_db.py` -**Purpose**: Current script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. Downloads **all** images per gbifID, each saved as `-NN.jpg`, with status tracked in a SQLite database. See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure. +**Purpose**: Current script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. Downloads one file per **distinct image** of each gbifID, saved as `-NN.jpg`, with status tracked in a SQLite database. See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure. **Key Features**: - Parallel downloading with ThreadPoolExecutor (5 workers) - Host-based rate limiting and circuit breaker pattern -- IIIF (International Image Interoperability Framework) manifest support — one file saved per source URL, highest resolution first -- Automatic image resizing to 1024px max dimension +- IIIF manifest support, with deduplication — a manifest plus the resolution variants of one specimen photo count as **one** image, so one file is saved, not three +- Automatic image resizing to 1024px max-dimension JPEG +- TIFF/PNG decoded and saved as JPEG; camera-raw **DNG kept as-is** (`-NN.dng`, flagged `raw_unprocessed`) rather than discarded +- HTML/text responses (e.g. 
"direct download no longer supported") detected and the page text captured for follow-up - Atomic downloads (stream to `.tmp`, length-check, then rename) so a dropped connection never leaves a corrupt file - SQLite status database for resumable downloads and queryable, classified error tracking — see [`download_db.py`](download_db.py) - Hierarchical directory organization (3-digit prefix structure) @@ -85,20 +87,22 @@ qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_emai #### `download_db.py` **Purpose**: SQLite-backed download-status tracking. Imported by the other download scripts — not run directly. -**Why it exists**: replaces the flat `processed_ids.txt` / `failed_ids.txt` files, which recorded only an ID with no reason for failure. The database records, per image URL, whether it succeeded or failed and *why*, so failures are queryable and only transient ones get retried. +**Why it exists**: replaces the flat `processed_ids.txt` / `failed_ids.txt` files, which recorded only an ID with no reason for failure. The database records, per distinct image, whether it succeeded or failed and *why*, so failures are queryable and only transient ones get retried. 
**Tables**: -- `images` — one row per source image URL: `status`, `http_status`, `error_type`, `error_detail`, `file_path`, `file_size`, `attempts` -- `gbif_ids` — one row per gbifID; the resumable work queue (`pending` / `partial` / `done` / `failed`) +- `images` — one row per **distinct image** (the download unit): `image_no`, `image_key` (canonical identity), `urls` (candidate URLs), `status`, `http_status`, `error_type`, `error_detail`, `file_path`, `file_size`, `attempts` +- `gbif_ids` — one row per gbifID; the resumable work queue (`pending` / `partial` / `done` / `failed`); `n_images` is the distinct-image count - `hosts` — per-host error tally and cooldown, so circuit-breaker state survives a restart +`canonical_image_key()` collapses a IIIF manifest and the resolution variants of one specimen photo to a single key, so multiple `multimedia.txt` rows become one `images` row. + #### `init_download_db.py` **Purpose**: One-time builder for the status database. **What it does**: 1. Creates the schema -2. Ingests `multimedia.txt` into `images` + `gbif_ids` (so later runs never re-read the 59M-row file) -3. Imports `processed_ids.txt`: renames legacy `.jpg` files to `-00.jpg` for a consistent naming scheme and marks them done. Multi-image gbifIDs are left `partial` so the downloader fetches their remaining images. (`failed_ids.txt` is **not** imported — those IDs get a fresh, tracked retry.) +2. Reads `multimedia.txt` once, groups its rows into **distinct images** (a manifest + resolution variants of one photo collapse to one), and loads `images` + `gbif_ids` — so later runs never re-read the 59M-row file +3. Imports `processed_ids.txt`: renames legacy `.jpg` files to `-00.jpg` for a consistent naming scheme and marks image 0 done. Multi-image gbifIDs are left `partial` so the downloader fetches their remaining images. (`failed_ids.txt` is **not** imported — those IDs get a fresh, tracked retry.) 
**Usage**: ```bash diff --git a/download_db.py b/download_db.py index 3540a93..c0358f3 100644 --- a/download_db.py +++ b/download_db.py @@ -2,25 +2,29 @@ SQLite-backed download-status tracking for image_install_db.py. Replaces the flat processed_ids.txt / failed_ids.txt checkpoint files with a -queryable database that records, for every image URL, whether it succeeded or -failed and *why*. That makes it possible to: - * resume a run without re-reading and re-grouping the 59M-row multimedia.txt, - * retry only transient failures (timeouts, rate limits, 5xx, dropped - connections) while leaving permanent ones (404/410/etc.) alone, - * answer questions like "how many 404s?" or "which hosts fail most?" with a - single SQL query (see status_report.py). +queryable database that records, for every *distinct image*, whether it was +downloaded and -- when it failed -- why. + +Distinct images vs. multimedia rows +----------------------------------- +A gbifID often has several rows in GBIF's multimedia.txt that all point at the +SAME photo: a IIIF manifest plus the 300px / 1600px renderings of one specimen. +canonical_image_key() collapses those to a single key, so one row in the +`images` table = one distinct image = one downloaded file. Non-IIIF URLs key to +themselves (metadata cannot tell whether two opaque URLs are the same photo -- +that needs content hashing, which this layer does not do). Tables ------ -images one row per source image URL (a GBIF "identifier"). -gbif_ids one row per gbifID; doubles as the resumable work queue. -hosts per-host error tally + cooldown timestamp, so circuit-breaker and - rate-limit state survive a job restart. +images one row per distinct image; the download/work unit. +gbif_ids one row per gbifID; the resumable work queue. +hosts per-host error tally + cooldown, surviving a restart. -A gbifID is 'done' only when every one of its images has status 'success'. +A gbifID is 'done' only when every one of its distinct images has succeeded. 
""" import os +import re import time import sqlite3 import threading @@ -32,28 +36,32 @@ # ---- images.status ----------------------------------------------------------- ST_PENDING = "pending" # never attempted -ST_SUCCESS = "success" # downloaded (and resized) OK +ST_SUCCESS = "success" # image obtained (resized JPEG, or kept raw) ST_FAILED_PERMANENT = "failed_permanent" # retrying will not help ST_FAILED_TRANSIENT = "failed_transient" # may succeed on a later run # ---- gbif_ids.status --------------------------------------------------------- G_PENDING = "pending" # no image attempted yet G_PARTIAL = "partial" # some work still possible (in the work queue) -G_DONE = "done" # every image succeeded +G_DONE = "done" # every distinct image succeeded G_FAILED = "failed" # all images terminal, not all succeeded -# ---- error_type values ------------------------------------------------------- +# ---- error_type values (on failure rows) ------------------------------------ ERR_RATE_LIMITED = "rate_limited" # HTTP 429 ERR_TIMEOUT = "timeout" # connect/read timeout, HTTP 408 ERR_SERVER = "server_error" # HTTP 5xx ERR_CONNECTION = "connection_broken" # dropped connection / IncompleteRead ERR_TRUNCATED = "truncated" # download shorter than Content-Length ERR_MANIFEST = "manifest_error" # IIIF manifest could not be parsed -ERR_INVALID_CONTENT = "invalid_content_type" # server returned HTML/XML/text -ERR_NOT_IMAGE = "not_an_image" # bytes downloaded but not decodable -ERR_NO_URL = "no_url" # no usable URL for this identifier +ERR_INVALID_CONTENT = "invalid_content_type" # URL returned HTML/text, not an image +ERR_NOT_IMAGE = "not_an_image" # bytes downloaded but undecodable junk +ERR_NO_URL = "no_url" # no usable URL for this image ERR_OTHER = "other" # anything uncategorised -ERR_LEGACY = "legacy_unverified_index" # marker on imported processed_ids.txt + +# ---- flags carried on status='success' rows (not failures) ------------------ +ERR_LEGACY = "legacy_unverified_index" # 
imported from processed_ids.txt +ERR_RAW_UNPROCESSED = "raw_unprocessed" # kept as a raw file (e.g. DNG); needs + # a later conversion pass to JPEG # Everything not in this set is treated as permanent (e.g. any "http_4xx"). TRANSIENT_ERRORS = { @@ -83,27 +91,56 @@ def status_for_error(error_type): return ST_FAILED_PERMANENT if is_permanent(error_type) else ST_FAILED_TRANSIENT +# ---- canonical image identity ----------------------------------------------- + +_IIIF_MANIFEST = re.compile(r"/manifest(?:\.json)?$", re.IGNORECASE) +_IIIF_IMAGE_TAIL = re.compile( + r"/[^/]+/[^/]+/[-+0-9.!]+/(?:default|color|gray|bitonal)\.[A-Za-z0-9]+$", + re.IGNORECASE, +) + + +def canonical_image_key(url): + """ + Return a canonical identity for the image a URL points at. + + A IIIF Presentation manifest (".../E00699064/manifest") and every IIIF + Image-API rendering (".../E00699064/full/1600,/0/default.jpg") of one + specimen collapse to the same key -- the IIIF identifier ".../E00699064". + Non-IIIF URLs key to themselves. 
+ """ + u = (url or "").strip() + stripped = _IIIF_MANIFEST.sub("", u) + if stripped != u: + return stripped + stripped = _IIIF_IMAGE_TAIL.sub("", u) + if stripped != u: + return stripped + return u + + # ---- schema ------------------------------------------------------------------ _TABLES = [ """CREATE TABLE IF NOT EXISTS images ( gbif_id INTEGER NOT NULL, - img_index INTEGER NOT NULL, -- position in this ID's URL list - url TEXT NOT NULL, + image_no INTEGER NOT NULL, -- distinct-image ordinal in the gbifID + image_key TEXT NOT NULL, -- canonical identity (debug/transparency) + urls TEXT NOT NULL, -- newline-joined candidate URLs, best first host TEXT, status TEXT NOT NULL DEFAULT 'pending', http_status INTEGER, error_type TEXT, - error_detail TEXT, -- truncated message, for debugging + error_detail TEXT, -- truncated message / captured page text file_path TEXT, - file_size INTEGER, -- bytes on disk after resize + file_size INTEGER, -- bytes on disk attempts INTEGER NOT NULL DEFAULT 0, last_attempt_at TEXT, - PRIMARY KEY (gbif_id, img_index) + PRIMARY KEY (gbif_id, image_no) )""", """CREATE TABLE IF NOT EXISTS gbif_ids ( gbif_id INTEGER PRIMARY KEY, - n_images INTEGER NOT NULL DEFAULT 0, + n_images INTEGER NOT NULL DEFAULT 0, -- distinct images n_success INTEGER NOT NULL DEFAULT 0, status TEXT NOT NULL DEFAULT 'pending', completed_at TEXT @@ -181,23 +218,23 @@ def get_work_gbif_ids(self): return [row[0] for row in cur.fetchall()] def get_images_for(self, gbif_id): - """Return (img_index, url, host, status, attempts) rows for one gbifID.""" + """Return (image_no, image_key, urls, host, status, attempts) per image.""" with self.lock: cur = self.conn.execute( - "SELECT img_index, url, host, status, attempts " - "FROM images WHERE gbif_id=? ORDER BY img_index", + "SELECT image_no, image_key, urls, host, status, attempts " + "FROM images WHERE gbif_id=? 
ORDER BY image_no", (gbif_id,), ) return cur.fetchall() # -- recording results ----------------------------------------------------- - def record_image_result(self, gbif_id, img_index, status, *, host=None, + def record_image_result(self, gbif_id, image_no, status, *, host=None, http_status=None, error_type=None, error_detail=None, file_path=None, file_size=None, increment_attempts=True): """Write the outcome of one image attempt into the images table.""" - detail = (error_detail or "")[:500] or None + detail = (error_detail or "")[:2000] or None delta = 1 if increment_attempts else 0 with self.lock: self.conn.execute( @@ -205,9 +242,9 @@ def record_image_result(self, gbif_id, img_index, status, *, host=None, " status=?, host=COALESCE(?, host), http_status=?, " " error_type=?, error_detail=?, file_path=?, file_size=?, " " attempts=attempts+?, last_attempt_at=datetime('now') " - "WHERE gbif_id=? AND img_index=?", + "WHERE gbif_id=? AND image_no=?", (status, host, http_status, error_type, detail, file_path, - file_size, delta, gbif_id, img_index), + file_size, delta, gbif_id, image_no), ) self.conn.commit() diff --git a/image_install_db.py b/image_install_db.py index a2fe443..0603b72 100644 --- a/image_install_db.py +++ b/image_install_db.py @@ -2,27 +2,38 @@ Image install script: download herbarium specimen images from a GBIF multimedia.txt file. -Downloads ALL images for each gbifID. Each source URL (a GBIF "identifier") is -saved as one file with an index suffix: -00.jpg, -01.jpg, ... -A gbifID is marked 'done' only once every one of its images has succeeded. +Downloads one file per *distinct image* of each gbifID. A IIIF manifest and the +resolution variants of one specimen photo are treated as a single image (see +download_db.canonical_image_key) -- so a record listed in multimedia.txt as +"manifest + 300px + 1600px" produces ONE file, not three. Files are named +-00.jpg, -01.jpg, ... A gbifID is 'done' only once every one of +its distinct images has succeeded. 
Status tracking --------------- Per-image and per-gbifID status lives in a SQLite database (download_status.db, -see download_db.py) instead of the old processed_ids.txt / failed_ids.txt flat -files. Build the database once with init_download_db.py before the first run. +see download_db.py). Build it once with init_download_db.py before the first run. The database lets the script: * resume without re-reading the 59M-row multimedia.txt every run, * retry only transient failures (timeout / rate-limit / 5xx / dropped connection), capped at MAX_ATTEMPTS, and never re-hammer permanent 404s, - * record *why* each download failed so failures are queryable afterwards - (see status_report.py). + * record *why* each download failed so failures are queryable afterwards. + +Non-JPEG handling +----------------- +TIFF/PNG/etc. are decoded by Pillow and saved as resized JPEG like everything +else. A file Pillow cannot decode but that is a real image format (camera-raw +DNG) is kept as-is (-NN.dng) and flagged 'raw_unprocessed' for a later +conversion pass -- it is not discarded. A URL that returns an HTML/text page +(e.g. "direct download no longer supported") is recorded as +'invalid_content_type' with the page text captured for follow-up. Accurate as of May 2026. """ import os +import re import sys import time import random @@ -46,8 +57,7 @@ from download_db import DownloadDB # verify=False is needed because many herbarium hosts have broken TLS certs. -# Suppress the resulting per-request warning so it does not flood the .e log -# (it previously produced ~134 MB of InsecureRequestWarning spam per run). +# Suppress the resulting per-request warning so it does not flood the .e log. 
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # ---- configuration ----------------------------------------------------------- @@ -63,6 +73,10 @@ HOST_COOLDOWN_TIMEOUT = 60 * 60 HOST_ERROR_THRESHOLD = 500 # circuit breaker: skip a host after this many errors +# Extensions under which an undecodable-but-real image is kept for later. +RAW_EXTS = (".dng", ".nef", ".cr2", ".cr3", ".arw", ".raf", ".orf", ".rw2", + ".tif", ".raw") + # ---- in-memory host circuit-breaker state (seeded from / saved to the DB) ---- host_block_until = {} @@ -107,18 +121,23 @@ def progress(msg): # ---- paths ------------------------------------------------------------------- -def get_hierarchical_path(base_dir, gbif_id, suffix, ext=".jpg"): +def image_path(gbif_id, image_no, ext): """ - Build a hierarchical storage path to avoid millions of files in one dir. - suffix: image index suffix, e.g. '-00', '-01'. - Example: gbifID=1057161997, suffix='-00' -> /105/716/1057161997-00.jpg + Storage path for one image: ///-NN (no mkdir). + p1 = first 3 digits of the gbifID, p2 = digits 4-6. """ stem = str(gbif_id) prefix1 = stem[:3] if len(stem) >= 3 else stem prefix2 = stem[3:6] if len(stem) >= 6 else "000" - dest_dir = os.path.join(base_dir, prefix1, prefix2) - os.makedirs(dest_dir, exist_ok=True) - return os.path.join(dest_dir, f"{stem}{suffix}{ext}") + return os.path.join(INSTALL_PATH, prefix1, prefix2, + f"{stem}-{image_no:02d}{ext}") + + +def _rm(path): + try: + os.remove(path) + except OSError: + pass # ---- host circuit breaker / cooldown ----------------------------------------- @@ -181,13 +200,17 @@ def block_host(url, retry_after=None, timeout_issue=False): # ---- IIIF manifests ---------------------------------------------------------- +def is_manifest_url(url): + low = url.lower() + return "/manifest" in low or low.endswith(".json") + + def extract_image_from_iiif_manifest(manifest_url, gbif_id): """ Fetch a IIIF manifest and return (image_urls, error_type). 
image_urls is an ordered list of direct image URLs (highest resolution - first). On failure image_urls is empty and error_type explains why, so the - caller can decide whether the manifest is worth retrying. + first). On failure image_urls is empty and error_type explains why. """ try: response = session.get( @@ -233,30 +256,95 @@ def extract_image_from_iiif_manifest(manifest_url, gbif_id): return [], ddb.ERR_MANIFEST -# ---- downloading ------------------------------------------------------------- +# ---- non-image response detection ------------------------------------------- -def _rm(path): +_NON_IMAGE_CTYPES = ("text/html", "text/plain", "text/xml", + "application/xml", "application/json", "ld+json") + + +def _is_non_image_ctype(ctype): + return bool(ctype) and any(t in ctype for t in _NON_IMAGE_CTYPES) + + +def _looks_like_text(data): + """Heuristic: do the first bytes look like an HTML / XML / JSON document?""" + if not data: + return False + head = data.lstrip()[:64].lower() + return head.startswith((b"= limit: + break + return raw[:limit] + + +def _join_bounded(stream, limit=16384): + """Drain at most `limit` bytes from an iter_content generator.""" + raw = b"" + for chunk in stream: + raw += chunk + if len(raw) >= limit: + break + return raw[:limit] + + +def _html_to_text(raw): + """Strip an HTML/text body down to readable text for capture in the DB.""" try: - os.remove(path) - except OSError: - pass + text = raw.decode("utf-8", errors="replace") + except Exception: + text = str(raw) + text = re.sub(r"(?is)<(script|style)[^>]*>.*?", " ", text) + text = re.sub(r"(?s)<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() + return text[:1800] -def download_one_url(gbif_id, image_url, local_path): +def raw_keep_extension(content_type, url): """ - Download a single URL to local_path, atomically. + Extension under which to keep an image Pillow could not decode, or None to + discard it. 
Camera-raw DNG is the main case -- kept for a later conversion + pass rather than lost. + """ + ct = (content_type or "").lower() + low = (url or "").lower().split("?")[0] + if "dng" in ct or low.endswith(".dng"): + return ".dng" + for ext in (".nef", ".cr2", ".cr3", ".arw", ".raf", ".orf", ".rw2"): + if low.endswith(ext): + return ext + if "tiff" in ct or "tif" in ct or low.endswith((".tif", ".tiff")): + return ".tif" + if ct.startswith("image/"): + return ".raw" # an image/* type Pillow cannot read -- keep it anyway + return None + - Bytes are streamed to a .tmp file, length-checked against Content-Length, - then renamed into place -- so a dropped connection never leaves a corrupt - file behind. Returns a result dict with keys: ok, size, http_status, - error_type, error_detail, host. +# ---- downloading ------------------------------------------------------------- + +def download_one_url(gbif_id, image_url, tmp_path): + """ + Download one URL to tmp_path. Returns a dict: + success -> {ok: True, host, http_status, content_type, size} + failure -> {ok: False, host, http_status, content_type, size, + error_type, error_detail} + + Detects HTML/text responses -- including ones disguised with an image + Content-Type -- and captures the page text into error_detail. 
""" host = _host_from_url(image_url) - tmp_path = local_path + ".tmp" - def fail(error_type, detail, http_status=None): - return {"ok": False, "size": None, "http_status": http_status, - "error_type": error_type, "error_detail": detail, "host": host} + def fail(error_type, detail, http_status=None, content_type=None): + return {"ok": False, "host": host, "http_status": http_status, + "content_type": content_type, "size": None, + "error_type": error_type, "error_detail": detail} try: time.sleep(random.uniform(0.2, 0.8)) @@ -283,15 +371,36 @@ def fail(error_type, detail, http_status=None): return fail(ddb.http_error_type(status), f"HTTP {status}", status) ctype = (resp.headers.get("Content-Type") or "").lower() - if ctype and any(bad in ctype for bad in - ("text/html", "text/plain", "application/xml")): + + # Content-Type clearly says this is not an image -> capture the text. + if _is_non_image_ctype(ctype): increment_host_errors(image_url) - return fail(ddb.ERR_INVALID_CONTENT, f"Content-Type: {ctype}", status) + snippet = _html_to_text(_read_bounded(resp)) + return fail(ddb.ERR_INVALID_CONTENT, + f"[{ctype}] {snippet}", status, ctype) + + # Sniff the first chunk: some hosts serve an HTML notice ("direct + # download no longer supported ...") with an image/* Content-Type. 
+ stream = resp.iter_content(chunk_size=65536) + first = b"" + for chunk in stream: + if chunk: + first = chunk + break + if _looks_like_text(first): + increment_host_errors(image_url) + snippet = _html_to_text(first + _join_bounded(stream)) + return fail(ddb.ERR_INVALID_CONTENT, + f"[non-image body, {ctype or 'no Content-Type'}] " + f"{snippet}", status, ctype) expected = resp.headers.get("Content-Length") written = 0 with open(tmp_path, "wb") as out: - for chunk in resp.iter_content(chunk_size=65536): + if first: + out.write(first) + written += len(first) + for chunk in stream: if chunk: out.write(chunk) written += len(chunk) @@ -302,17 +411,17 @@ def fail(error_type, detail, http_status=None): _rm(tmp_path) return fail(ddb.ERR_TRUNCATED, f"expected {expected} bytes, got {written}", - status) + status, ctype) except ValueError: pass if written < 1024: _rm(tmp_path) - return fail(ddb.ERR_TRUNCATED, f"only {written} bytes", status) + return fail(ddb.ERR_TRUNCATED, f"only {written} bytes", + status, ctype) - os.replace(tmp_path, local_path) - return {"ok": True, "size": written, "http_status": status, - "error_type": None, "error_detail": None, "host": host} + return {"ok": True, "host": host, "http_status": status, + "content_type": ctype, "size": written} except (ConnectTimeout, ReadTimeout, Timeout) as e: _rm(tmp_path) @@ -330,79 +439,99 @@ def fail(error_type, detail, http_status=None): return fail(ddb.ERR_OTHER, str(e)) -def resize_image(gbif_id, local_path): - changed, new_size = resize_with_aspect_ratio( - local_path, local_path, max_size=1024, format="JPEG", quality=85) - if changed: - logger.info(f"Resized {gbif_id} to {new_size} at {local_path}") +def _finalize_download(gbif_id, image_no, image_url, res, tmp_path): + """Turn a downloaded temp file into the final image, or keep it raw.""" + try: + # Decode + resize as a normal image (JPEG/TIFF/PNG/...), in place. 
+ resize_with_aspect_ratio(tmp_path, tmp_path, max_size=1024, + format="JPEG", quality=85) + except (OSError, UnidentifiedImageError) as e: + # Pillow cannot decode it. If it is a real image format (DNG etc.), + # keep the raw file for a later conversion pass; otherwise discard. + ext = raw_keep_extension(res["content_type"], image_url) + if ext: + raw_path = image_path(gbif_id, image_no, ext) + os.replace(tmp_path, raw_path) + try: + size = os.path.getsize(raw_path) + except OSError: + size = res["size"] + logger.warning(f"Kept raw image {gbif_id} #{image_no} " + f"({res['content_type']}) at {raw_path}") + return {"outcome": "success", "db_status": ddb.ST_SUCCESS, + "http_status": 200, "error_type": ddb.ERR_RAW_UNPROCESSED, + "error_detail": f"kept raw: {res['content_type'] or 'unknown'}", + "host": res["host"], "file_path": raw_path, "file_size": size} + _rm(tmp_path) + return {"outcome": "failed", "db_status": ddb.ST_FAILED_PERMANENT, + "http_status": 200, "error_type": ddb.ERR_NOT_IMAGE, + "error_detail": str(e), "host": res["host"], + "file_path": None, "file_size": None} + + jpg_path = image_path(gbif_id, image_no, ".jpg") + os.replace(tmp_path, jpg_path) + try: + size = os.path.getsize(jpg_path) + except OSError: + size = res["size"] + return {"outcome": "success", "db_status": ddb.ST_SUCCESS, "http_status": 200, + "error_type": None, "error_detail": None, "host": res["host"], + "file_path": jpg_path, "file_size": size} -def resolve_and_download(gbif_id, identifier_url, local_path): +def resolve_and_download(gbif_id, image_no, candidate_urls): """ - Download the image for one source identifier (one img_index) and save it as - exactly one file at local_path. - - For a plain URL there is one candidate. For a IIIF manifest the manifest is - expanded into resolution variants and tried highest-first; the first success - wins, so still only one file is saved per identifier. + Fetch one distinct image and save it as exactly one file. 
- Returns a result dict with keys: outcome ('success' | 'failed' | - 'deferred'), db_status, http_status, error_type, error_detail, host, - file_size. 'deferred' means every candidate host was blocked/circuit-broken, - so the image was not really attempted and should stay 'pending'. + candidate_urls are this image's URLs from the database, best-resolution + first. IIIF manifests among them are expanded into image URLs. The first + URL that yields a usable image wins. Returns an outcome dict; outcome is + 'success', 'failed', or 'deferred' (every candidate host was blocked, so + the image was not really attempted and should stay 'pending'). """ - if "/manifest" in identifier_url or identifier_url.endswith(".json"): - candidates, manifest_err = extract_image_from_iiif_manifest( - identifier_url, gbif_id) - if not candidates: - return {"outcome": "failed", - "db_status": ddb.status_for_error(manifest_err), - "http_status": None, "error_type": manifest_err, - "error_detail": "IIIF manifest yielded no image URLs", - "host": _host_from_url(identifier_url), "file_size": None} - else: - candidates = [identifier_url] + resolved, manifest_err = [], None + for url in candidate_urls: + if is_manifest_url(url): + extracted, err = extract_image_from_iiif_manifest(url, gbif_id) + if extracted: + resolved.extend(extracted) + elif err: + manifest_err = err + else: + resolved.append(url) - # Deduplicate while preserving the highest-resolution-first order. 
seen, ordered = set(), [] - for url in candidates: + for url in resolved: if url not in seen: seen.add(url) ordered.append(url) - failures = [] - attempted_any = False + if not ordered: + et = manifest_err or ddb.ERR_NO_URL + return {"outcome": "failed", "db_status": ddb.status_for_error(et), + "http_status": None, "error_type": et, + "error_detail": "no downloadable image URL for this image", + "host": None, "file_path": None, "file_size": None} + + tmp_path = image_path(gbif_id, image_no, ".tmp") + os.makedirs(os.path.dirname(tmp_path), exist_ok=True) + + failures, attempted = [], False for url in ordered: if is_host_circuit_broken(url) or is_host_blocked(url): continue - attempted_any = True - result = download_one_url(gbif_id, url, local_path) - if result["ok"]: - try: - resize_image(gbif_id, local_path) - except (OSError, UnidentifiedImageError) as e: - _rm(local_path) - return {"outcome": "failed", "db_status": ddb.ST_FAILED_PERMANENT, - "http_status": result["http_status"], - "error_type": ddb.ERR_NOT_IMAGE, - "error_detail": str(e), "host": result["host"], - "file_size": None} - try: - size = os.path.getsize(local_path) - except OSError: - size = result["size"] - return {"outcome": "success", "db_status": ddb.ST_SUCCESS, - "http_status": 200, "error_type": None, - "error_detail": None, "host": result["host"], - "file_size": size} - failures.append(result) + attempted = True + res = download_one_url(gbif_id, url, tmp_path) + if res["ok"]: + return _finalize_download(gbif_id, image_no, url, res, tmp_path) + failures.append(res) - if not attempted_any: + if not attempted: # Every candidate's host was blocked -- leave the image 'pending'. return {"outcome": "deferred"} # Prefer a transient failure as the recorded reason: if any candidate could - # still succeed later, the whole identifier is worth retrying. + # still succeed later, the whole image is worth retrying. 
transient = [f for f in failures if not ddb.is_permanent(f["error_type"])] chosen = transient[0] if transient else failures[0] db_status = ddb.ST_FAILED_TRANSIENT if transient else ddb.ST_FAILED_PERMANENT @@ -410,17 +539,30 @@ def resolve_and_download(gbif_id, identifier_url, local_path): "http_status": chosen["http_status"], "error_type": chosen["error_type"], "error_detail": chosen["error_detail"], - "host": chosen["host"], "file_size": None} + "host": chosen["host"], "file_path": None, "file_size": None} # ---- per-gbifID processing --------------------------------------------------- +def _existing_file(gbif_id, image_no): + """Return (path, is_raw) if a valid file is already on disk, else None.""" + for ext in (".jpg",) + RAW_EXTS: + path = image_path(gbif_id, image_no, ext) + if os.path.exists(path): + try: + if get_file_size_in_mb(path) >= MIN_IMAGE_MB: + return path, (ext != ".jpg") + except OSError: + pass + return None + + def process_id(db, gbif_id, total_to_install): - """Download every not-yet-done image for one gbifID and update the DB.""" + """Download every not-yet-done distinct image for one gbifID.""" global n_installed images = db.get_images_for(gbif_id) - for img_index, url, _host, status, attempts in images: + for image_no, image_key, urls_str, host, status, attempts in images: # Skip images that are already finished or have exhausted their retries. if status == ddb.ST_SUCCESS: continue @@ -429,33 +571,33 @@ def process_id(db, gbif_id, total_to_install): if status == ddb.ST_FAILED_TRANSIENT and attempts >= db.max_attempts: continue - suffix = f"-{img_index:02d}" - local_path = get_hierarchical_path(INSTALL_PATH, gbif_id, suffix) - - # If a valid file is already on disk, record it without downloading. - if os.path.exists(local_path): + # If a valid file is already on disk (a previous run, or the legacy + # import), record it without downloading. 
+ existing = _existing_file(gbif_id, image_no) + if existing: + path, is_raw = existing try: - size_mb = get_file_size_in_mb(local_path) + size = os.path.getsize(path) except OSError: - size_mb = 0.0 - if size_mb >= MIN_IMAGE_MB: - db.record_image_result( - gbif_id, img_index, ddb.ST_SUCCESS, - host=_host_from_url(url), http_status=200, - file_path=local_path, file_size=int(size_mb * 1024 * 1024), - increment_attempts=False) - continue + size = None + db.record_image_result( + gbif_id, image_no, ddb.ST_SUCCESS, host=host, http_status=200, + error_type=ddb.ERR_RAW_UNPROCESSED if is_raw else None, + file_path=path, file_size=size, increment_attempts=False) + continue - result = resolve_and_download(gbif_id, url, local_path) + candidate_urls = [u for u in urls_str.split("\n") if u] + result = resolve_and_download(gbif_id, image_no, candidate_urls) if result["outcome"] == "deferred": continue # host blocked; leave 'pending' for a later run db.record_image_result( - gbif_id, img_index, result["db_status"], - host=result.get("host"), http_status=result.get("http_status"), + gbif_id, image_no, result["db_status"], + host=result.get("host") or host, + http_status=result.get("http_status"), error_type=result.get("error_type"), error_detail=result.get("error_detail"), - file_path=local_path if result["outcome"] == "success" else None, + file_path=result.get("file_path"), file_size=result.get("file_size")) if result["outcome"] == "success": @@ -525,6 +667,7 @@ def main(): send_notification("Image Installation", f"Starting run: {total_to_install} gbifIDs to process.") + counts = {} try: with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: for start in range(0, total_to_install, WORK_CHUNK): diff --git a/init_download_db.py b/init_download_db.py index dac6214..899f980 100644 --- a/init_download_db.py +++ b/init_download_db.py @@ -5,27 +5,27 @@ What it does ------------ 1. Creates the SQLite schema (see download_db.py). -2. 
Reads multimedia.txt once and loads every (gbifID, image URL) pair into the - `images` table and every gbifID into `gbif_ids`. After this, runs of - image_install_db.py no longer need to re-read and re-group the 59M-row +2. Reads multimedia.txt once and groups its rows into *distinct images*: a IIIF + manifest plus the resolution variants of one specimen collapse to a single + image (see download_db.canonical_image_key). Each distinct image becomes one + row in `images`, carrying all of its candidate URLs; `gbif_ids` gets one row + per gbifID. After this, runs of image_install_db.py never re-read the 59M-row multimedia.txt -- the work queue lives in the database. 3. Imports processed_ids.txt: for each already-finished gbifID it locates the downloaded file, renames legacy `.jpg` to `-00.jpg` so the dataset - uses one consistent naming scheme, and marks image index 0 as 'success'. - gbifIDs with more than one image are left 'partial' so the multi-image - downloader goes back and fetches their remaining images. + uses one consistent naming scheme, and marks image 0 as 'success'. gbifIDs + with more than one distinct image are left 'partial' so the downloader goes + back and fetches their remaining images. NOTE: the old one-image-per-ID downloader shuffled candidate URLs, so for a - multi-image gbifID we cannot know which URL the existing file came from. It - is recorded against img_index 0 with error_type 'legacy_unverified_index'. - ~87% of gbifIDs have only one image, where this assignment is exact. + multi-image gbifID we cannot know which image the existing file is. It is + recorded against image 0 with error_type 'legacy_unverified_index'. failed_ids.txt is intentionally NOT imported: those IDs stay 'pending' and get a fresh, fully-tracked retry. This script is destructive-ish (it renames files and can drop an existing DB -with --reset). It does not download anything. Run it once before the first -run of image_install_db.py. +with --reset). 
It does not download anything. The legacy import is idempotent and resumable: if it is interrupted, re-run with --legacy-only to finish it without redoing the multimedia ingest. @@ -69,7 +69,7 @@ def progress(msg): def hierarchical_path(base_dir, gbif_id, suffix=""): - """Mirror image_install_db.get_hierarchical_path (without makedirs).""" + """Path for a stored image (mirrors image_install_db.image_path, no mkdir).""" stem = str(gbif_id) prefix1 = stem[:3] if len(stem) >= 3 else stem prefix2 = stem[3:6] if len(stem) >= 6 else "000" @@ -98,7 +98,7 @@ def parse_args(): def ingest_multimedia(conn, multimedia_path): - """Load every image URL from multimedia.txt into images + gbif_ids.""" + """Group multimedia.txt rows into distinct images and load images + gbif_ids.""" print(f"Reading {multimedia_path} ...") df = pd.read_csv( multimedia_path, @@ -111,37 +111,73 @@ def ingest_multimedia(conn, multimedia_path): df["identifier"] = df["identifier"].astype("string") print(f" {len(df):,} (gbifID, URL) rows") - # Sort so each gbifID's rows are contiguous, then number them 0,1,2,... + # Stable sort -> each gbifID's rows keep their multimedia.txt order. df = df.sort_values("gbifID", kind="stable").reset_index(drop=True) - df["img_index"] = df.groupby("gbifID").cumcount() + + print(" Computing canonical image keys ...") + df["image_key"] = df["identifier"].map(ddb.canonical_image_key).astype("string") + + # image_no: dense rank of image_key within each gbifID, by first appearance. + df["is_new"] = ~df.duplicated(["gbifID", "image_key"]) + df["image_no"] = df.groupby("gbifID")["is_new"].cumsum().astype("int32") - 1 + + # Order a distinct image's candidate URLs: highest resolution first, IIIF + # manifests last (they are only a fallback -- expanded at download time). 
+ df["is_manifest"] = df["identifier"].str.contains( + "/manifest", case=False, na=False).astype("int8") + size = pd.to_numeric( + df["identifier"].str.extract(r"/full/(\d+),", expand=False), + errors="coerce") + big = df["identifier"].str.contains( + r"/full/(?:max|full)/", case=False, na=False, regex=True) + size = size.where(~big, 100000) + size = size.mask(df["is_manifest"] == 1, 1600) # manifest expands to ~1600 + df["eff_size"] = size.fillna(0).astype("int32") + df["host"] = ( df["identifier"].str.extract(r"^[a-zA-Z][a-zA-Z0-9+.-]*://([^/:]+)", expand=False) - .fillna("") + .fillna("").astype("string") + ) + + df = df.sort_values( + ["gbifID", "image_no", "eff_size", "is_manifest"], + ascending=[True, True, False, True], kind="stable") + + print(" Grouping rows into distinct images ...") + images = ( + df.groupby(["gbifID", "image_no"], sort=True) + .agg(urls=("identifier", "\n".join), + image_key=("image_key", "first"), + host=("host", "first")) + .reset_index() ) + del df + print(f" {len(images):,} distinct images") print(" Inserting image rows ...") inserted = 0 - for start in range(0, len(df), INSERT_BATCH): - sub = df.iloc[start:start + INSERT_BATCH] + for start in range(0, len(images), INSERT_BATCH): + sub = images.iloc[start:start + INSERT_BATCH] rows = list(zip( sub["gbifID"].tolist(), - sub["img_index"].tolist(), - sub["identifier"].tolist(), + sub["image_no"].tolist(), + sub["image_key"].tolist(), + sub["urls"].tolist(), sub["host"].tolist(), )) conn.executemany( - "INSERT OR IGNORE INTO images(gbif_id, img_index, url, host) " - "VALUES(?,?,?,?)", + "INSERT OR IGNORE INTO images" + "(gbif_id, image_no, image_key, urls, host) VALUES(?,?,?,?,?)", rows, ) conn.commit() inserted += len(rows) - progress(f" {inserted:,}/{len(df):,} image rows") - print(f" {inserted:,} image rows inserted ") + progress(f" {inserted:,}/{len(images):,} image rows") + print(f" {inserted:,} distinct images inserted ") print(" Inserting gbifID rows ...") - sizes = 
df.groupby("gbifID").size() + sizes = images.groupby("gbifID").size() gid_rows = list(zip(sizes.index.tolist(), sizes.tolist())) for start in range(0, len(gid_rows), INSERT_BATCH): conn.executemany( @@ -173,7 +209,7 @@ def flush(batch): "UPDATE images SET status='success', " " error_type=?, file_path=?, file_size=?, " " last_attempt_at=datetime('now') " - "WHERE gbif_id=? AND img_index=0", + "WHERE gbif_id=? AND image_no=0", batch, ) conn.commit() diff --git a/init_download_db.sh b/init_download_db.sh index f42c1bb..80872d1 100755 --- a/init_download_db.sh +++ b/init_download_db.sh @@ -12,15 +12,17 @@ module load academic-ml/spring-2026 conda activate spring-2026-pyt +# The build mode is taken from the qsub command line. Submit with one of: +# qsub ... init_download_db.sh --reset # full (re)build from scratch +# qsub ... init_download_db.sh --legacy-only # only (re-)run the legacy import # --processed-file points at the production processed_ids.txt, which lives in # ljhao's working directory, not this repo. -# --legacy-only means only process the legacy images without creating the database. -# Run this on subsequent times after db was created. -python init_download_db.py --legacy-only \ +python init_download_db.py "$@" \ --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt # The other big initial run is tracked in # /projectnb/herbdl/workspaces/tsehou26/herbarium_project/utils/processed_ids.txt and .../failed_ids.txt -### The command below is used to submit the job to the cluster -### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh +### The command below is used to submit the job to the cluster. 
Use --reset for +### the first build on the new distinct-image schema; --legacy-only for later top-ups: +### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh --reset diff --git a/status_report.py b/status_report.py index 9c8e4b5..2e71767 100644 --- a/status_report.py +++ b/status_report.py @@ -6,18 +6,25 @@ checkpoint files and re-grouping the 59M-row multimedia.txt with pandas, every number here is a single indexed SQL query, so the report returns in seconds. +Counts are over *distinct images* (a IIIF manifest plus its resolution variants +count once -- see download_db.canonical_image_key). + The same numbers are available ad hoc -- a few useful queries: -- how many of each kind of failure? SELECT error_type, COUNT(*) FROM images WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; - -- every URL still worth retrying - SELECT gbif_id, url FROM images WHERE status='failed_transient'; + -- every image still worth retrying + SELECT gbif_id, image_no, urls FROM images WHERE status='failed_transient'; + + -- URLs that returned an HTML/text page, with the captured message + SELECT host, error_detail FROM images + WHERE error_type='invalid_content_type' GROUP BY host; - -- worst hosts - SELECT host, COUNT(*) FROM images WHERE status LIKE 'failed%' - GROUP BY host ORDER BY 2 DESC LIMIT 20; + -- raw files (DNG etc.) 
kept for a later conversion pass + SELECT gbif_id, image_no, file_path FROM images + WHERE error_type='raw_unprocessed'; Usage: python status_report.py [--db PATH] [--output-dir DIR] @@ -80,16 +87,21 @@ def section(title): write(f"Still in the work queue: {remaining:,}") # -- per-image progress ---------------------------------------------- - section("IMAGE (URL) PROGRESS") + section("DISTINCT-IMAGE PROGRESS") img_counts = dict(conn.execute( "SELECT status, COUNT(*) FROM images GROUP BY status").fetchall()) total_imgs = sum(img_counts.values()) - write(f"Total image URLs: {total_imgs:,}") + write(f"Total distinct images: {total_imgs:,}") for status in (ddb.ST_SUCCESS, ddb.ST_PENDING, ddb.ST_FAILED_TRANSIENT, ddb.ST_FAILED_PERMANENT): count = img_counts.get(status, 0) pct = (count / total_imgs * 100) if total_imgs else 0.0 write(f" {status:18s} {count:>14,} ({pct:5.2f}%)") + raw_kept = conn.execute( + "SELECT COUNT(*) FROM images WHERE error_type=?", + (ddb.ERR_RAW_UNPROCESSED,)).fetchone()[0] + write(f" (of 'success', kept raw -- DNG etc., need conversion: " + f"{raw_kept:,})") # -- failure breakdown ------------------------------------------------ section("FAILURES BY TYPE") @@ -117,6 +129,23 @@ def section(title): if not rows: write("(none)") + # -- non-image (HTML/text) responses --------------------------------- + section("NON-IMAGE RESPONSES BY HOST (for follow-up)") + write("Hosts whose URLs returned an HTML/text page instead of an image") + write("(e.g. 'direct download no longer supported'). Sample message shown.") + write("-" * 70) + rows = conn.execute( + "SELECT host, COUNT(*) AS n, MIN(error_detail) " + "FROM images WHERE error_type=? 
" + "GROUP BY host ORDER BY n DESC LIMIT 20", + (ddb.ERR_INVALID_CONTENT,)).fetchall() + for host, count, sample in rows: + write(f"{(host or '?')[:45]:45s} {count:>10,}") + if sample: + write(f" {sample[:200].strip()}") + if not rows: + write("(none recorded yet)") + # -- worst hosts ------------------------------------------------------ section("TOP 20 HOSTS BY FAILED IMAGES") write(f"{'host':40s} {'failed':>10s} {'success':>10s}") @@ -142,11 +171,15 @@ def section(title): write(f"Hosts currently in cooldown: {blocked:,}") section("NOTES") - write("- 'done' = every image URL for the gbifID succeeded.") + write("- Counts are over distinct images: a IIIF manifest plus its") + write(" resolution variants count as one image.") + write("- 'done' = every distinct image for the gbifID succeeded.") write("- 'partial' = still has retryable work; stays in the queue.") write("- 'failed' = all images terminal, not all succeeded; no retries left.") write("- failed_transient images are retried until " f"{ddb.MAX_ATTEMPTS} attempts, then count toward 'failed'.") + write("- 'raw_unprocessed' images ARE downloaded (status success); they") + write(" are raw files (DNG etc.) awaiting a separate conversion pass.") write() write(f"Summary written to: {os.path.abspath(output_file)}") From 138974c92cac9c26190776f2f5fe81610b4043ac Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Sun, 24 May 2026 15:57:13 -0400 Subject: [PATCH 11/19] stream the distinct-image groupby so ingest fits in wall-clock The previous ingest used df.groupby([gbifID, image_no]).agg("\n".join, image_key="first", host="first") to build one row per distinct image. With ~52M groups and a Python callable, that one statement did not finish inside the 12 h qsub limit -- the job died before ever reaching the first INSERT, so nothing was committed. 
Replace it with a single-pass streaming groupby that walks the already- sorted rows in native Python lists and emits one INSERT batch per chunk of distinct images (commit every 200k images, so partial progress is saved on a future kill too). Replace the pandas gbif_ids build with one SQL `INSERT INTO gbif_ids SELECT gbif_id, COUNT(*) FROM images GROUP BY gbif_id` which rides the (gbif_id, image_no) primary-key index. Co-Authored-By: Claude Opus 4.7 --- init_download_db.py | 95 +++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/init_download_db.py b/init_download_db.py index 899f980..15e1a7e 100644 --- a/init_download_db.py +++ b/init_download_db.py @@ -40,6 +40,7 @@ import os import sys +import gc import time import sqlite3 import argparse @@ -144,48 +145,66 @@ def ingest_multimedia(conn, multimedia_path): ["gbifID", "image_no", "eff_size", "is_manifest"], ascending=[True, True, False, True], kind="stable") - print(" Grouping rows into distinct images ...") - images = ( - df.groupby(["gbifID", "image_no"], sort=True) - .agg(urls=("identifier", "\n".join), - image_key=("image_key", "first"), - host=("host", "first")) - .reset_index() - ) - del df - print(f" {len(images):,} distinct images") - - print(" Inserting image rows ...") - inserted = 0 - for start in range(0, len(images), INSERT_BATCH): - sub = images.iloc[start:start + INSERT_BATCH] - rows = list(zip( - sub["gbifID"].tolist(), - sub["image_no"].tolist(), - sub["image_key"].tolist(), - sub["urls"].tolist(), - sub["host"].tolist(), - )) + print(" Streaming distinct images into the database ...") + # Single-pass streaming groupby. The previous pandas + # .agg('\n'.join, ...) over ~52M groups was the bottleneck (did not + # finish inside a 12 h job). Pull the sorted columns into native + # Python lists and emit one INSERT batch per chunk of distinct images. 
+ gid_l = df["gbifID"].tolist(); del df["gbifID"] + no_l = df["image_no"].tolist(); del df["image_no"] + url_l = df["identifier"].tolist(); del df["identifier"] + key_l = df["image_key"].tolist(); del df["image_key"] + host_l = df["host"].tolist(); del df + gc.collect() + n_rows = len(gid_l) + + img_batch = [] + distinct = 0 + cur_gid = cur_no = None + cur_key = cur_host = None + cur_urls = [] + + def flush(): + if not img_batch: + return conn.executemany( "INSERT OR IGNORE INTO images" "(gbif_id, image_no, image_key, urls, host) VALUES(?,?,?,?,?)", - rows, - ) - conn.commit() - inserted += len(rows) - progress(f" {inserted:,}/{len(images):,} image rows") - print(f" {inserted:,} distinct images inserted ") - - print(" Inserting gbifID rows ...") - sizes = images.groupby("gbifID").size() - gid_rows = list(zip(sizes.index.tolist(), sizes.tolist())) - for start in range(0, len(gid_rows), INSERT_BATCH): - conn.executemany( - "INSERT OR IGNORE INTO gbif_ids(gbif_id, n_images) VALUES(?,?)", - gid_rows[start:start + INSERT_BATCH], - ) + img_batch) conn.commit() - print(f" {len(gid_rows):,} gbifIDs inserted") + img_batch.clear() + + for i in range(n_rows): + g = gid_l[i] + n = no_l[i] + if g != cur_gid or n != cur_no: + if cur_gid is not None: + img_batch.append((cur_gid, cur_no, cur_key, + "\n".join(cur_urls), cur_host)) + distinct += 1 + if len(img_batch) >= INSERT_BATCH: + flush() + progress(f" {distinct:,} images inserted " + f"({100 * i / n_rows:.1f}% through input)") + cur_gid, cur_no = g, n + cur_key, cur_host = key_l[i], host_l[i] + cur_urls = [url_l[i]] + else: + cur_urls.append(url_l[i]) + if cur_gid is not None: + img_batch.append((cur_gid, cur_no, cur_key, + "\n".join(cur_urls), cur_host)) + distinct += 1 + flush() + print(f" {distinct:,} distinct images inserted ") + + print(" Populating gbif_ids from images ...") + conn.execute( + "INSERT INTO gbif_ids(gbif_id, n_images) " + "SELECT gbif_id, COUNT(*) FROM images GROUP BY gbif_id") + conn.commit() + n_gbif = 
conn.execute("SELECT COUNT(*) FROM gbif_ids").fetchone()[0] + print(f" {n_gbif:,} gbifIDs inserted") def import_legacy(conn, processed_file, install_path): From 54fb3e31de6aa741f7da0cb5835565dfb84cbbf1 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Tue, 26 May 2026 15:17:55 -0400 Subject: [PATCH 12/19] chunk the gbif_ids recompute and add --finalize-only resume mode The final step of the legacy import rolled images.status up into gbif_ids.status via one giant UPDATE with three correlated COUNT subqueries plus an `IN (SELECT DISTINCT gbif_id FROM images ...)` filter. SQLite's planner pessimised it badly enough that it did not finish in 24 h, and because the whole thing was a single transaction, nothing was committed and no progress was visible. Replace it with _finalize_gbif_ids_status(): one SELECT gbif_id, COUNT(*) FROM images WHERE status='success' GROUP BY gbif_id to materialise per-gbifID success counts, an in-memory n_images map from gbif_ids, then chunked UPDATEs of 50k rows each with a commit and a progress print per batch. A kill mid-finalize now leaves a partially updated table and re-running is a safe idempotent retry. Add --finalize-only so the finalize can be re-run without redoing the disk re-scan -- useful when import_legacy completed its per-image flushes but the recompute did not. Co-Authored-By: Claude Opus 4.7 --- init_download_db.py | 93 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/init_download_db.py b/init_download_db.py index 15e1a7e..0b7ed1e 100644 --- a/init_download_db.py +++ b/init_download_db.py @@ -29,12 +29,15 @@ The legacy import is idempotent and resumable: if it is interrupted, re-run with --legacy-only to finish it without redoing the multimedia ingest. +The final "roll up images.status into gbif_ids" step is also resumable on its +own -- re-run with --finalize-only to redo just that, no disk re-scan. 
Usage ----- python init_download_db.py # build DB + import legacy python init_download_db.py --skip-legacy # build DB only python init_download_db.py --legacy-only # (re-)run only the legacy import + python init_download_db.py --finalize-only # only roll images -> gbif_ids python init_download_db.py --reset # rebuild from scratch """ @@ -93,6 +96,9 @@ def parse_args(): p.add_argument("--legacy-only", action="store_true", help="Skip the ingest; only (re-)run the legacy import on an " "existing database (use this to finish an interrupted import)") + p.add_argument("--finalize-only", action="store_true", + help="Skip ingest and the disk re-scan; only (re-)roll " + "images.status up into gbif_ids.status on an existing DB") p.add_argument("--reset", action="store_true", help="Delete an existing database before building") return p.parse_args() @@ -281,27 +287,61 @@ def flush(batch): print(f" renamed={renamed:,} already-suffixed={relabeled:,} " f"file-missing={missing:,}") - # Roll the per-image success flags up into gbif_ids statuses in one pass. - print(" Recomputing gbifID statuses ...") - conn.execute( - "UPDATE gbif_ids SET " - " n_success=(SELECT COUNT(*) FROM images i " - " WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success'), " - " status=CASE " - " WHEN n_images>0 AND n_images=(SELECT COUNT(*) FROM images i " - " WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success') " - " THEN 'done' " - " WHEN (SELECT COUNT(*) FROM images i " - " WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success')>0 " - " THEN 'partial' " - " ELSE 'pending' END " - "WHERE gbif_id IN (SELECT DISTINCT gbif_id FROM images " - " WHERE status='success')" - ) + # Roll the per-image success flags up into gbif_ids statuses. + _finalize_gbif_ids_status(conn) + + +def _finalize_gbif_ids_status(conn): + """ + Roll images.status up into gbif_ids.status (done / partial / pending), + set n_success, and stamp completed_at on freshly-done IDs. 
+ + Chunked and committed per batch so progress is visible and a kill mid-run + is resumable -- a re-run just re-applies the same UPDATEs. + + A previous one-statement UPDATE with three correlated COUNT subqueries + and an `IN (SELECT DISTINCT ...)` filter did not finish in 24 h; this + Python-side pass takes well under an hour. + """ + print(" Counting success images per gbifID ...") + success_counts = conn.execute( + "SELECT gbif_id, COUNT(*) FROM images " + "WHERE status='success' GROUP BY gbif_id ORDER BY gbif_id" + ).fetchall() + print(f" {len(success_counts):,} gbifIDs have at least one success image") + + print(" Loading n_images per gbifID ...") + n_images_map = dict(conn.execute( + "SELECT gbif_id, n_images FROM gbif_ids" + ).fetchall()) + print(f" {len(n_images_map):,} gbifIDs total") + + print(f" Updating gbif_ids statuses ...") + BATCH = 50_000 + updated = 0 + for start in range(0, len(success_counts), BATCH): + chunk = success_counts[start:start + BATCH] + rows = [] + for gid, n_success in chunk: + n_total = n_images_map.get(gid, 0) + if n_total > 0 and n_success >= n_total: + status = "done" + elif n_success > 0: + status = "partial" + else: + status = "pending" + rows.append((n_success, status, gid)) + conn.executemany( + "UPDATE gbif_ids SET n_success=?, status=? WHERE gbif_id=?", rows) + conn.commit() + updated += len(rows) + progress(f" {updated:,}/{len(success_counts):,} gbif_ids updated") + print(f" {updated:,} gbif_ids updated ") + + print(" Setting completed_at for done gbifIDs ...") conn.execute( "UPDATE gbif_ids SET completed_at=datetime('now') " - "WHERE status='done' AND completed_at IS NULL" - ) + "WHERE status='done' AND completed_at IS NULL") conn.commit() @@ -358,6 +398,21 @@ def main(): print(f"\nDone in {time.time() - start:.0f}s.") return + # --finalize-only: just (re-)roll images.status into gbif_ids.status. Use + # this to finish a run that was killed during the recompute step. 
+ if args.finalize_only: + if not os.path.exists(args.db): + sys.exit(f"--finalize-only needs an existing database, but none was " + f"found at: {args.db}\nRun the full build first.") + print(f"--finalize-only: rolling images.status up into gbif_ids on " + f"{args.db}") + conn = connect(args.db, bulk_load=False) + _finalize_gbif_ids_status(conn) + report_status(conn) + conn.close() + print(f"\nDone in {time.time() - start:.0f}s.") + return + if os.path.exists(args.db): if args.reset: print(f"Removing existing database {args.db}") From f72673cf81159e8d09989bc1da4b3d7ac4c06d91 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Tue, 26 May 2026 15:24:27 -0400 Subject: [PATCH 13/19] document --finalize-only and the three resumable build stages Update DEPLOYMENT.md for the recent init_download_db.py changes: - Phase 1 qsub example now passes --reset and explains the wrapper forwards "$@" through, so --legacy-only and --finalize-only work the same way. - Options table gains a --finalize-only row. - "Re-running is safe" rewritten as three resumable stages: ingest (--reset), legacy import (--legacy-only), finalize (--finalize-only). - Troubleshooting gets a row for jobs killed during "Recomputing gbifID statuses". Cross-link gbif-metadata-download.md to DEPLOYMENT.md so the GBIF query how-to ends pointing at the project-specific build/download pipeline. Co-Authored-By: Claude Opus 4.7 --- DEPLOYMENT.md | 26 +++++++++++++++++++------- docs/gbif-metadata-download.md | 15 ++++++++++++++- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md index f9fce1c..2f675e2 100644 --- a/DEPLOYMENT.md +++ b/DEPLOYMENT.md @@ -61,17 +61,21 @@ It is heavy — it reads the ~59M-row `multimedia.txt` with pandas and renames u to ~13.5M files. 
**Run it as a batch job, not on a login node.** ```bash +# fresh build (use --reset if a DB already exists at the destination): qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl \ - -m beas -M your_email@bu.edu init_download_db.sh + -m beas -M your_email@bu.edu init_download_db.sh --reset ``` -`init_download_db.sh` runs: +`init_download_db.sh` forwards its arguments through to the python script: ```bash -python init_download_db.py \ +python init_download_db.py "$@" \ --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt ``` +So `qsub ... init_download_db.sh --reset`, `... --legacy-only`, and +`... --finalize-only` all work without editing the wrapper. + > **Important:** the production `processed_ids.txt` (~13.5M IDs) lives in > ljhao's working directory, not in this repo. The wrapper already points there. > If you build the DB by hand, pass that `--processed-file` path explicitly, or @@ -84,6 +88,7 @@ python init_download_db.py \ | `python init_download_db.py` | Build DB + import legacy progress | | `python init_download_db.py --skip-legacy` | Build DB only (everything starts `pending`) | | `python init_download_db.py --legacy-only` | Skip the ingest; only (re-)run the legacy import on an existing DB | +| `python init_download_db.py --finalize-only` | Skip ingest and disk re-scan; only (re-)roll `images.status` up into `gbif_ids.status` | | `python init_download_db.py --reset` | Delete an existing DB and rebuild from scratch | **Expected output** — a status breakdown, e.g.: @@ -100,10 +105,16 @@ Final gbifID status counts: - `pending` — never attempted Re-running is safe: file renames and database updates are idempotent -(already-renamed files are detected and reused). If the **ingest** fails partway, -re-run with `--reset`. If only the **legacy import** fails partway (e.g. it was -interrupted), re-run with `--legacy-only` — that finishes the import without -redoing the hour-long ingest. 
+(already-renamed files are detected and reused). The build has three resumable +stages: + +1. **Ingest** (`multimedia.txt` → `images`/`gbif_ids`). If it fails partway, + re-run with `--reset`. +2. **Legacy import** (disk re-scan + mark image 0 success per gbifID). If + killed partway, re-run with `--legacy-only` — skips the hour-long ingest. +3. **Finalize** (roll `images.status` up into `gbif_ids.status` in 50k-row + batches). If killed partway, re-run with `--finalize-only` — skips ingest + AND the disk re-scan; takes well under an hour. --- @@ -241,5 +252,6 @@ must be re-run. | `Database already exists` from the builder | Intended guard — `--reset` to rebuild, or `--legacy-only` to just (re-)run the legacy import. | | `database is locked` | The builder now uses WAL mode (readers do not block the writer) and a 120 s busy timeout, so this should not recur. If the legacy import was interrupted by it, finish it with `init_download_db.py --legacy-only`. Still avoid running two writers against one DB. | | Legacy import interrupted partway | Re-run `init_download_db.py --legacy-only` — it is idempotent and skips the hour-long ingest. | +| Killed during "Recomputing gbifID statuses" | Re-run with `--finalize-only` — it skips ingest and the disk re-scan, commits every 50k gbif_ids, and finishes in under an hour. The previous one-shot `UPDATE` (correlated subqueries + `IN (SELECT DISTINCT …)`) didn't finish in 24h; the chunked replacement does. | | Builder runs out of memory | `multimedia.txt` is large; request more memory (e.g. a larger `-pe omp` slot count). | | Legacy progress not imported | `--processed-file` was not pointed at ljhao's `processed_ids.txt`. 
| diff --git a/docs/gbif-metadata-download.md b/docs/gbif-metadata-download.md index 003f695..1e5a9d3 100644 --- a/docs/gbif-metadata-download.md +++ b/docs/gbif-metadata-download.md @@ -68,4 +68,17 @@ You can send a JSON object to `https://api.gbif.org/v1/occurrence/download/reque ### Pro-Tip for Data Handling -Once your download is processed and unzipped, the `multimedia.txt` file will serve as your master list for image links, which you can link back to the metadata in `occurrence.txt` using the shared `gbifID` column. If you are using Python, you can also use the [`plantnet/gbif-dl`](https://www.google.com/search?q=%5Bhttps://github.com/plantnet/gbif-dl%5D(https://github.com/plantnet/gbif-dl)) library specifically designed to parse these queries and download the image assets efficiently. \ No newline at end of file +Once your download is processed and unzipped, the `multimedia.txt` file will serve as your master list for image links, which you can link back to the metadata in `occurrence.txt` using the shared `gbifID` column. If you are using Python, you can also use the [`plantnet/gbif-dl`](https://github.com/plantnet/gbif-dl) library specifically designed to parse these queries and download the image assets efficiently. + +--- + +### Next: download the images for this project + +Once you have a fresh `multimedia.txt`, place it at +`/projectnb/herbdl/data/GBIF-F25/multimedia.txt` and follow +[DEPLOYMENT.md](../DEPLOYMENT.md) — it builds a SQLite status database from the +file (one row per *distinct image*, with IIIF manifest + resolution variants +deduplicated), imports any prior `processed_ids.txt` progress, and runs the +parallel downloader. The build has three resumable stages (ingest, legacy +import, finalize), each with its own `--reset` / `--legacy-only` / +`--finalize-only` flag.
\ No newline at end of file From 11c4ac3ec4e02d7cd7ea64e8b7d3016235893f07 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Thu, 28 May 2026 17:25:38 -0400 Subject: [PATCH 14/19] open status_report.py read-only; add db_integrity_check.sh status_report.py was opening the DB read-write with no busy timeout, so a brief WAL contention window from the running downloader could surface as `sqlite3.OperationalError: disk I/O error` mid-report. Open the connection read-only via the URI form and set a 60 s busy timeout so SQLite waits through a writer checkpoint instead of erroring out. Add db_integrity_check.sh: a qsub wrapper that runs PRAGMA integrity_check (read-only against the live DB, safe alongside an active writer in WAL mode) and prints fresh row counts at the end so results can be cross-checked against status_report.py. Co-Authored-By: Claude Opus 4.7 --- db_integrity_check.sh | 40 ++++++++++++++++++++++++++++++++++++++++ status_report.py | 5 ++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100755 db_integrity_check.sh diff --git a/db_integrity_check.sh b/db_integrity_check.sh new file mode 100755 index 0000000..2a9d07c --- /dev/null +++ b/db_integrity_check.sh @@ -0,0 +1,40 @@ +#!/bin/bash -l + +# Run SQLite's PRAGMA integrity_check against download_status.db. +# +# Read-only against the live DB -- in WAL mode the running downloader can +# keep writing while this reads a consistent snapshot. On a ~19 GB DB this +# typically takes ~10-60 minutes; h_rt below is set to 4 h for headroom. +# +# Also prints fresh row counts at the end so we can cross-check against the +# numbers status_report.py was seeing. 
+ +module load miniconda +module load academic-ml/spring-2026 + +conda activate spring-2026-pyt + +DB=/projectnb/herbdl/data/GBIF-F25h/download_status.db + +echo "=== PRAGMA integrity_check on $DB ===" +echo "started: $(date)" +python3 -u -c " +import sqlite3 +conn = sqlite3.connect('file:$DB?mode=ro', uri=True, timeout=300) +conn.execute('PRAGMA busy_timeout=300000') +print(' running PRAGMA integrity_check ...', flush=True) +for row in conn.execute('PRAGMA integrity_check'): + print(f' {row[0]}', flush=True) +print() +print(' row counts (live snapshot):', flush=True) +for t in ('images', 'gbif_ids', 'hosts'): + n = conn.execute('SELECT COUNT(*) FROM ' + t).fetchone()[0] + print(f' {t}: {n:,} rows', flush=True) +print(' gbif_ids by status:', flush=True) +for status, n in conn.execute('SELECT status, COUNT(*) FROM gbif_ids GROUP BY status'): + print(f' {status}: {n:,}', flush=True) +" +echo "finished: $(date)" + +### The command below is used to submit the job to the cluster: +### qsub -N db_integrity -l h_rt=4:00:00 -pe omp 4 -P herbdl -j y -o db_integrity.out db_integrity_check.sh diff --git a/status_report.py b/status_report.py index 2e71767..3f738fd 100644 --- a/status_report.py +++ b/status_report.py @@ -53,7 +53,10 @@ def main(): if not os.path.exists(args.db): raise SystemExit(f"Status database not found: {args.db}") - conn = sqlite3.connect(args.db) + # Read-only with a generous busy timeout so a brief WAL contention + # window from a running downloader can't surface as "disk I/O error". 
+ conn = sqlite3.connect(f"file:{args.db}?mode=ro", uri=True, timeout=60) + conn.execute("PRAGMA busy_timeout=60000") run_time = datetime.now() output_file = os.path.join( args.output_dir, f"summary_{run_time:%Y%m%d%H%M}.txt") From 482cc010ee40b85335e85a5666238822c19e0233 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Fri, 29 May 2026 16:55:11 -0400 Subject: [PATCH 15/19] retry SELECTs in status_report.py to ride out WAL hiccups Brief WAL checkpoint races on GPFS occasionally surface as transient `disk I/O error` or `file is not a database` when reading the live DB while image_install_db is writing. PRAGMA integrity_check confirmed the DB itself is fine; the read is just unlucky. Add _q_all() / _q_one() helpers that wrap conn.execute().fetchall() / .fetchone() with up to three attempts, sleeping 0.5 s and 2 s between tries on sqlite3.OperationalError / DatabaseError. Switch every query call site in main() to go through them. If all three attempts still fail, the original exception propagates. Co-Authored-By: Claude Opus 4.7 --- status_report.py | 64 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/status_report.py b/status_report.py index 3f738fd..6055a45 100644 --- a/status_report.py +++ b/status_report.py @@ -31,6 +31,7 @@ """ import os +import time import sqlite3 import argparse from datetime import datetime @@ -38,6 +39,32 @@ import download_db as ddb +# Transient WAL/checkpoint races on networked filesystems (GPFS) can briefly +# surface as `disk I/O error` or `file is not a database`. The data is fine; +# the read is just unlucky. Retry once or twice and the next attempt succeeds. 
+_RETRY_DELAYS = (0.5, 2.0) + + +def _q_all(conn, sql, params=()): + """Run a SELECT and fetchall() with brief retries on transient I/O errors.""" + for delay in _RETRY_DELAYS: + try: + return conn.execute(sql, params).fetchall() + except (sqlite3.OperationalError, sqlite3.DatabaseError): + time.sleep(delay) + return conn.execute(sql, params).fetchall() + + +def _q_one(conn, sql, params=()): + """Run a SELECT and fetchone() with brief retries on transient I/O errors.""" + for delay in _RETRY_DELAYS: + try: + return conn.execute(sql, params).fetchone() + except (sqlite3.OperationalError, sqlite3.DatabaseError): + time.sleep(delay) + return conn.execute(sql, params).fetchone() + + def parse_args(): p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) @@ -78,8 +105,8 @@ def section(title): # -- gbifID progress -------------------------------------------------- section("GBIFID PROGRESS") - gbif_counts = dict(conn.execute( - "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status").fetchall()) + gbif_counts = dict(_q_all(conn, + "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status")) total_ids = sum(gbif_counts.values()) write(f"Total gbifIDs: {total_ids:,}") for status in (ddb.G_DONE, ddb.G_PARTIAL, ddb.G_PENDING, ddb.G_FAILED): @@ -91,8 +118,8 @@ def section(title): # -- per-image progress ---------------------------------------------- section("DISTINCT-IMAGE PROGRESS") - img_counts = dict(conn.execute( - "SELECT status, COUNT(*) FROM images GROUP BY status").fetchall()) + img_counts = dict(_q_all(conn, + "SELECT status, COUNT(*) FROM images GROUP BY status")) total_imgs = sum(img_counts.values()) write(f"Total distinct images: {total_imgs:,}") for status in (ddb.ST_SUCCESS, ddb.ST_PENDING, @@ -100,9 +127,9 @@ def section(title): count = img_counts.get(status, 0) pct = (count / total_imgs * 100) if total_imgs else 0.0 write(f" {status:18s} {count:>14,} ({pct:5.2f}%)") - raw_kept = conn.execute( + raw_kept = 
_q_one(conn, "SELECT COUNT(*) FROM images WHERE error_type=?", - (ddb.ERR_RAW_UNPROCESSED,)).fetchone()[0] + (ddb.ERR_RAW_UNPROCESSED,))[0] write(f" (of 'success', kept raw -- DNG etc., need conversion: " f"{raw_kept:,})") @@ -110,10 +137,10 @@ def section(title): section("FAILURES BY TYPE") write(f"{'error_type':24s} {'count':>14s} {'verdict':s}") write("-" * 60) - rows = conn.execute( + rows = _q_all(conn, "SELECT error_type, COUNT(*) FROM images " "WHERE status LIKE 'failed%' AND error_type IS NOT NULL " - "GROUP BY error_type ORDER BY 2 DESC").fetchall() + "GROUP BY error_type ORDER BY 2 DESC") for error_type, count in rows: verdict = "permanent" if ddb.is_permanent(error_type) else "retryable" write(f"{error_type:24s} {count:>14,} {verdict}") @@ -122,10 +149,9 @@ def section(title): # -- retry attempt distribution -------------------------------------- section("RETRY ATTEMPTS (failed_transient images)") - rows = conn.execute( + rows = _q_all(conn, "SELECT attempts, COUNT(*) FROM images " - "WHERE status='failed_transient' GROUP BY attempts ORDER BY attempts" - ).fetchall() + "WHERE status='failed_transient' GROUP BY attempts ORDER BY attempts") for attempts, count in rows: note = " <- retry budget exhausted" if attempts >= ddb.MAX_ATTEMPTS else "" write(f" {attempts} attempt(s): {count:,}{note}") @@ -137,11 +163,11 @@ def section(title): write("Hosts whose URLs returned an HTML/text page instead of an image") write("(e.g. 'direct download no longer supported'). Sample message shown.") write("-" * 70) - rows = conn.execute( + rows = _q_all(conn, "SELECT host, COUNT(*) AS n, MIN(error_detail) " "FROM images WHERE error_type=? 
" "GROUP BY host ORDER BY n DESC LIMIT 20", - (ddb.ERR_INVALID_CONTENT,)).fetchall() + (ddb.ERR_INVALID_CONTENT,)) for host, count, sample in rows: write(f"{(host or '?')[:45]:45s} {count:>10,}") if sample: @@ -153,23 +179,23 @@ def section(title): section("TOP 20 HOSTS BY FAILED IMAGES") write(f"{'host':40s} {'failed':>10s} {'success':>10s}") write("-" * 64) - rows = conn.execute( + rows = _q_all(conn, "SELECT host, " " SUM(CASE WHEN status LIKE 'failed%' THEN 1 ELSE 0 END) AS failed, " " SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) AS ok " "FROM images WHERE host IS NOT NULL AND host != '' " - "GROUP BY host ORDER BY failed DESC LIMIT 20").fetchall() + "GROUP BY host ORDER BY failed DESC LIMIT 20") for host, failed, ok in rows: write(f"{host[:40]:40s} {failed or 0:>10,} {ok or 0:>10,}") # -- circuit-breaker state ------------------------------------------- section("CIRCUIT BREAKER / COOLDOWNS") - broken = conn.execute( - "SELECT COUNT(*) FROM hosts WHERE error_count >= 500").fetchone()[0] - blocked = conn.execute( + broken = _q_one(conn, + "SELECT COUNT(*) FROM hosts WHERE error_count >= 500")[0] + blocked = _q_one(conn, "SELECT COUNT(*) FROM hosts " "WHERE blocked_until IS NOT NULL " - "AND blocked_until > strftime('%s','now')").fetchone()[0] + "AND blocked_until > strftime('%s','now')")[0] write(f"Hosts past the circuit-breaker threshold (500 errors): {broken:,}") write(f"Hosts currently in cooldown: {blocked:,}") From 4b229c3640c58d1e5a767d328c4b7e31048e4bce Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Mon, 1 Jun 2026 10:43:04 -0400 Subject: [PATCH 16/19] fix README example query: url -> urls (and add LIMIT) The `images` column is `urls` (plural -- newline-joined candidate URLs for a distinct image, since IIIF manifest + resolution variants collapse into one row). The example was carrying over the old per-URL column name and would have failed with "no such column: url". 
Also add LIMIT 50 so the example does not dump every retryable row. Co-Authored-By: Claude Opus 4.7 --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fbee679..ce878f5 100644 --- a/README.md +++ b/README.md @@ -125,11 +125,15 @@ python status_report.py [--db PATH] [--output-dir DIR] Ad hoc queries against the database, e.g.: ```sql -- count each kind of failure -SELECT error_type, COUNT(*) FROM images -WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; +SELECT error_type, COUNT(*) FROM images WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC; --- every URL still worth retrying -SELECT gbif_id, url FROM images WHERE status='failed_transient'; +-- every image still worth retrying (urls is the newline-joined candidate list) +SELECT gbif_id, urls FROM images WHERE status='failed_transient' LIMIT 50; +``` + +Count of how many downloads since a certain date and time. +```bash +sqlite3 -readonly /projectnb/herbdl/data/GBIF-F25h/download_status.db "SELECT COUNT(*) FROM images WHERE status='success' AND last_attempt_at > '2026-05-31 20:00:00';" ``` ### Image Processing From 9ec6b412f0b40af6ffba643e59c4f69811fbd6ff Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Wed, 3 Jun 2026 13:47:53 -0400 Subject: [PATCH 17/19] add documentation on db_integrity_check.sh --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index ce878f5..c32aafe 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,30 @@ Count of how many downloads since a certain date and time. sqlite3 -readonly /projectnb/herbdl/data/GBIF-F25h/download_status.db "SELECT COUNT(*) FROM images WHERE status='success' AND last_attempt_at > '2026-05-31 20:00:00';" ``` +#### `db_integrity_check.sh` +**Purpose**: Run SQLite's `PRAGMA integrity_check` against `download_status.db` and print a fresh row-count snapshot. 
Use it whenever something looks off — e.g. transient `file is not a database` / `disk I/O error` from concurrent readers, anomalous row counts, or just for periodic verification. + +**Best practice — pause the downloader first**. Concurrent WAL writes from `image_install_db.py` on networked filesystems (GPFS) can race with the integrity scan and either slow it down or surface as false positives. The downloader is fully resumable, so stopping it costs nothing. + +**Usage**: +```bash +# 1. pause the downloader (find its job-ID with `qstat -u $USER`) +qdel JOB_ID + +# 2. give SQLite a few seconds to checkpoint the WAL +sleep 30 + +# 3. submit the integrity check +qsub -N db_integrity -l h_rt=4:00:00 -pe omp 4 -P herbdl -j y \ + -o db_integrity.out db_integrity_check.sh + +# 4. when db_integrity.out shows 'ok', restart the downloader +qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl \ + -m beas -M your_email@bu.edu image_install_db.sh +``` + +On a clean DB the check prints `ok` (takes ~10–20 minutes on a ~20 GB DB), followed by current row counts for `images`, `gbif_ids`, `hosts`, and a `gbif_ids`-by-status breakdown. Any other output indicates real corruption — capture it from `db_integrity.out`. + ### Image Processing #### `image_utils.py` From 889cc0854f6c49531f06a408096966f49de08333 Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Wed, 3 Jun 2026 14:06:26 -0400 Subject: [PATCH 18/19] switch notifications.py from Pushover to Slack webhook Replace the Pushover POST in send_notification() with a Slack incoming webhook POST. SLACK_WEBHOOK_URL is read from .env; without it set the function logs a one-line warning and silently no-ops so the downloader keeps working without notifications. A try/except wraps the POST so a Slack hiccup never interrupts the caller. Update the README: - Add a "Push notifications (optional)" callout in the image_install_db.py section that points at the notifications.py setup.
- Rewrite the notifications.py setup as a five-step Slack walkthrough (create Slack app -> enable Incoming Webhooks -> add to channel -> paste URL into .env -> chmod 600 + verify with a one-liner). - Add a db_integrity_check.sh section describing when/how to run the integrity check (pause downloader -> qdel -> qsub -> resume). - Fix the example query to use `urls` (plural) and add LIMIT. Add `.env` to .gitignore so the Slack webhook URL never gets committed. Co-Authored-By: Claude Opus 4.7 --- .gitignore | 3 +++ README.md | 33 +++++++++++++++++++++++++-------- notifications.py | 40 ++++++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 68a21d2..1ea641e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ .venv/ __pycache__/ +# secrets (Slack incoming webhook URL for notifications.py) +.env + # ignore generated summary files summary*.txt diff --git a/README.md b/README.md index c32aafe..a3e9605 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ python image_install_db.py [--db PATH] - Failures are classified (404, 401, timeout, rate-limited, dropped connection, …); only transient failures are retried, capped at 4 attempts - Retry strategy with backoff for 500-level errors +**Push notifications (optional)**: every 50,000 images downloaded in a run, `image_install_db.py` calls `send_notification(...)` from `notifications.py`, which posts a message to a Slack channel via an incoming webhook. Without `SLACK_WEBHOOK_URL` set, it logs a one-line warning and silently no-ops — the downloader works either way. See the [`notifications.py`](#notificationspy) section below for the one-time setup. + #### `image_install_db.sh` **Purpose**: SCC job submission wrapper for `image_install_db.py`. @@ -300,14 +302,29 @@ python link_check.py ``` #### `notifications.py` -**Purpose**: Send push notifications via Pushover API for long-running job monitoring. - -**Setup**: -1. 
Create a `.env` file with: -``` -PUSHOVER_API_TOKEN=your_token_here -PUSHOVER_USER_KEY=your_user_key_here -``` +**Purpose**: Post a message to a Slack channel via an incoming webhook, for long-running job monitoring. `image_install_db.py` calls it every 50,000 images successfully downloaded in a run. + +**Setup (one-time)**: + +1. Open https://api.slack.com/apps and click **Create New App** → **From scratch**. Pick the workspace you want notifications in and give the app a name (e.g. *Herbarium downloader*). +2. In the new app, open **Incoming Webhooks** (left sidebar) and toggle it **On**. +3. Click **Add New Webhook to Workspace**, pick the channel that should receive the notifications, and authorise. +4. Copy the resulting webhook URL (looks like `https://hooks.slack.com/services/T…/B…/…`) into a `.env` file in the repo root: + ``` + SLACK_WEBHOOK_URL=https://hooks.slack.com/services/your/webhook/url + ``` + Lock it down so others can't read the URL: + ```bash + chmod 600 .env + ``` + `.env` is already in `.gitignore`, so the URL won't get committed. +5. Verify it works: + ```bash + python -c "from notifications import send_notification; send_notification('Test', 'Slack is working')" + ``` + You should see the message in the channel within a second or two. If you see `SLACK_WEBHOOK_URL not set; skipping notification.` instead, the env var is empty — re-check `.env`. + +Without `.env`, notifications are a silent no-op — the downloader runs fine, just without Slack updates. Treat the webhook URL like a secret: anyone with it can post to the channel. **Function**: ```python diff --git a/notifications.py b/notifications.py index 8cc01c9..e844e6c 100644 --- a/notifications.py +++ b/notifications.py @@ -1,25 +1,29 @@ -import requests -import json +""" +Push notifications via a Slack incoming webhook. + +The webhook URL is read from `.env` (SLACK_WEBHOOK_URL). Without it +configured, send_notification() is a silent no-op so the caller (e.g.
+image_install_db.py) keeps working without notifications. +""" + import os +import requests from dotenv import load_dotenv load_dotenv() -def send_notification(title, message): - URL = "https://api.pushover.net/1/messages.json" - - api_token = os.getenv("PUSHOVER_API_TOKEN") - user_key = os.getenv("PUSHOVER_USER_KEY") - - if not api_token or not user_key: - print("Pushover API token or user key not set") +def send_notification(title, message): + url = os.getenv("SLACK_WEBHOOK_URL") + if not url: + print("SLACK_WEBHOOK_URL not set; skipping notification.") return - - data = { - "token": os.getenv("PUSHOVER_API_TOKEN"), - "user": os.getenv("PUSHOVER_USER_KEY"), - "title": title, - "message": message - } - requests.post(URL, data=data) + try: + requests.post( + url, + json={"text": f"*{title}*\n{message}"}, + timeout=10, + ) + except requests.RequestException as e: + # Never let a notification failure interrupt the caller. + print(f"Slack notification failed: {e}") From 7811b9727d62f8157f69965b9869f11a2d61390c Mon Sep 17 00:00:00 2001 From: Thomas Gardos <3973626+trgardos@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:28:47 -0400 Subject: [PATCH 19/19] change slack notification interval to 10000 --- image_install_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image_install_db.py b/image_install_db.py index 0603b72..cb3a803 100644 --- a/image_install_db.py +++ b/image_install_db.py @@ -604,7 +604,7 @@ def process_id(db, gbif_id, total_to_install): with counter_lock: n_installed += 1 current = n_installed - if current % 50000 == 0: + if current % 10000 == 0: send_notification( "Image Installation", f"Installed {current} images this run "