From f6b4b4cb2e2c5656a301405543996019199ea4fc Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 09:13:46 -0400
Subject: [PATCH 01/19] ignore pycache

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index 21d0b89..a230a78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 .venv/
+__pycache__/

From 5f70e5761071aa88df167153cc42a36221f28a23 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 09:37:31 -0400
Subject: [PATCH 02/19] new sqlite based download tracking

---
 DEPLOYMENT.md       | 212 +++++++++++++++++
 README.md           | 142 ++++++++++--
 download_db.py      | 293 ++++++++++++++++++++++++
 image_install_db.py | 546 ++++++++++++++++++++++++++++++++++++++++++++
 image_install_db.sh |  17 ++
 init_download_db.py | 267 ++++++++++++++++++++++
 init_download_db.sh |  21 ++
 status_report.py    | 157 +++++++++++++
 8 files changed, 1630 insertions(+), 25 deletions(-)
 create mode 100644 DEPLOYMENT.md
 create mode 100644 download_db.py
 create mode 100644 image_install_db.py
 create mode 100755 image_install_db.sh
 create mode 100644 init_download_db.py
 create mode 100755 init_download_db.sh
 create mode 100644 status_report.py

diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md
new file mode 100644
index 0000000..e487545
--- /dev/null
+++ b/DEPLOYMENT.md
@@ -0,0 +1,212 @@
+# Deployment Guide — Image Downloader with SQLite Status Tracking
+
+This guide covers deploying the GBIF herbarium image downloader after its switch
+from flat-file checkpoints (`processed_ids.txt` / `failed_ids.txt`) to a
+queryable SQLite status database.
+
+There are two phases: **build the database once**, then **run (and re-run) the
+downloader**. All commands assume the SCC and the `spring-2026-pyt` conda
+environment.
+
+---
+
+## What changed
+
+| Before | After |
+|---|---|
+| Progress in `processed_ids.txt` / `failed_ids.txt` (ID only, no reason) | Progress in `download_status.db` — every URL's outcome and *why* it failed |
+| `multimedia.txt` re-read and re-grouped with pandas every run | Ingested into the DB once; later runs query the work queue |
+| Failed IDs all retried blindly (or skipped) | Only transient failures retried (timeout/rate-limit/5xx/dropped connection), capped at 4 attempts |
+| `analyze_image_progress.py` (slow, loads ~180 MB of text) | `status_report.py` (instant SQL queries) |
+| ~1.4 GB run logs, ~134 MB warning spam | `WARNING`-level log only; warning spam suppressed |
+
+The database lives **outside this git repo**, in the data directory, so it is
+never committed:
+
+- `download_status.db` (+ `-wal`, `-shm` companions) at
+  `/projectnb/herbdl/data/GBIF-F25h/download_status.db`
+- Estimated size after ingest: **~10–15 GB**
+
+---
+
+## Files
+
+| File | Role |
+|---|---|
+| `init_download_db.py` / `init_download_db.sh` | One-time database builder (+ qsub wrapper) |
+| `image_install_db.py` / `image_install_db.sh` | The downloader (+ qsub wrapper) |
+| `status_report.py` | Progress reporting |
+| `download_db.py` | Shared schema + DB helpers (imported, not run) |
+
+> The original flat-file downloader is preserved as `image_install_parallel.py`
+> (run via `image_install.sh`). It is independent of the database workflow
+> described here and is kept only for reference / fallback.
+
+---
+
+## Phase 1 — Build the status database (once)
+
+This step ingests `multimedia.txt`, imports already-completed downloads from
+`processed_ids.txt`, and renames legacy `<id>.jpg` files to `<id>-00.jpg`.
+
+It is heavy — it reads the ~59M-row `multimedia.txt` with pandas and renames up
+to ~13.5M files. **Run it as a batch job, not on a login node.**
+
+```bash
+qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl \
+     -m beas -M your_email@bu.edu init_download_db.sh
+```
+
+`init_download_db.sh` runs:
+
+```bash
+python init_download_db.py \
+    --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt
+```
+
+> **Important:** the production `processed_ids.txt` (~13.5M IDs) lives in
+> ljhao's working directory, not in this repo. The wrapper already points there.
+> If you build the DB by hand, pass that `--processed-file` path explicitly, or
+> the legacy progress will not be imported.
+
+**Options:**
+
+| Command | Effect |
+|---|---|
+| `python init_download_db.py` | Build DB + import legacy progress |
+| `python init_download_db.py --skip-legacy` | Build DB only (everything starts `pending`) |
+| `python init_download_db.py --reset` | Delete an existing DB and rebuild from scratch |
+
+**Expected output** — a status breakdown, e.g.:
+
+```
+Final gbifID status counts:
+  done       13,200,000
+  partial       320,000
+  pending    36,900,000
+```
+
+- `done` — every image for the gbifID is present
+- `partial` — has an image already (legacy first image) but more to fetch
+- `pending` — never attempted
+
+Re-running the builder is safe for the image files: renames are idempotent
+(already-renamed files are detected and reused). It refuses to overwrite an existing database, so if a build fails partway, re-run with `--reset`.
+
+---
+
+## Phase 2 — Run the downloader
+
+The downloader has no separate "resume" mode — every run reads the work queue
+(`pending` + `partial` gbifIDs) from the database. Submit it as many times as
+needed; each run continues where the last left off.
+
+```bash
+qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl \
+     -m beas -M your_email@bu.edu image_install_db.sh
+```
+
+If the job hits its `h_rt` wall-clock limit, just submit it again — progress is
+committed to the database continuously, and host cooldown / circuit-breaker
+state is persisted between runs.
+
+When the work queue is empty the script prints
+`Nothing to download` and exits.
+
+To point at a non-default database, pass `--db PATH` to `image_install_db.py` (edit the command in `image_install_db.sh`).
+
+---
+
+## Phase 3 — Monitor progress
+
+Run any time — it is read-only and returns in seconds:
+
+```bash
+python status_report.py
+```
+
+It prints (and writes `summary_YYYYMMDDHHMM.txt`): gbifID and per-image
+progress, failures broken down by type, retry-attempt distribution, the worst
+hosts, and circuit-breaker state.
+
+The run log (`WARNING` and above) is at
+`/projectnb/herbdl/logs/image_install_<timestamp>.log`.
+
+Ad hoc queries:
+
+```bash
+sqlite3 /projectnb/herbdl/data/GBIF-F25h/download_status.db
+```
+```sql
+-- count each kind of failure
+SELECT error_type, COUNT(*) FROM images
+WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
+
+-- URLs still worth retrying
+SELECT gbif_id, url FROM images WHERE status='failed_transient' LIMIT 50;
+
+-- hosts currently in cooldown
+SELECT host, datetime(blocked_until,'unixepoch') FROM hosts
+WHERE blocked_until > strftime('%s','now');
+```
+
+---
+
+## How retries work
+
+Each failure is classified into an `error_type`:
+
+- **Permanent** — `http_404`, `http_401`, `http_403`, `http_410`,
+  `invalid_content_type`, `not_an_image`, … → never retried.
+- **Transient** — `timeout`, `rate_limited`, `server_error`,
+  `connection_broken`, `truncated`, `manifest_error`, `other` → retried on
+  later runs, up to **4 attempts** (`MAX_ATTEMPTS` in `download_db.py`); once
+  exhausted they count toward the gbifID's `failed` status.
+
+A gbifID leaves the work queue only when it is `done` (all images succeeded) or
+`failed` (all images terminal, no retries left). To re-open exhausted transient
+failures for another pass, raise `MAX_ATTEMPTS` or reset rows manually, e.g.:
+
+```sql
+UPDATE images SET status='pending', attempts=0
+WHERE status='failed_transient';
+UPDATE gbif_ids SET status='partial' WHERE status='failed';
+```
+
+---
+
+## Caveats
+
+- **Legacy first-image index is approximate.** For gbifIDs imported from
+  `processed_ids.txt` that have more than one image, the existing file is
+  assumed to be image index 0 and marked `error_type='legacy_unverified_index'`.
+  The old downloader shuffled URLs, so the exact source URL is unknown. This is
+  exact for the ~87% of gbifIDs that have only one image; for the rest it
+  affects only metadata, not the image files.
+- **Database size.** Expect ~10–15 GB. It sits in the data directory, not the
+  repo. Ensure the `herbdl` project has the space.
+- **Single job at a time.** SQLite (WAL mode) is fine for one job with 5 worker
+  threads. Do not run multiple `image_install_db.sh` jobs against the same
+  database concurrently.
+
+---
+
+## Rollback
+
+The previous flat-file downloader still exists in
+`/projectnb/herbdl/workspaces/ljhao/herbdl/utils/` and is unaffected by this
+work. To revert this repo, use git (`git log` / `git revert`). The status
+database is independent — deleting `download_status.db*` simply means Phase 1
+must be re-run.
+
+---
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---|---|
+| `Status database not found` | Run Phase 1 first (`init_download_db.sh`). |
+| `Database already exists` from the builder | Intended guard — pass `--reset` to rebuild. |
+| `database is locked` | Another process is using the DB; ensure only one downloader job runs. The code already sets a 120 s busy timeout. |
+| Builder runs out of memory | `multimedia.txt` is large; request more memory (e.g. a larger `-pe omp` slot count). |
+| Legacy progress not imported | `--processed-file` was not pointed at ljhao's `processed_ids.txt`. |
diff --git a/README.md b/README.md
index 9a7e11c..2736324 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,56 @@
 
 This directory contains utility scripts for managing herbarium specimen images, including downloading, processing, organizing, and labeling datasets.
 
+> **Deploying the image downloader?** See [DEPLOYMENT.md](DEPLOYMENT.md) for the
+> step-by-step procedure (build the status database once, then run/resume the
+> download job).
+
 ## Scripts Overview
 
 ### Image Download & Installation
 
-#### `image_install_parallel.py`
-**Purpose**: Primary script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets.
+#### `image_install_db.py`
+**Purpose**: Current script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. Downloads **all** images per gbifID, each saved as `<gbifID>-NN.jpg`, with status tracked in a SQLite database. See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure.
+
+**Key Features**:
+- Parallel downloading with ThreadPoolExecutor (5 workers)
+- Host-based rate limiting and circuit breaker pattern
+- IIIF (International Image Interoperability Framework) manifest support — one file saved per source URL, highest resolution first
+- Automatic image resizing to 1024px max dimension
+- Atomic downloads (stream to `.tmp`, length-check, then rename) so a dropped connection never leaves a corrupt file
+- SQLite status database for resumable downloads and queryable, classified error tracking — see [`download_db.py`](download_db.py)
+- Hierarchical directory organization (3-digit prefix structure)
+
+**Prerequisite**: build the status database once with `python init_download_db.py` before the first run.
+
+**Usage**:
+```bash
+python image_install_db.py [--db PATH]
+```
+
+**Configuration**:
+- Input: `/projectnb/herbdl/data/GBIF-F25/multimedia.txt` (ingested once into the database)
+- Output: `/projectnb/herbdl/data/GBIF-F25h/`
+- Logs: `/projectnb/herbdl/logs/image_install_*.log` (WARNING level and above only — routine successes are recorded in the database, not the log)
+- Status: `download_status.db` (default `/projectnb/herbdl/data/GBIF-F25h/download_status.db`)
+
+**Advanced Features**:
+- Host cooldown on rate limiting (429 errors): 30 minutes default
+- Host cooldown on timeouts: 60 minutes
+- Circuit breaker: skips hosts after 500+ errors; state persists across runs in the `hosts` table
+- Failures are classified (404, 401, timeout, rate-limited, dropped connection, …); only transient failures are retried, capped at 4 attempts
+- Retry strategy with backoff for 500-level errors
+
+#### `image_install_db.sh`
+**Purpose**: SCC job submission wrapper for `image_install_db.py`.
+
+**Usage**:
+```bash
+qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu image_install_db.sh
+```
+
+#### `image_install_parallel.py` (original)
+**Purpose**: The original downloader, kept for reference and fallback — superseded by `image_install_db.py`. Downloads **one** image per gbifID (stops at the first URL that succeeds) and tracks progress in flat `processed_ids.txt` / `failed_ids.txt` files.
 
 **Key Features**:
 - Parallel downloading with ThreadPoolExecutor (5 workers)
@@ -30,21 +74,59 @@ python image_install_parallel.py [-c COUNTRY_CODE]
 - Logs: `/projectnb/herbdl/logs/image_install_*.log`
 - Checkpoints: `processed_ids.txt`, `failed_ids.txt`
 
-**Advanced Features**:
-- Host cooldown on rate limiting (429 errors): 30 minutes default
-- Host cooldown on timeouts: 60 minutes
-- Circuit breaker: Permanently blocks hosts after 50+ errors
-- Multiple URL fallback per GBIF ID
-- Retry strategy with backoff for 500-level errors
-
 #### `image_install.sh`
-**Purpose**: SCC job submission wrapper for `image_install_parallel.py`.
+**Purpose**: SCC job submission wrapper for the original `image_install_parallel.py`.
 
 **Usage**:
 ```bash
 qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu image_install.sh
 ```
 
+#### `download_db.py`
+**Purpose**: SQLite-backed download-status tracking. Imported by the other download scripts — not run directly.
+
+**Why it exists**: replaces the flat `processed_ids.txt` / `failed_ids.txt` files, which recorded only an ID with no reason for failure. The database records, per image URL, whether it succeeded or failed and *why*, so failures are queryable and only transient ones get retried.
+
+**Tables**:
+- `images` — one row per source image URL: `status`, `http_status`, `error_type`, `error_detail`, `file_path`, `file_size`, `attempts`
+- `gbif_ids` — one row per gbifID; the resumable work queue (`pending` / `partial` / `done` / `failed`)
+- `hosts` — per-host error tally and cooldown, so circuit-breaker state survives a restart
+
+#### `init_download_db.py`
+**Purpose**: One-time builder for the status database.
+
+**What it does**:
+1. Creates the schema
+2. Ingests `multimedia.txt` into `images` + `gbif_ids` (so later runs never re-read the 59M-row file)
+3. Imports `processed_ids.txt`: renames legacy `<id>.jpg` files to `<id>-00.jpg` for a consistent naming scheme and marks them done. Multi-image gbifIDs are left `partial` so the downloader fetches their remaining images. (`failed_ids.txt` is **not** imported — those IDs get a fresh, tracked retry.)
+
+**Usage**:
+```bash
+python init_download_db.py                 # build DB + import legacy progress
+python init_download_db.py --skip-legacy   # build DB only
+python init_download_db.py --reset         # rebuild from scratch
+```
+
+#### `status_report.py`
+**Purpose**: Report download progress directly from the database — replaces `analyze_image_progress.py`. Every figure is a single indexed SQL query, so it returns in seconds instead of loading ~180 MB of text and re-grouping `multimedia.txt`.
+
+**Reports**: gbifID and per-image progress, failures broken down by type (permanent vs retryable), retry-attempt distribution, worst hosts, and circuit-breaker state. Writes a timestamped `summary_YYYYMMDDHHMM.txt`.
+
+**Usage**:
+```bash
+python status_report.py [--db PATH] [--output-dir DIR]
+```
+
+Ad hoc queries against the database, e.g.:
+```sql
+-- count each kind of failure
+SELECT error_type, COUNT(*) FROM images
+WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
+
+-- every URL still worth retrying
+SELECT gbif_id, url FROM images WHERE status='failed_transient';
+```
+
 ### Image Processing
 
 #### `image_utils.py`
@@ -205,17 +287,24 @@ from notifications import send_notification
 send_notification("Image Installation", "Downloaded 50,000 images")
 ```
 
-**Integration**: Used by `image_install_parallel.py` to send progress updates every 50,000 images.
+**Integration**: Used by the image download scripts (`image_install_db.py` and `image_install_parallel.py`) to send progress updates every 50,000 images.
 
 ## Common Workflows
 
 ### 1. Download GBIF Images
+
+See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure. In brief:
+
 ```bash
-# Submit parallel download job
-qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl image_install.sh
+# One-time: build the status database (ingest multimedia.txt + import progress)
+qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl init_download_db.sh
 
-# Monitor progress in logs
-tail -f /projectnb/herbdl/logs/image_install_*.log
+# Submit the download job (re-run any time to resume — it reads the work
+# queue from the database)
+qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl image_install_db.sh
+
+# Check progress at any time
+python status_report.py
 ```
 
 ### 2. Organize Downloaded Images
@@ -245,21 +334,23 @@ python link_check.py
 ## Directory Structures
 
 ### Hierarchical Image Storage
-Images are organized by GBIF ID prefix for efficient filesystem access:
+Images are organized by GBIF ID prefix for efficient filesystem access. Each
+image for a gbifID is saved with a zero-padded index suffix (`-00`, `-01`, ...):
 ```
 /projectnb/herbdl/data/GBIF-F25h/
-├── 000/
-│   ├── 000/
-│   │   ├── 000000.jpg
-│   │   ├── 000001.jpg
-│   ├── 001/
-│   │   ├── 000001000.jpg
-├── 001/
+├── 105/
+│   ├── 716/
+│   │   ├── 1057161997-00.jpg
+│   │   ├── 1057161997-01.jpg
+│   ├── 717/
+│   │   ├── 1057170001-00.jpg
+├── 106/
 │   ├── 000/
 │   ├── 001/
 ```
 
-This structure prevents issues with directories containing millions of files.
+The top-level directory is the first 3 digits of the gbifID and the second
+level is digits 4–6. This structure prevents issues with directories containing millions of files.
 
 ## Dependencies
 
@@ -278,5 +369,6 @@ This structure prevents issues with directories containing millions of files.
 
 - All scripts are designed for use on Boston University's Shared Computing Cluster (SCC)
 - Many scripts use parallel processing for performance
-- Checkpoint files enable resumable operations after interruptions
+- The `download_status.db` SQLite database enables resumable downloads and queryable error tracking; re-running the job simply continues the work queue
+- `analyze_image_progress.py` and the `processed_ids.txt` / `failed_ids.txt` files are superseded by the database (`status_report.py`); kept only for historical reference
 - Always verify paths before running scripts to avoid data loss
diff --git a/download_db.py b/download_db.py
new file mode 100644
index 0000000..3540a93
--- /dev/null
+++ b/download_db.py
@@ -0,0 +1,293 @@
+"""
+SQLite-backed download-status tracking for image_install_db.py.
+
+Replaces the flat processed_ids.txt / failed_ids.txt checkpoint files with a
+queryable database that records, for every image URL, whether it succeeded or
+failed and *why*. That makes it possible to:
+  * resume a run without re-reading and re-grouping the 59M-row multimedia.txt,
+  * retry only transient failures (timeouts, rate limits, 5xx, dropped
+    connections) while leaving permanent ones (404/410/etc.) alone,
+  * answer questions like "how many 404s?" or "which hosts fail most?" with a
+    single SQL query (see status_report.py).
+
+Tables
+------
+images    one row per source image URL (a GBIF "identifier").
+gbif_ids  one row per gbifID; doubles as the resumable work queue.
+hosts     per-host error tally + cooldown timestamp, so circuit-breaker and
+          rate-limit state survive a job restart.
+
+A gbifID is 'done' only when every one of its images has status 'success'.
+"""
+
+import os
+import time
+import sqlite3
+import threading
+
+DEFAULT_DB_PATH = "/projectnb/herbdl/data/GBIF-F25h/download_status.db"
+
+# Retry budget: a transient failure stays retryable while attempts < MAX_ATTEMPTS.
+MAX_ATTEMPTS = 4
+
+# ---- images.status -----------------------------------------------------------
+ST_PENDING = "pending"            # never attempted
+ST_SUCCESS = "success"            # downloaded (and resized) OK
+ST_FAILED_PERMANENT = "failed_permanent"   # retrying will not help
+ST_FAILED_TRANSIENT = "failed_transient"   # may succeed on a later run
+
+# ---- gbif_ids.status ---------------------------------------------------------
+G_PENDING = "pending"             # no image attempted yet
+G_PARTIAL = "partial"             # some work still possible (in the work queue)
+G_DONE = "done"                   # every image succeeded
+G_FAILED = "failed"               # all images terminal, not all succeeded
+
+# ---- error_type values -------------------------------------------------------
+ERR_RATE_LIMITED = "rate_limited"          # HTTP 429
+ERR_TIMEOUT = "timeout"                    # connect/read timeout, HTTP 408
+ERR_SERVER = "server_error"                # HTTP 5xx
+ERR_CONNECTION = "connection_broken"       # dropped connection / IncompleteRead
+ERR_TRUNCATED = "truncated"                # download shorter than Content-Length
+ERR_MANIFEST = "manifest_error"            # IIIF manifest could not be parsed
+ERR_INVALID_CONTENT = "invalid_content_type"   # server returned HTML/XML/text
+ERR_NOT_IMAGE = "not_an_image"             # bytes downloaded but not decodable
+ERR_NO_URL = "no_url"                      # no usable URL for this identifier
+ERR_OTHER = "other"                        # anything uncategorised
+ERR_LEGACY = "legacy_unverified_index"     # marker on imported processed_ids.txt
+
+# Everything not in this set is treated as permanent (e.g. any "http_<code>" type).
+TRANSIENT_ERRORS = {
+    ERR_RATE_LIMITED, ERR_TIMEOUT, ERR_SERVER,
+    ERR_CONNECTION, ERR_TRUNCATED, ERR_MANIFEST, ERR_OTHER,
+}
+
+
+def http_error_type(code):
+    """Map an HTTP status code to an error_type string."""
+    if code == 429:
+        return ERR_RATE_LIMITED
+    if code == 408:
+        return ERR_TIMEOUT
+    if 500 <= code <= 599:
+        return ERR_SERVER
+    return f"http_{code}"
+
+
+def is_permanent(error_type):
+    """True if a failure of this type is not worth retrying."""
+    return error_type not in TRANSIENT_ERRORS
+
+
+def status_for_error(error_type):
+    """Pick the images.status value implied by an error_type."""
+    return ST_FAILED_PERMANENT if is_permanent(error_type) else ST_FAILED_TRANSIENT
+
+
+# ---- schema ------------------------------------------------------------------
+
+_TABLES = [
+    """CREATE TABLE IF NOT EXISTS images (
+        gbif_id         INTEGER NOT NULL,
+        img_index       INTEGER NOT NULL,   -- position in this ID's URL list
+        url             TEXT    NOT NULL,
+        host            TEXT,
+        status          TEXT    NOT NULL DEFAULT 'pending',
+        http_status     INTEGER,
+        error_type      TEXT,
+        error_detail    TEXT,               -- truncated message, for debugging
+        file_path       TEXT,
+        file_size       INTEGER,            -- bytes on disk after resize
+        attempts        INTEGER NOT NULL DEFAULT 0,
+        last_attempt_at TEXT,
+        PRIMARY KEY (gbif_id, img_index)
+    )""",
+    """CREATE TABLE IF NOT EXISTS gbif_ids (
+        gbif_id      INTEGER PRIMARY KEY,
+        n_images     INTEGER NOT NULL DEFAULT 0,
+        n_success    INTEGER NOT NULL DEFAULT 0,
+        status       TEXT    NOT NULL DEFAULT 'pending',
+        completed_at TEXT
+    )""",
+    """CREATE TABLE IF NOT EXISTS hosts (
+        host          TEXT PRIMARY KEY,
+        error_count   INTEGER NOT NULL DEFAULT 0,
+        blocked_until REAL                  -- epoch seconds; NULL when not blocked
+    )""",
+]
+
+_INDEXES = [
+    "CREATE INDEX IF NOT EXISTS idx_images_status ON images(status)",
+    "CREATE INDEX IF NOT EXISTS idx_images_host   ON images(host)",
+    "CREATE INDEX IF NOT EXISTS idx_images_error  ON images(error_type) "
+    "WHERE error_type IS NOT NULL",
+    "CREATE INDEX IF NOT EXISTS idx_gbif_status   ON gbif_ids(status)",
+]
+
+
+def create_tables(conn):
+    for sql in _TABLES:
+        conn.execute(sql)
+    conn.commit()
+
+
+def create_indexes(conn):
+    for sql in _INDEXES:
+        conn.execute(sql)
+    conn.commit()
+
+
+def apply_schema(conn):
+    """Create tables and indexes if they do not already exist."""
+    create_tables(conn)
+    create_indexes(conn)
+
+
+# ---- runtime handle ----------------------------------------------------------
+
+class DownloadDB:
+    """
+    Thread-safe handle used by image_install_db.py during a run.
+
+    One SQLite connection is shared by all worker threads and guarded by a
+    single lock. The downloads themselves take seconds each, so lock contention
+    on these short statements is negligible. WAL mode lets the occasional reader
+    proceed during writes; with synchronous=NORMAL a power loss can roll back recent commits.
+    """
+
+    def __init__(self, db_path=DEFAULT_DB_PATH, max_attempts=MAX_ATTEMPTS):
+        self.path = db_path
+        self.max_attempts = max_attempts
+        self.conn = sqlite3.connect(db_path, check_same_thread=False, timeout=120)
+        self.conn.execute("PRAGMA journal_mode=WAL")
+        self.conn.execute("PRAGMA synchronous=NORMAL")
+        self.conn.execute("PRAGMA busy_timeout=120000")
+        apply_schema(self.conn)
+        self.lock = threading.Lock()
+
+    def close(self):
+        with self.lock:
+            self.conn.commit()
+            self.conn.close()
+
+    # -- work queue ------------------------------------------------------------
+
+    def get_work_gbif_ids(self):
+        """Return every gbifID that still has work to do, in ascending order."""
+        with self.lock:
+            cur = self.conn.execute(
+                "SELECT gbif_id FROM gbif_ids WHERE status IN (?, ?) ORDER BY gbif_id",
+                (G_PENDING, G_PARTIAL),
+            )
+            return [row[0] for row in cur.fetchall()]
+
+    def get_images_for(self, gbif_id):
+        """Return (img_index, url, host, status, attempts) rows for one gbifID."""
+        with self.lock:
+            cur = self.conn.execute(
+                "SELECT img_index, url, host, status, attempts "
+                "FROM images WHERE gbif_id=? ORDER BY img_index",
+                (gbif_id,),
+            )
+            return cur.fetchall()
+
+    # -- recording results -----------------------------------------------------
+
+    def record_image_result(self, gbif_id, img_index, status, *, host=None,
+                             http_status=None, error_type=None, error_detail=None,
+                             file_path=None, file_size=None,
+                             increment_attempts=True):
+        """Write the outcome of one image attempt into the images table."""
+        detail = (error_detail or "")[:500] or None
+        delta = 1 if increment_attempts else 0
+        with self.lock:
+            self.conn.execute(
+                "UPDATE images SET "
+                "  status=?, host=COALESCE(?, host), http_status=?, "
+                "  error_type=?, error_detail=?, file_path=?, file_size=?, "
+                "  attempts=attempts+?, last_attempt_at=datetime('now') "
+                "WHERE gbif_id=? AND img_index=?",
+                (status, host, http_status, error_type, detail, file_path,
+                 file_size, delta, gbif_id, img_index),
+            )
+            self.conn.commit()
+
+    def finalize_gbif_id(self, gbif_id):
+        """
+        Recompute and store a gbifID's rolled-up status from its image rows.
+        Returns the new status string.
+        """
+        with self.lock:
+            rows = self.conn.execute(
+                "SELECT status, attempts FROM images WHERE gbif_id=?",
+                (gbif_id,),
+            ).fetchall()
+            if not rows:
+                return None
+
+            n_success = sum(1 for s, _ in rows if s == ST_SUCCESS)
+
+            def retryable(status, attempts):
+                if status == ST_PENDING:
+                    return True
+                if status == ST_FAILED_TRANSIENT and attempts < self.max_attempts:
+                    return True
+                return False
+
+            if n_success == len(rows):
+                status = G_DONE
+            elif any(retryable(s, a) for s, a in rows):
+                status = G_PARTIAL
+            else:
+                status = G_FAILED
+
+            self.conn.execute(
+                "UPDATE gbif_ids SET n_success=?, status=?, "
+                "completed_at=CASE WHEN ? IN (?, ?) THEN datetime('now') "
+                "                  ELSE completed_at END "
+                "WHERE gbif_id=?",
+                (n_success, status, status, G_DONE, G_FAILED, gbif_id),
+            )
+            self.conn.commit()
+            return status
+
+    # -- host circuit-breaker state -------------------------------------------
+
+    def load_host_state(self):
+        """Return (error_counts, blocked_until) dicts to seed the in-memory state."""
+        now = time.time()
+        with self.lock:
+            cur = self.conn.execute(
+                "SELECT host, error_count, blocked_until FROM hosts"
+            )
+            error_counts, blocked_until = {}, {}
+            for host, count, until in cur.fetchall():
+                if count:
+                    error_counts[host] = count
+                if until and until > now:
+                    blocked_until[host] = until
+            return error_counts, blocked_until
+
+    def save_host_state(self, error_counts, blocked_until):
+        """Persist the in-memory circuit-breaker dicts so they survive a restart."""
+        hosts = set(error_counts) | set(blocked_until)
+        rows = [(h, error_counts.get(h, 0), blocked_until.get(h)) for h in hosts]
+        if not rows:
+            return
+        with self.lock:
+            self.conn.executemany(
+                "INSERT INTO hosts(host, error_count, blocked_until) VALUES(?,?,?) "
+                "ON CONFLICT(host) DO UPDATE SET "
+                "  error_count=excluded.error_count, "
+                "  blocked_until=excluded.blocked_until",
+                rows,
+            )
+            self.conn.commit()
+
+    # -- reporting helpers -----------------------------------------------------
+
+    def gbif_status_counts(self):
+        """Return {status: count} over the gbif_ids table."""
+        with self.lock:
+            return dict(self.conn.execute(
+                "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status"
+            ).fetchall())
diff --git a/image_install_db.py b/image_install_db.py
new file mode 100644
index 0000000..503a0a5
--- /dev/null
+++ b/image_install_db.py
@@ -0,0 +1,546 @@
+"""
+Image install script: download herbarium specimen images from a GBIF
+multimedia.txt file.
+
+Downloads ALL images for each gbifID. Each source URL (a GBIF "identifier") is
+saved as one file with an index suffix: <gbifID>-00.jpg, <gbifID>-01.jpg, ...
+A gbifID is marked 'done' only once every one of its images has succeeded.
+
+Status tracking
+---------------
+Per-image and per-gbifID status lives in a SQLite database (download_status.db,
+see download_db.py) instead of the old processed_ids.txt / failed_ids.txt flat
+files. Build the database once with init_download_db.py before the first run.
+
+The database lets the script:
+  * resume without re-reading the 59M-row multimedia.txt every run,
+  * retry only transient failures (timeout / rate-limit / 5xx / dropped
+    connection), capped at MAX_ATTEMPTS, and never re-hammer permanent 404s,
+  * record *why* each download failed so failures are queryable afterwards
+    (see status_report.py).
+
+Accurate as of May 2026.
+"""
+
+import os
+import time
+import random
+import logging
+import threading
+import datetime as dt
+from argparse import ArgumentParser
+from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import urllib3
+import requests as req
+from requests.exceptions import ConnectTimeout, ReadTimeout, Timeout, ConnectionError
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from PIL import UnidentifiedImageError
+
+from notifications import send_notification
+from image_utils import get_file_size_in_mb, resize_with_aspect_ratio
+import download_db as ddb
+from download_db import DownloadDB
+
+# verify=False is needed because many herbarium hosts have broken TLS certs.
+# Suppress the resulting per-request warning so it does not flood the .e log
+# (it previously produced ~134 MB of InsecureRequestWarning spam per run).
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# ---- configuration -----------------------------------------------------------
+
+INSTALL_PATH = "/projectnb/herbdl/data/GBIF-F25h"  # root of the hierarchical image tree
+LOG_DIR = "/projectnb/herbdl/logs"
+
+MAX_WORKERS = 5              # size of the download thread pool
+WORK_CHUNK = 20_000          # gbifIDs submitted to the pool at a time
+MIN_IMAGE_MB = 0.01          # files smaller than this are treated as invalid
+
+HOST_COOLDOWN_DEFAULT = 30 * 60   # seconds; cooldown when no usable Retry-After is given
+HOST_COOLDOWN_TIMEOUT = 60 * 60   # seconds; cooldown applied after a timeout
+HOST_ERROR_THRESHOLD = 500   # circuit breaker: skip a host after this many errors
+
+# ---- in-memory host circuit-breaker state (seeded from / saved to the DB) ----
+
+host_block_until = {}    # host -> time.time() value at which its cooldown ends
+host_error_counts = {}   # host -> number of non-rate-limit errors counted so far
+host_lock = threading.Lock()              # guards host_block_until
+circuit_breaker_lock = threading.Lock()   # guards host_error_counts
+counter_lock = threading.Lock()           # guards n_installed
+
+n_installed = 0  # images successfully downloaded during this run
+
+user_agents = [  # one entry is picked at random for every request
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+]
+
+session = req.Session()  # one session used for every manifest and image request
+retry_strategy = Retry(
+    total=2,
+    backoff_factor=1,
+    status_forcelist=[500, 502, 503, 504],  # statuses retried by the adapter itself
+    allowed_methods=["HEAD", "GET", "OPTIONS"],
+)
+adapter = HTTPAdapter(max_retries=retry_strategy)
+session.mount("http://", adapter)
+session.mount("https://", adapter)
+
+logger = logging.getLogger(__name__)
+
+
+# ---- paths -------------------------------------------------------------------
+
+def get_hierarchical_path(base_dir, gbif_id, suffix, ext=".jpg"):
+    """
+    Return <base_dir>/<AAA>/<BBB>/<gbifID><suffix><ext>, creating the directory.
+    AAA is the first three digits of the ID; BBB is digits 4-6, or '000' when
+    the ID has fewer than six. E.g. 1057161997, '-00' -> 105/716/1057161997-00.jpg
+    """
+    name = str(gbif_id)
+    top = name[:3]  # slicing already copes with IDs shorter than three chars
+    sub = "000" if len(name) < 6 else name[3:6]
+    folder = os.path.join(base_dir, top, sub)
+    os.makedirs(folder, exist_ok=True)
+    return os.path.join(folder, f"{name}{suffix}{ext}")
+
+
+# ---- host circuit breaker / cooldown -----------------------------------------
+
+def _host_from_url(url):
+    return urlparse(url).netloc.partition(":")[0]  # netloc minus any ":port"
+
+
+def is_host_blocked(url):
+    """True while url's host is in cooldown; an expired entry is dropped."""
+    host, checked_at = _host_from_url(url), time.time()
+    with host_lock:
+        expiry = host_block_until.get(host)
+        if expiry and checked_at >= expiry:
+            del host_block_until[host]
+        else:
+            return bool(expiry and checked_at < expiry)
+    return False
+
+
+def is_host_circuit_broken(url):
+    """True once url's host has accumulated HOST_ERROR_THRESHOLD errors."""
+    with circuit_breaker_lock:
+        return host_error_counts.get(_host_from_url(url), 0) >= HOST_ERROR_THRESHOLD
+
+
+def increment_host_errors(url, is_rate_limit=False):
+    """Add one error to url's host; log once when the breaker threshold is hit."""
+    if is_rate_limit:
+        return  # 429s get a timed cooldown instead of counting here
+    host = _host_from_url(url)
+    with circuit_breaker_lock:
+        total = host_error_counts.get(host, 0) + 1
+        host_error_counts[host] = total
+        if total == HOST_ERROR_THRESHOLD:
+            logger.error(f"CIRCUIT BREAKER: host '{host}' reached "
+                         f"{HOST_ERROR_THRESHOLD} errors; skipping it from now on.")
+
+
+def block_host(url, retry_after=None, timeout_issue=False):
+    """Start a cooldown for url's host (Retry-After honoured unless timeout_issue)."""
+    host, started = _host_from_url(url), time.time()
+    cooldown = HOST_COOLDOWN_TIMEOUT if timeout_issue else HOST_COOLDOWN_DEFAULT
+    if retry_after and not timeout_issue:
+        try:
+            cooldown = int(retry_after)  # delta-seconds form
+        except (TypeError, ValueError):
+            try:
+                from email.utils import parsedate_to_datetime
+                wait = parsedate_to_datetime(retry_after) - dt.datetime.now(
+                    dt.timezone.utc)
+                cooldown = max(0, wait.total_seconds())  # HTTP-date form
+            except Exception:
+                cooldown = HOST_COOLDOWN_DEFAULT
+    with host_lock:
+        host_block_until[host] = started + cooldown
+    reason = "timeout issues" if timeout_issue else "rate limiting"
+    logger.warning(f"Blocking host '{host}' due to {reason} for ~{int(cooldown)}s.")
+
+
+# ---- IIIF manifests ----------------------------------------------------------
+
+def extract_image_from_iiif_manifest(manifest_url, gbif_id):
+    """
+    Fetch a IIIF manifest and return (image_urls, error_type).
+
+    image_urls lists direct image URLs, highest resolution first. On failure it
+    is empty and error_type says why, so the caller can decide on a retry.
+    """
+    try:
+        response = session.get(
+            manifest_url,
+            verify=False,  # same broken-TLS hosts as the image downloads
+            headers={"User-Agent": random.choice(user_agents),
+                     "Accept": "application/json"},
+            timeout=120,
+        )
+        if response.status_code != 200:
+            logger.warning(f"IIIF manifest {gbif_id}: HTTP {response.status_code}")
+            return [], ddb.http_error_type(response.status_code)
+        image_urls = []
+        for item in response.json().get("items", []):
+            if item.get("type") != "Canvas":
+                continue
+            for anno_page in item.get("items", []):
+                if anno_page.get("type") != "AnnotationPage":
+                    continue
+                for anno in anno_page.get("items", []):
+                    body = anno.get("body")
+                    if not isinstance(body, dict):
+                        continue
+                    services = body.get("service") or []
+                    if isinstance(services, dict):  # a lone service object
+                        services = [services]
+                    for service in services:
+                        # Older image services spell the key '@id'.
+                        base_url = service.get("id") or service.get("@id")
+                        if base_url:
+                            base_url = base_url.rstrip("/")
+                            image_urls.extend(f"{base_url}/full/{w},/0/default.jpg"
+                                              for w in (1600, 1200, 800))
+        return (image_urls, None) if image_urls else ([], ddb.ERR_MANIFEST)
+
+    except (ConnectTimeout, ReadTimeout, Timeout) as e:
+        logger.warning(f"IIIF manifest {gbif_id}: timeout {e}")
+        return [], ddb.ERR_TIMEOUT
+    except ConnectionError as e:  # includes TLS failures; not a parse problem
+        logger.warning(f"IIIF manifest {gbif_id}: connection error {e}")
+        return [], ddb.ERR_CONNECTION
+    except Exception as e:
+        logger.warning(f"IIIF manifest {gbif_id}: parse error {e}")
+        return [], ddb.ERR_MANIFEST
+
+
+# ---- downloading -------------------------------------------------------------
+
+def _rm(path):
+    try:
+        os.unlink(path)  # identical to os.remove
+    except OSError:
+        return  # best effort: a missing or undeletable file is not an error
+
+
+def download_one_url(gbif_id, image_url, local_path):
+    """
+    Download a single URL to local_path, atomically.
+
+    Bytes are streamed to a .tmp file, length-checked against Content-Length
+    (unencoded responses only), then renamed into place -- so a dropped
+    connection never leaves a corrupt file behind. gbif_id is unused. Returns a
+    dict with keys: ok, size, http_status, error_type, error_detail, host.
+    """
+    host = _host_from_url(image_url)
+    tmp_path = local_path + ".tmp"
+
+    def fail(error_type, detail, http_status=None):
+        return {"ok": False, "size": None, "http_status": http_status,
+                "error_type": error_type, "error_detail": detail, "host": host}
+
+    try:
+        time.sleep(random.uniform(0.2, 0.8))
+        with session.get(
+            image_url,
+            stream=True,
+            verify=False,
+            headers={
+                "User-Agent": random.choice(user_agents),
+                "Connection": "keep-alive",
+                "Referer": "https://scc-ondemand1.bu.edu/",
+            },
+            timeout=180,
+        ) as resp:
+            status = resp.status_code
+
+            if status == 429:
+                increment_host_errors(image_url, is_rate_limit=True)
+                block_host(image_url, resp.headers.get("Retry-After"))
+                return fail(ddb.ERR_RATE_LIMITED, "HTTP 429", status)
+
+            if status != 200:
+                increment_host_errors(image_url)
+                return fail(ddb.http_error_type(status), f"HTTP {status}", status)
+
+            ctype = (resp.headers.get("Content-Type") or "").lower()
+            if ctype and any(bad in ctype for bad in
+                             ("text/html", "text/plain", "application/xml")):
+                increment_host_errors(image_url)
+                return fail(ddb.ERR_INVALID_CONTENT, f"Content-Type: {ctype}", status)
+
+            expected = resp.headers.get("Content-Length")
+            if resp.headers.get("Content-Encoding"):
+                # requests decompresses gzip/deflate while streaming, so the
+                # bytes written would not match the on-the-wire length.
+                expected = None
+            written = 0
+            with open(tmp_path, "wb") as out:
+                for chunk in resp.iter_content(chunk_size=65536):
+                    if chunk:
+                        out.write(chunk)
+                        written += len(chunk)
+
+            try:
+                declared = written if expected is None else int(expected)
+            except ValueError:
+                declared = written  # unparseable header: nothing to compare
+            if declared != written or written < 1024:
+                _rm(tmp_path)
+                detail = (f"only {written} bytes" if declared == written
+                          else f"expected {expected} bytes, got {written}")
+                return fail(ddb.ERR_TRUNCATED, detail, status)
+
+            os.replace(tmp_path, local_path)
+            return {"ok": True, "size": written, "http_status": status,
+                    "error_type": None, "error_detail": None, "host": host}
+
+    except (ConnectTimeout, ReadTimeout, Timeout) as e:
+        _rm(tmp_path)
+        block_host(image_url, timeout_issue=True)
+        return fail(ddb.ERR_TIMEOUT, str(e))
+    except (ConnectionError, req.exceptions.ChunkedEncodingError) as e:
+        # ChunkedEncodingError covers IncompleteRead -- a connection dropped
+        # mid-download, which leaves only a partial .tmp file.
+        _rm(tmp_path)
+        increment_host_errors(image_url)
+        return fail(ddb.ERR_CONNECTION, str(e))
+    except Exception as e:
+        _rm(tmp_path)
+        increment_host_errors(image_url)
+        return fail(ddb.ERR_OTHER, str(e))
+
+
+def resize_image(gbif_id, local_path):
+    options = {"max_size": 1024, "format": "JPEG", "quality": 85}
+    changed, new_size = resize_with_aspect_ratio(local_path, local_path, **options)
+    if changed:
+        logger.info(f"Resized {gbif_id} to {new_size} at {local_path}")
+
+
+def resolve_and_download(gbif_id, identifier_url, local_path):
+    """
+    Fetch one source identifier (one img_index) into exactly one file at
+    local_path.
+
+    A plain URL is its own single candidate. A IIIF manifest is expanded into
+    resolution variants that are tried highest-first; the first one to download
+    wins, so an identifier never yields more than one file.
+
+    Returns a dict whose 'outcome' is 'success', 'failed' or 'deferred'. The
+    first two also carry db_status, http_status, error_type, error_detail, host
+    and file_size. 'deferred' (no other keys) means every candidate host was
+    blocked or circuit-broken: nothing was attempted, so the image stays 'pending'.
+    """
+    # Identifiers that look like IIIF manifests are expanded into candidates.
+    is_manifest = identifier_url.endswith(".json") or "/manifest" in identifier_url
+    if is_manifest:
+        candidates, manifest_err = extract_image_from_iiif_manifest(
+            identifier_url, gbif_id)
+        if not candidates:
+            return {"outcome": "failed",
+                    "db_status": ddb.status_for_error(manifest_err),
+                    "http_status": None, "error_type": manifest_err,
+                    "error_detail": "IIIF manifest yielded no image URLs",
+                    "host": _host_from_url(identifier_url), "file_size": None}
+    else:
+        candidates = [identifier_url]
+
+    # dict.fromkeys drops duplicate URLs while keeping the
+    # highest-resolution-first order.
+    failures = []
+    for url in dict.fromkeys(candidates):
+        if is_host_circuit_broken(url) or is_host_blocked(url):
+            continue
+        result = download_one_url(gbif_id, url, local_path)
+        if not result["ok"]:
+            failures.append(result)
+            continue
+
+        # Downloaded: resize it; a file that cannot be read is a permanent failure.
+        try:
+            resize_image(gbif_id, local_path)
+        except (OSError, UnidentifiedImageError) as e:
+            _rm(local_path)
+            return {"outcome": "failed", "db_status": ddb.ST_FAILED_PERMANENT,
+                    "http_status": result["http_status"],
+                    "error_type": ddb.ERR_NOT_IMAGE,
+                    "error_detail": str(e), "host": result["host"],
+                    "file_size": None}
+        # Report the size on disk, falling back to the downloaded byte count.
+        try:
+            size = os.path.getsize(local_path)
+        except OSError:
+            size = result["size"]
+        return {"outcome": "success", "db_status": ddb.ST_SUCCESS,
+                "http_status": 200, "error_type": None,
+                "error_detail": None, "host": result["host"],
+                "file_size": size}
+
+    if not failures:
+        # No candidate was attempted because every host was blocked -- leave
+        # the image 'pending'.
+        return {"outcome": "deferred"}
+
+    # Prefer a transient failure as the recorded reason: if any candidate could
+    # still succeed later, the whole identifier is worth retrying.
+    transient = [f for f in failures if not ddb.is_permanent(f["error_type"])]
+    chosen = transient[0] if transient else failures[0]
+    db_status = ddb.ST_FAILED_TRANSIENT if transient else ddb.ST_FAILED_PERMANENT
+    return {"outcome": "failed", "db_status": db_status,
+            "http_status": chosen["http_status"],
+            "error_type": chosen["error_type"],
+            "error_detail": chosen["error_detail"],
+            "host": chosen["host"], "file_size": None}
+
+
+# ---- per-gbifID processing ---------------------------------------------------
+
+def process_id(db, gbif_id, total_to_install):
+    """Work through one gbifID's unfinished images, then refresh its DB status."""
+    global n_installed
+    for img_index, url, _host, status, attempts in db.get_images_for(gbif_id):
+        # Nothing to do for finished images, permanent failures, or transient
+        # failures that have used up their attempts.
+        retries_spent = (status == ddb.ST_FAILED_TRANSIENT
+                         and attempts >= db.max_attempts)
+        if status in (ddb.ST_SUCCESS, ddb.ST_FAILED_PERMANENT) or retries_spent:
+            continue
+
+        suffix = f"-{img_index:02d}"
+        local_path = get_hierarchical_path(INSTALL_PATH, gbif_id, suffix)
+
+        # A large-enough file already on disk is recorded as a success as-is.
+        if os.path.exists(local_path):
+            try:
+                size_mb = get_file_size_in_mb(local_path)
+            except OSError:
+                size_mb = 0.0
+            if size_mb >= MIN_IMAGE_MB:
+                db.record_image_result(
+                    gbif_id, img_index, ddb.ST_SUCCESS,
+                    host=_host_from_url(url), http_status=200,
+                    file_path=local_path, file_size=int(size_mb * 1024 * 1024),
+                    increment_attempts=False)
+                continue
+
+        result = resolve_and_download(gbif_id, url, local_path)
+        outcome = result["outcome"]
+        if outcome == "deferred":
+            continue  # host blocked; leave 'pending' for a later run
+        succeeded = outcome == "success"
+
+        db.record_image_result(
+            gbif_id, img_index, result["db_status"],
+            host=result.get("host"), http_status=result.get("http_status"),
+            error_type=result.get("error_type"),
+            error_detail=result.get("error_detail"),
+            file_path=local_path if succeeded else None,
+            file_size=result.get("file_size"))
+        if not succeeded:
+            continue
+
+        with counter_lock:
+            n_installed += 1
+            current = n_installed
+        if current % 50000 == 0:
+            send_notification(
+                "Image Installation",
+                f"Installed {current} images this run "
+                f"(work queue: {total_to_install} gbifIDs).")
+            logger.warning(f"Installed {current} images this run.")
+
+    db.finalize_gbif_id(gbif_id)
+
+
+# ---- main --------------------------------------------------------------------
+
+def main():
+    """Process the DB work queue with a thread pool, persisting host state as it goes."""
+    parser = ArgumentParser(description=__doc__)
+    parser.add_argument("-c", "--country", dest="country",
+                        help="(Unsupported) country filter -- multimedia.txt "
+                             "has no countryCode column; ignored.")
+    parser.add_argument("--db", default=ddb.DEFAULT_DB_PATH,
+                        help=f"Status database path (default: {ddb.DEFAULT_DB_PATH})")
+    args = parser.parse_args()
+
+    if args.country:
+        print("WARNING: -c/--country is ignored; the work queue comes from the "
+              "database and multimedia.txt has no countryCode column.")
+
+    today = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    os.makedirs(LOG_DIR, exist_ok=True)
+    # Log WARNING and above only: per-image successes live in the database, which
+    # keeps the log from growing to the ~1.4 GB seen with INFO-level logging.
+    logging.basicConfig(filename=f"{LOG_DIR}/image_install_{today}.log",
+                        level=logging.WARNING, filemode="w",
+                        format="%(asctime)s %(levelname)s %(message)s")
+
+    if not os.path.exists(args.db):
+        raise SystemExit(f"Status database not found: {args.db}\n"
+                         f"Build it once first:  python init_download_db.py")
+    db = DownloadDB(args.db)
+
+    # Seed the in-memory circuit breaker from the last run's host stats.
+    saved_errors, saved_blocks = db.load_host_state()
+    host_error_counts.update(saved_errors)
+    host_block_until.update(saved_blocks)
+    print(f"Loaded host state: {len(saved_errors)} hosts with errors, "
+          f"{len(saved_blocks)} currently blocked.")
+
+    work = db.get_work_gbif_ids()
+    total_to_install = len(work)
+    print(f"gbifIDs with work to do: {total_to_install}")
+    if total_to_install == 0:
+        print("Nothing to download. All gbifIDs are 'done' or terminally 'failed'.")
+        db.close()
+        return
+
+    send_notification("Image Installation",
+                      f"Starting run: {total_to_install} gbifIDs to process.")
+
+    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
+    futures = []
+    try:
+        for start in range(0, total_to_install, WORK_CHUNK):
+            chunk = work[start:start + WORK_CHUNK]
+            futures = [executor.submit(process_id, db, gid, total_to_install)
+                       for gid in chunk]
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as e:
+                    logger.error(f"Worker error: {e}", exc_info=True)
+            # Persist breaker state per chunk so a killed job (qsub h_rt) keeps it.
+            db.save_host_state(host_error_counts, host_block_until)
+            print(f"  processed {min(start + WORK_CHUNK, total_to_install)}"
+                  f"/{total_to_install} gbifIDs", end="\r")
+    except KeyboardInterrupt:
+        logger.warning("Interrupted by user; saving state and exiting.")
+        for future in futures:
+            future.cancel()  # queued work would otherwise block shutdown()
+    finally:
+        executor.shutdown(wait=True)  # only in-flight downloads remain after a cancel
+        db.save_host_state(host_error_counts, host_block_until)
+        counts = db.gbif_status_counts()
+        broken = sum(c >= HOST_ERROR_THRESHOLD for c in host_error_counts.values())
+        logger.warning(f"Run finished. Images installed this run: {n_installed}. "
+                       f"gbifID status: {counts}. Circuit-broken hosts: {broken}.")
+        for host, count in sorted(host_error_counts.items(),
+                                  key=lambda x: x[1], reverse=True)[:10]:
+            logger.warning(f"  host errors: {host}: {count}")
+        db.close()
+
+    print(f"\nDone. Images installed this run: {n_installed}")
+    print(f"gbifID status: {counts}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/image_install_db.sh b/image_install_db.sh
new file mode 100755
index 0000000..43b426b
--- /dev/null
+++ b/image_install_db.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -l
+
+# Run the SQLite-tracked image downloader (image_install_db.py).
+#
+# Prerequisite: build the status database once with init_download_db.sh.
+# This job is resumable -- re-submit it any time and it continues from the
+# work queue stored in download_status.db.
+
+module load miniconda
+module load academic-ml/spring-2026
+
+conda activate spring-2026-pyt
+
+python image_install_db.py
+
+### The command below is used to submit the job to the cluster
+### qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu image_install_db.sh
diff --git a/init_download_db.py b/init_download_db.py
new file mode 100644
index 0000000..65b23f2
--- /dev/null
+++ b/init_download_db.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+"""
+One-time builder for the image-download status database (download_status.db).
+
+What it does
+------------
+1. Creates the SQLite schema (see download_db.py).
+2. Reads multimedia.txt once and loads every (gbifID, image URL) pair into the
+   `images` table and every gbifID into `gbif_ids`. After this, runs of
+   image_install_db.py no longer need to re-read and re-group the 59M-row
+   multimedia.txt -- the work queue lives in the database.
+3. Imports processed_ids.txt: for each already-finished gbifID it locates the
+   downloaded file, renames legacy `<id>.jpg` to `<id>-00.jpg` so the dataset
+   uses one consistent naming scheme, and marks image index 0 as 'success'.
+   gbifIDs with more than one image are left 'partial' so the multi-image
+   downloader goes back and fetches their remaining images.
+
+   NOTE: the old one-image-per-ID downloader shuffled candidate URLs, so for a
+   multi-image gbifID we cannot know which URL the existing file came from. It
+   is recorded against img_index 0 with error_type 'legacy_unverified_index'.
+   ~87% of gbifIDs have only one image, where this assignment is exact.
+
+failed_ids.txt is intentionally NOT imported: those IDs stay 'pending' and get
+a fresh, fully-tracked retry.
+
+This script is destructive-ish (it renames files and can drop an existing DB
+with --reset). It does not download anything. Run it once before the first
+run of image_install_db.py.
+
+Usage
+-----
+    python init_download_db.py                 # build DB + import legacy
+    python init_download_db.py --skip-legacy   # build DB only
+    python init_download_db.py --reset         # rebuild from scratch
+"""
+
+import os
+import sys
+import time
+import sqlite3
+import argparse
+
+import pandas as pd
+
+import download_db as ddb
+
+GBIF_MULTIMEDIA_DATA = "/projectnb/herbdl/data/GBIF-F25/multimedia.txt"
+INSTALL_PATH = "/projectnb/herbdl/data/GBIF-F25h"
+PROCESSED_FILE = "processed_ids.txt"
+
+INSERT_BATCH = 200_000
+LEGACY_BATCH = 50_000
+
+
+def hierarchical_path(base_dir, gbif_id, suffix=""):
+    """Same layout as image_install_db.get_hierarchical_path, but creates nothing."""
+    name = str(gbif_id)
+    top = name[:3]  # a short ID is simply used whole
+    sub = "000" if len(name) < 6 else name[3:6]
+    return os.path.join(base_dir, top, sub, f"{name}{suffix}.jpg")
+
+
+def parse_args():
+    """Build this script's argument parser and parse the command line."""
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    opt = parser.add_argument
+    opt("--db", default=ddb.DEFAULT_DB_PATH,
+        help=f"Path to the SQLite database (default: {ddb.DEFAULT_DB_PATH})")
+    opt("--multimedia", default=GBIF_MULTIMEDIA_DATA,
+        help="GBIF multimedia.txt to ingest")
+    opt("--install-path", default=INSTALL_PATH,
+        help="Root directory where images are stored")
+    opt("--processed-file", default=PROCESSED_FILE,
+        help="processed_ids.txt to import as already-done gbifIDs")
+    opt("--skip-legacy", action="store_true", help="Do not import processed_ids.txt")
+    opt("--reset", action="store_true", help="Delete an existing database before building")
+    return parser.parse_args()
+
+
+def ingest_multimedia(conn, multimedia_path):
+    """Load every image URL from multimedia.txt into images + gbif_ids."""
+    import csv  # local import: only this function needs it
+
+    print(f"Reading {multimedia_path} ...")
+    df = pd.read_csv(
+        multimedia_path,
+        delimiter="\t",
+        # GBIF exports are unquoted; a stray '"' in a free-text column must not
+        # be read as opening a quoted field that swallows the rows after it.
+        quoting=csv.QUOTE_NONE,
+        usecols=lambda c: c in ("gbifID", "identifier"),
+        on_bad_lines="skip",
+    )
+    df = df.dropna(subset=["gbifID", "identifier"])
+    df["gbifID"] = df["gbifID"].astype("int64")
+    df["identifier"] = df["identifier"].astype("string")
+    print(f"  {len(df):,} (gbifID, URL) rows")
+
+    # Sort so each gbifID's rows are contiguous, then number them 0,1,2,...
+    df = df.sort_values("gbifID", kind="stable").reset_index(drop=True)
+    df["img_index"] = df.groupby("gbifID").cumcount()
+    df["host"] = df["identifier"].str.extract(
+        r"^[a-zA-Z][a-zA-Z0-9+.-]*://([^/:]+)", expand=False).fillna("")
+
+    print("  Inserting image rows ...")
+    inserted = 0
+    for start in range(0, len(df), INSERT_BATCH):
+        sub = df.iloc[start:start + INSERT_BATCH]
+        rows = list(zip(
+            sub["gbifID"].tolist(),
+            sub["img_index"].tolist(),
+            sub["identifier"].tolist(),
+            sub["host"].tolist(),
+        ))
+        conn.executemany(
+            "INSERT OR IGNORE INTO images(gbif_id, img_index, url, host) "
+            "VALUES(?,?,?,?)", rows)
+        conn.commit()
+        inserted += len(rows)
+        print(f"    {inserted:,}/{len(df):,} image rows", end="\r")
+    print(f"    {inserted:,} image rows inserted        ")
+
+    print("  Inserting gbifID rows ...")
+    sizes = df.groupby("gbifID").size()
+    gid_rows = list(zip(sizes.index.tolist(), sizes.tolist()))
+    for start in range(0, len(gid_rows), INSERT_BATCH):
+        conn.executemany(
+            "INSERT OR IGNORE INTO gbif_ids(gbif_id, n_images) VALUES(?,?)",
+            gid_rows[start:start + INSERT_BATCH],
+        )
+        conn.commit()
+    print(f"    {len(gid_rows):,} gbifIDs inserted")
+
+
+def import_legacy(conn, processed_file, install_path):
+    """Mark image 0 of each gbifID in processed_file as 'success', renaming legacy files."""
+    if not os.path.exists(processed_file):
+        print(f"  {processed_file} not found -- skipping legacy import.")
+        return
+
+    print(f"Importing already-processed gbifIDs from {processed_file} ...")
+    renamed = relabeled = missing = 0
+    updates = []
+
+    def flush(batch):  # apply one batch of per-image updates and commit
+        if batch:
+            conn.executemany(
+                "UPDATE images SET status='success', "
+                "  error_type=?, file_path=?, file_size=?, "
+                "  last_attempt_at=datetime('now') "
+                "WHERE gbif_id=? AND img_index=0",
+                batch,
+            )
+            conn.commit()
+
+    with open(processed_file) as fh:
+        for line in fh:
+            gid = line.strip()
+            if not gid or not gid.isdigit():  # skip blank / non-numeric lines
+                continue
+
+            new_path = hierarchical_path(install_path, gid, "-00")
+            old_path = hierarchical_path(install_path, gid, "")
+
+            if os.path.exists(new_path):  # already stored under the -00 name
+                path = new_path
+                relabeled += 1
+            elif os.path.exists(old_path):  # legacy <id>.jpg: rename to <id>-00.jpg
+                try:
+                    os.rename(old_path, new_path)
+                except OSError:  # a failed rename is counted as a missing file
+                    missing += 1
+                    continue
+                path = new_path
+                renamed += 1
+            else:
+                missing += 1
+                continue
+
+            try:
+                size = os.path.getsize(path)
+            except OSError:
+                missing += 1
+                continue
+
+            updates.append((ddb.ERR_LEGACY, path, size, int(gid)))
+            if len(updates) >= LEGACY_BATCH:
+                flush(updates)
+                updates = []
+                print(f"    renamed={renamed:,} relabeled={relabeled:,} "
+                      f"missing={missing:,}", end="\r")
+    flush(updates)
+    print(f"    renamed={renamed:,}  already-suffixed={relabeled:,}  "
+          f"file-missing={missing:,}")
+
+    # Roll the per-image success flags up into gbif_ids statuses in one pass.
+    print("  Recomputing gbifID statuses ...")
+    conn.execute(
+        "UPDATE gbif_ids SET "
+        "  n_success=(SELECT COUNT(*) FROM images i "
+        "             WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success'), "
+        "  status=CASE "
+        "    WHEN n_images>0 AND n_images=(SELECT COUNT(*) FROM images i "
+        "         WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success') "
+        "      THEN 'done' "
+        "    WHEN (SELECT COUNT(*) FROM images i "
+        "         WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success')>0 "
+        "      THEN 'partial' "
+        "    ELSE 'pending' END "
+        "WHERE gbif_id IN (SELECT DISTINCT gbif_id FROM images "
+        "                  WHERE status='success')"
+    )
+    conn.execute(
+        "UPDATE gbif_ids SET completed_at=datetime('now') "
+        "WHERE status='done' AND completed_at IS NULL"
+    )
+    conn.commit()
+
+
+def main():
+    """Create the status database, ingest multimedia.txt, then import legacy IDs."""
+    args = parse_args()
+    started = time.time()
+
+    if os.path.exists(args.db):
+        if not args.reset:
+            sys.exit(f"Database already exists: {args.db}\n"
+                     f"Pass --reset to rebuild it from scratch.")
+        print(f"Removing existing database {args.db}")
+        for suffix in ("", "-wal", "-shm"):
+            try:
+                os.remove(args.db + suffix)
+            except FileNotFoundError:
+                pass
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.db)), exist_ok=True)
+    conn = sqlite3.connect(args.db)
+    # Fast bulk-load settings: the DB can be rebuilt from scratch, so durability
+    # during ingest is not needed.
+    for pragma in ("journal_mode=OFF", "synchronous=OFF",
+                   "cache_size=-200000"):  # cache: ~200 MB of pages
+        conn.execute(f"PRAGMA {pragma}")
+
+    print("Creating schema ...")
+    ddb.create_tables(conn)
+
+    ingest_multimedia(conn, args.multimedia)
+
+    print("Building indexes (this takes a few minutes) ...")
+    ddb.create_indexes(conn)
+
+    if not args.skip_legacy:
+        import_legacy(conn, args.processed_file, args.install_path)
+
+    print("\nFinal gbifID status counts:")
+    query = "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status ORDER BY status"
+    for status, count in conn.execute(query):
+        print(f"  {status:10s} {count:,}")
+
+    conn.close()
+    elapsed = time.time() - started
+    print(f"\nDone in {elapsed:.0f}s. Database: {os.path.abspath(args.db)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/init_download_db.sh b/init_download_db.sh
new file mode 100755
index 0000000..5c5278a
--- /dev/null
+++ b/init_download_db.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -l
+
+# One-time build of the image-download status database (download_status.db).
+#
+# This is heavy: it reads the ~59M-row multimedia.txt with pandas and renames
+# up to ~13.5M already-downloaded files. Run it as a batch job, not on a login
+# node. It only needs to be run once; after that, just (re-)submit
+# image_install.sh to download and resume.
+
+module load miniconda
+module load academic-ml/spring-2026
+
+conda activate spring-2026-pyt
+
+# --processed-file points at the production processed_ids.txt, which lives in
+# ljhao's working directory, not this repo.
+python init_download_db.py \
+    --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt
+
+### The command below is used to submit the job to the cluster
+### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh
diff --git a/status_report.py b/status_report.py
new file mode 100644
index 0000000..9c8e4b5
--- /dev/null
+++ b/status_report.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+Report image-download progress straight from the SQLite status database.
+
+This replaces analyze_image_progress.py: instead of loading ~180 MB of text
+checkpoint files and re-grouping the 59M-row multimedia.txt with pandas, every
+number here is a single indexed SQL query, so the report returns in seconds.
+
+The same numbers are available ad hoc -- a few useful queries:
+
+    -- how many of each kind of failure?
+    SELECT error_type, COUNT(*) FROM images
+    WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
+
+    -- every URL still worth retrying
+    SELECT gbif_id, url FROM images WHERE status='failed_transient';
+
+    -- worst hosts
+    SELECT host, COUNT(*) FROM images WHERE status LIKE 'failed%'
+    GROUP BY host ORDER BY 2 DESC LIMIT 20;
+
+Usage:
+    python status_report.py [--db PATH] [--output-dir DIR]
+"""
+
+import os
+import sqlite3
+import argparse
+from datetime import datetime
+
+import download_db as ddb
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--db", default=ddb.DEFAULT_DB_PATH,
+                   help=f"Status database path (default: {ddb.DEFAULT_DB_PATH})")
+    p.add_argument("--output-dir", default=os.getcwd(),
+                   help="Directory for the summary_YYYYMMDDHHMM.txt file")
+    return p.parse_args()
+
+
+def main():
+    args = parse_args()
+    if not os.path.exists(args.db):
+        raise SystemExit(f"Status database not found: {args.db}")
+
+    conn = sqlite3.connect(args.db)
+    run_time = datetime.now()
+    output_file = os.path.join(
+        args.output_dir, f"summary_{run_time:%Y%m%d%H%M}.txt")
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    with open(output_file, "w") as out:
+        def write(msg=""):
+            out.write(msg + "\n")
+            print(msg)
+
+        def section(title):
+            write()
+            write("=" * 70)
+            write(title)
+            write("=" * 70)
+
+        write(f"Run date: {run_time:%Y-%m-%d %H:%M:%S}")
+        write(f"Database: {os.path.abspath(args.db)}")
+
+        # -- gbifID progress --------------------------------------------------
+        section("GBIFID PROGRESS")
+        gbif_counts = dict(conn.execute(
+            "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status").fetchall())
+        total_ids = sum(gbif_counts.values())
+        write(f"Total gbifIDs:                  {total_ids:,}")
+        for status in (ddb.G_DONE, ddb.G_PARTIAL, ddb.G_PENDING, ddb.G_FAILED):
+            count = gbif_counts.get(status, 0)
+            pct = (count / total_ids * 100) if total_ids else 0.0
+            write(f"  {status:10s}                  {count:>14,}  ({pct:5.2f}%)")
+        remaining = gbif_counts.get(ddb.G_PENDING, 0) + gbif_counts.get(ddb.G_PARTIAL, 0)
+        write(f"Still in the work queue:        {remaining:,}")
+
+        # -- per-image progress ----------------------------------------------
+        section("IMAGE (URL) PROGRESS")
+        img_counts = dict(conn.execute(
+            "SELECT status, COUNT(*) FROM images GROUP BY status").fetchall())
+        total_imgs = sum(img_counts.values())
+        write(f"Total image URLs:               {total_imgs:,}")
+        for status in (ddb.ST_SUCCESS, ddb.ST_PENDING,
+                       ddb.ST_FAILED_TRANSIENT, ddb.ST_FAILED_PERMANENT):
+            count = img_counts.get(status, 0)
+            pct = (count / total_imgs * 100) if total_imgs else 0.0
+            write(f"  {status:18s}          {count:>14,}  ({pct:5.2f}%)")
+
+        # -- failure breakdown ------------------------------------------------
+        section("FAILURES BY TYPE")
+        write(f"{'error_type':24s} {'count':>14s}  {'verdict':s}")
+        write("-" * 60)
+        rows = conn.execute(
+            "SELECT error_type, COUNT(*) FROM images "
+            "WHERE status LIKE 'failed%' AND error_type IS NOT NULL "
+            "GROUP BY error_type ORDER BY 2 DESC").fetchall()
+        for error_type, count in rows:
+            verdict = "permanent" if ddb.is_permanent(error_type) else "retryable"
+            write(f"{error_type:24s} {count:>14,}  {verdict}")
+        if not rows:
+            write("(no failures recorded yet)")
+
+        # -- retry attempt distribution --------------------------------------
+        section("RETRY ATTEMPTS (failed_transient images)")
+        rows = conn.execute(
+            "SELECT attempts, COUNT(*) FROM images "
+            "WHERE status='failed_transient' GROUP BY attempts ORDER BY attempts"
+        ).fetchall()
+        for attempts, count in rows:
+            note = "  <- retry budget exhausted" if attempts >= ddb.MAX_ATTEMPTS else ""
+            write(f"  {attempts} attempt(s): {count:,}{note}")
+        if not rows:
+            write("(none)")
+
+        # -- worst hosts ------------------------------------------------------
+        section("TOP 20 HOSTS BY FAILED IMAGES")
+        write(f"{'host':40s} {'failed':>10s} {'success':>10s}")
+        write("-" * 64)
+        rows = conn.execute(
+            "SELECT host, "
+            "  SUM(CASE WHEN status LIKE 'failed%' THEN 1 ELSE 0 END) AS failed, "
+            "  SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) AS ok "
+            "FROM images WHERE host IS NOT NULL AND host != '' "
+            "GROUP BY host ORDER BY failed DESC LIMIT 20").fetchall()
+        for host, failed, ok in rows:
+            write(f"{host[:40]:40s} {failed or 0:>10,} {ok or 0:>10,}")
+
+        # -- circuit-breaker state -------------------------------------------
+        section("CIRCUIT BREAKER / COOLDOWNS")
+        broken = conn.execute(
+            "SELECT COUNT(*) FROM hosts WHERE error_count >= 500").fetchone()[0]
+        blocked = conn.execute(
+            "SELECT COUNT(*) FROM hosts "
+            "WHERE blocked_until IS NOT NULL "
+            "AND blocked_until > strftime('%s','now')").fetchone()[0]
+        write(f"Hosts past the circuit-breaker threshold (500 errors): {broken:,}")
+        write(f"Hosts currently in cooldown:                          {blocked:,}")
+
+        section("NOTES")
+        write("- 'done'    = every image URL for the gbifID succeeded.")
+        write("- 'partial' = still has retryable work; stays in the queue.")
+        write("- 'failed'  = all images terminal, not all succeeded; no retries left.")
+        write("- failed_transient images are retried until "
+              f"{ddb.MAX_ATTEMPTS} attempts, then count toward 'failed'.")
+        write()
+        write(f"Summary written to: {os.path.abspath(output_file)}")
+
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()

From 0b59c51c677d2333a750ef8d851dcf67e1bfb881 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 09:38:19 -0400
Subject: [PATCH 03/19] ignore generated summary files

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index a230a78..b85c0e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 .venv/
 __pycache__/
+
+# ignore generated summary files
+summary*.txt

From 8f3e299dc080463267497aa7c27273cb19678ce7 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 13:37:07 -0400
Subject: [PATCH 04/19] ignore batch scheduler log files

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index b85c0e6..68a21d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,7 @@ __pycache__/
 
 # ignore generated summary files
 summary*.txt
+
+# ignore batch scheduler log files
+*.e[0-9]*
+*.o[0-9]*

From 8a9d326ab80fb4f7a396b0217bdc3f27c4d46f88 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 13:39:24 -0400
Subject: [PATCH 05/19] add legacy-only mode to finish reading processed IDs,
 and allow concurrent read-only access to db

---
 DEPLOYMENT.md       |  13 ++++--
 README.md           |   1 +
 init_download_db.py | 104 +++++++++++++++++++++++++++++++++++---------
 init_download_db.sh |   4 +-
 4 files changed, 96 insertions(+), 26 deletions(-)

diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md
index e487545..d7ad4c8 100644
--- a/DEPLOYMENT.md
+++ b/DEPLOYMENT.md
@@ -75,6 +75,7 @@ python init_download_db.py \
 |---|---|
 | `python init_download_db.py` | Build DB + import legacy progress |
 | `python init_download_db.py --skip-legacy` | Build DB only (everything starts `pending`) |
+| `python init_download_db.py --legacy-only` | Skip the ingest; only (re-)run the legacy import on an existing DB |
 | `python init_download_db.py --reset` | Delete an existing DB and rebuild from scratch |
 
 **Expected output** — a status breakdown, e.g.:
@@ -90,8 +91,11 @@ Final gbifID status counts:
 - `partial` — has an image already (legacy first image) but more to fetch
 - `pending` — never attempted
 
-Re-running the builder is safe: file renames are idempotent (already-renamed
-files are detected and reused). If a build fails partway, re-run with `--reset`.
+Re-running is safe: file renames and database updates are idempotent
+(already-renamed files are detected and reused). If the **ingest** fails partway,
+re-run with `--reset`. If only the **legacy import** fails partway (e.g. it was
+interrupted), re-run with `--legacy-only` — that finishes the import without
+redoing the hour-long ingest.
 
 ---
 
@@ -206,7 +210,8 @@ must be re-run.
 | Symptom | Fix |
 |---|---|
 | `Status database not found` | Run Phase 1 first (`init_download_db.sh`). |
-| `Database already exists` from the builder | Intended guard — pass `--reset` to rebuild. |
-| `database is locked` | Another process is using the DB; ensure only one downloader job runs. The code already sets a 120 s busy timeout. |
+| `Database already exists` from the builder | Intended guard — `--reset` to rebuild, or `--legacy-only` to just (re-)run the legacy import. |
+| `database is locked` | The builder now uses WAL mode (readers do not block the writer) and a 120 s busy timeout, so this should not recur. If the legacy import was interrupted by it, finish it with `init_download_db.py --legacy-only`. Still avoid running two writers against one DB. |
+| Legacy import interrupted partway | Re-run `init_download_db.py --legacy-only` — it is idempotent and skips the hour-long ingest. |
 | Builder runs out of memory | `multimedia.txt` is large; request more memory (e.g. a larger `-pe omp` slot count). |
 | Legacy progress not imported | `--processed-file` was not pointed at ljhao's `processed_ids.txt`. |
diff --git a/README.md b/README.md
index 2736324..930339a 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,7 @@ qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_emai
 ```bash
 python init_download_db.py                 # build DB + import legacy progress
 python init_download_db.py --skip-legacy   # build DB only
+python init_download_db.py --legacy-only   # (re-)run only the legacy import
 python init_download_db.py --reset         # rebuild from scratch
 ```
 
diff --git a/init_download_db.py b/init_download_db.py
index 65b23f2..ca03bf9 100644
--- a/init_download_db.py
+++ b/init_download_db.py
@@ -27,10 +27,14 @@
 with --reset). It does not download anything. Run it once before the first
 run of image_install_db.py.
 
+The legacy import is idempotent and resumable: if it is interrupted, re-run
+with --legacy-only to finish it without redoing the multimedia ingest.
+
 Usage
 -----
     python init_download_db.py                 # build DB + import legacy
     python init_download_db.py --skip-legacy   # build DB only
+    python init_download_db.py --legacy-only   # (re-)run only the legacy import
     python init_download_db.py --reset         # rebuild from scratch
 """
 
@@ -73,6 +77,9 @@ def parse_args():
                    help="processed_ids.txt to import as already-done gbifIDs")
     p.add_argument("--skip-legacy", action="store_true",
                    help="Do not import processed_ids.txt")
+    p.add_argument("--legacy-only", action="store_true",
+                   help="Skip the ingest; only (re-)run the legacy import on an "
+                        "existing database (use this to finish an interrupted import)")
     p.add_argument("--reset", action="store_true",
                    help="Delete an existing database before building")
     return p.parse_args()
@@ -144,15 +151,28 @@ def import_legacy(conn, processed_file, install_path):
     updates = []
 
     def flush(batch):
-        if batch:
-            conn.executemany(
-                "UPDATE images SET status='success', "
-                "  error_type=?, file_path=?, file_size=?, "
-                "  last_attempt_at=datetime('now') "
-                "WHERE gbif_id=? AND img_index=0",
-                batch,
-            )
-            conn.commit()
+        if not batch:
+            return
+        # WAL mode + the 120 s busy timeout make a lock here very unlikely, but
+        # retry rather than throw away a long-running import if one occurs.
+        for attempt in range(1, 4):
+            try:
+                conn.executemany(
+                    "UPDATE images SET status='success', "
+                    "  error_type=?, file_path=?, file_size=?, "
+                    "  last_attempt_at=datetime('now') "
+                    "WHERE gbif_id=? AND img_index=0",
+                    batch,
+                )
+                conn.commit()
+                return
+            except sqlite3.OperationalError as e:
+                if "locked" in str(e).lower() and attempt < 3:
+                    print(f"\n  database locked; retry {attempt}/3 "
+                          f"in {10 * attempt}s ...")
+                    time.sleep(10 * attempt)
+                    continue
+                raise
 
     with open(processed_file) as fh:
         for line in fh:
@@ -218,10 +238,55 @@ def flush(batch):
     conn.commit()
 
 
+def report_status(conn):
+    """Print the gbifID status breakdown."""
+    print("\nFinal gbifID status counts:")
+    for status, count in conn.execute(
+        "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status ORDER BY status"
+    ):
+        print(f"  {status:10s} {count:,}")
+
+
+def connect(db_path, bulk_load):
+    """
+    Open the database with a 120 s busy timeout, so a momentary lock from a
+    concurrent reader (e.g. status_report.py) makes the write wait rather than
+    abort the run.
+
+    bulk_load=True  -> fastest, no durability (for the rebuildable ingest).
+    bulk_load=False -> WAL + synchronous=NORMAL: durable, and readers never
+                       block the writer (used for the legacy import).
+    """
+    conn = sqlite3.connect(db_path, timeout=120)
+    conn.execute("PRAGMA busy_timeout=120000")
+    conn.execute("PRAGMA cache_size=-200000")  # ~200 MB page cache
+    if bulk_load:
+        conn.execute("PRAGMA journal_mode=OFF")
+        conn.execute("PRAGMA synchronous=OFF")
+    else:
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA synchronous=NORMAL")
+    return conn
+
+
 def main():
     args = parse_args()
     start = time.time()
 
+    # --legacy-only: skip the ingest and just (re-)run the legacy import. Use
+    # this to finish an interrupted import without rebuilding the database.
+    if args.legacy_only:
+        if not os.path.exists(args.db):
+            sys.exit(f"--legacy-only needs an existing database, but none was "
+                     f"found at: {args.db}\nRun the full build first.")
+        print(f"--legacy-only: (re-)running the legacy import on {args.db}")
+        conn = connect(args.db, bulk_load=False)
+        import_legacy(conn, args.processed_file, args.install_path)
+        report_status(conn)
+        conn.close()
+        print(f"\nDone in {time.time() - start:.0f}s.")
+        return
+
     if os.path.exists(args.db):
         if args.reset:
             print(f"Removing existing database {args.db}")
@@ -232,15 +297,13 @@ def main():
                     pass
         else:
             sys.exit(f"Database already exists: {args.db}\n"
-                     f"Pass --reset to rebuild it from scratch.")
+                     f"  --reset        rebuild it from scratch\n"
+                     f"  --legacy-only  (re-)run just the legacy import on it")
 
     os.makedirs(os.path.dirname(os.path.abspath(args.db)), exist_ok=True)
-    conn = sqlite3.connect(args.db)
     # Fast bulk-load settings; the DB is fully rebuildable, so durability during
-    # ingest is not needed. image_install_db.py switches it to WAL later.
-    conn.execute("PRAGMA journal_mode=OFF")
-    conn.execute("PRAGMA synchronous=OFF")
-    conn.execute("PRAGMA cache_size=-200000")  # ~200 MB page cache
+    # the ingest is not needed.
+    conn = connect(args.db, bulk_load=True)
 
     print("Creating schema ...")
     ddb.create_tables(conn)
@@ -251,14 +314,13 @@ def main():
     ddb.create_indexes(conn)
 
     if not args.skip_legacy:
+        # Switch to a durable, reader-tolerant mode for the legacy import: it
+        # renames files on disk, so a crash here is costlier than during ingest.
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA synchronous=NORMAL")
         import_legacy(conn, args.processed_file, args.install_path)
 
-    print("\nFinal gbifID status counts:")
-    for status, count in conn.execute(
-        "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status ORDER BY status"
-    ):
-        print(f"  {status:10s} {count:,}")
-
+    report_status(conn)
     conn.close()
     print(f"\nDone in {time.time() - start:.0f}s. Database: {os.path.abspath(args.db)}")
 
diff --git a/init_download_db.sh b/init_download_db.sh
index 5c5278a..1fde26b 100755
--- a/init_download_db.sh
+++ b/init_download_db.sh
@@ -14,7 +14,9 @@ conda activate spring-2026-pyt
 
 # --processed-file points at the production processed_ids.txt, which lives in
 # ljhao's working directory, not this repo.
-python init_download_db.py \
+# --legacy-only means only process the legacy images without creating the database.
+#    Run this on subsequent times after db was created.
+python init_download_db.py --legacy-only \
     --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt
 
 ### The command below is used to submit the job to the cluster

From 5abbaea9a064f56d818c55be5342777bd9a9f725 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 16:11:34 -0400
Subject: [PATCH 06/19] doc on how to download GBIF metadata

---
 README.md                      |  2 +-
 docs/gbif-metadata-download.md | 71 ++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 docs/gbif-metadata-download.md

diff --git a/README.md b/README.md
index 930339a..d01c343 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Utils Directory
+# GBIF Image Data Download
 
 This directory contains utility scripts for managing herbarium specimen images, including downloading, processing, organizing, and labeling datasets.
 
diff --git a/docs/gbif-metadata-download.md b/docs/gbif-metadata-download.md
new file mode 100644
index 0000000..003f695
--- /dev/null
+++ b/docs/gbif-metadata-download.md
@@ -0,0 +1,71 @@
+# GBIF Metadata Download
+
+To get a complete list of all entries for vascular plants that include herbaria image assets on [GBIF](https://www.gbif.org/), you need to filter by a specific taxonomic phylum, restrict the results to preserved specimens (herbaria), and ensure they contain image media.
+
+Because this query will yield tens of millions of records, navigating them via the web interface is inefficient. The standard way to get this list is by initiating a filtered asynchronous download.
+
+Here is how you can do it using either the GBIF web interface or the GBIF API:
+
+### Method 1: Using the GBIF Web Interface
+
+1. Go to the [GBIF Occurrence Search Page](https://www.gbif.org/occurrence/search).
+2. Apply the following filters in the left-hand panel:
+   * **Scientific Name / Taxon**: Search for and select **`Tracheophyta`** (this is the phylum name for all vascular plants).
+   * **Basis of Record**: Select **`Preserved specimen`** (this restricts your search to herbarium sheets and physical collections rather than citizen science observations).
+   * **Media Type**: Select **`Image`** (this ensures every record has an attached digital photo asset).
+
+
+3. Once the filters are applied, click the **Download** button at the top right of the search panel.
+4. Choose the **Darwin Core Archive (DwC-A)** format. This format is ideal because it generates a `.zip` package containing:
+   * `occurrence.txt`: The main list of data entries.
+   * `multimedia.txt`: A ledger mapping the occurrences directly to their herbarium image URLs.
+
+
+
+---
+
+### Method 2: Programmatically via the GBIF API
+
+If you want to automate the request or incorporate it into a script (using Python, R, or `curl`), you can send a `POST` request to the GBIF download API using the exact keys for your filters.
+
+**Taxon Key for Tracheophyta:** `7707728`
+
+#### Example API Request Payload
+
+Send the JSON object below in a `POST` request to `https://api.gbif.org/v1/occurrence/download/request`. The request must be authenticated with your GBIF account credentials (HTTP Basic authentication):
+
+```json
+{
+  "creator": "your_gbif_username",
+  "notificationAddresses": [
+    "your_email@example.com"
+  ],
+  "sendNotification": true,
+  "format": "DWCA",
+  "predicate": {
+    "type": "and",
+    "predicates": [
+      {
+        "type": "equals",
+        "key": "TAXON_KEY",
+        "value": "7707728"
+      },
+      {
+        "type": "equals",
+        "key": "BASIS_OF_RECORD",
+        "value": "PRESERVED_SPECIMEN"
+      },
+      {
+        "type": "equals",
+        "key": "MEDIA_TYPE",
+        "value": "StillImage"
+      }
+    ]
+  }
+}
+
+```
+
+### Pro-Tip for Data Handling
+
+Once your download is processed and unzipped, the `multimedia.txt` file will serve as your master list for image links, which you can link back to the metadata in `occurrence.txt` using the shared `gbifID` column. If you are using Python, you can also use the [`plantnet/gbif-dl`](https://github.com/plantnet/gbif-dl) library, which is specifically designed to parse these queries and download the image assets efficiently.
\ No newline at end of file

From ed94e6007ff0cd62ca5fdd33143240f4ead3d2b2 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 16:11:53 -0400
Subject: [PATCH 07/19] flush printing so it shows up in logs right away

---
 image_install_db.py | 22 ++++++++++++++++++++--
 init_download_db.py | 22 +++++++++++++++++++---
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/image_install_db.py b/image_install_db.py
index 503a0a5..a2fe443 100644
--- a/image_install_db.py
+++ b/image_install_db.py
@@ -23,6 +23,7 @@
 """
 
 import os
+import sys
 import time
 import random
 import logging
@@ -92,6 +93,18 @@
 logger = logging.getLogger(__name__)
 
 
+def progress(msg):
+    """
+    Print a progress line, flushed immediately so a batch job's .o log updates
+    live instead of only at exit. Overwrites in place on an interactive
+    terminal; writes one line per update when redirected to a log file.
+    """
+    if sys.stdout.isatty():
+        print(f"\r{msg}", end="", flush=True)
+    else:
+        print(msg, flush=True)
+
+
 # ---- paths -------------------------------------------------------------------
 
 def get_hierarchical_path(base_dir, gbif_id, suffix, ext=".jpg"):
@@ -462,6 +475,10 @@ def process_id(db, gbif_id, total_to_install):
 # ---- main --------------------------------------------------------------------
 
 def main():
+    # Line-buffer stdout so progress appears in a batch job's .o log live,
+    # not only when the job finishes.
+    sys.stdout.reconfigure(line_buffering=True)
+
     parser = ArgumentParser(description=__doc__)
     parser.add_argument("-c", "--country", dest="country",
                         help="(Unsupported) country filter -- multimedia.txt "
@@ -522,8 +539,9 @@ def main():
                 # Persist circuit-breaker state periodically so a killed job
                 # (e.g. qsub h_rt limit) does not lose it.
                 db.save_host_state(host_error_counts, host_block_until)
-                print(f"  processed {min(start + WORK_CHUNK, total_to_install)}"
-                      f"/{total_to_install} gbifIDs", end="\r")
+                progress(f"  processed "
+                         f"{min(start + WORK_CHUNK, total_to_install)}"
+                         f"/{total_to_install} gbifIDs")
     except KeyboardInterrupt:
         logger.warning("Interrupted by user; saving state and exiting.")
     finally:
diff --git a/init_download_db.py b/init_download_db.py
index ca03bf9..dac6214 100644
--- a/init_download_db.py
+++ b/init_download_db.py
@@ -56,6 +56,18 @@
 LEGACY_BATCH = 50_000
 
 
+def progress(msg):
+    """
+    Print a progress line, flushed immediately so a batch job's .o log updates
+    live instead of only at exit. Overwrites in place on an interactive
+    terminal; writes one line per update when redirected to a log file.
+    """
+    if sys.stdout.isatty():
+        print(f"\r{msg}", end="", flush=True)
+    else:
+        print(msg, flush=True)
+
+
 def hierarchical_path(base_dir, gbif_id, suffix=""):
     """Mirror image_install_db.get_hierarchical_path (without makedirs)."""
     stem = str(gbif_id)
@@ -125,7 +137,7 @@ def ingest_multimedia(conn, multimedia_path):
         )
         conn.commit()
         inserted += len(rows)
-        print(f"    {inserted:,}/{len(df):,} image rows", end="\r")
+        progress(f"    {inserted:,}/{len(df):,} image rows")
     print(f"    {inserted:,} image rows inserted        ")
 
     print("  Inserting gbifID rows ...")
@@ -208,8 +220,8 @@ def flush(batch):
             if len(updates) >= LEGACY_BATCH:
                 flush(updates)
                 updates = []
-                print(f"    renamed={renamed:,} relabeled={relabeled:,} "
-                      f"missing={missing:,}", end="\r")
+                progress(f"    renamed={renamed:,} relabeled={relabeled:,} "
+                         f"missing={missing:,}")
     flush(updates)
     print(f"    renamed={renamed:,}  already-suffixed={relabeled:,}  "
           f"file-missing={missing:,}")
@@ -270,6 +282,10 @@ def connect(db_path, bulk_load):
 
 
 def main():
+    # Line-buffer stdout so progress appears in a batch job's .o log live,
+    # not only when the job finishes.
+    sys.stdout.reconfigure(line_buffering=True)
+
     args = parse_args()
     start = time.time()
 

From 26c416955ecc5fd24e9e15af493c86bd0f0eb80e Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 21 May 2026 16:50:32 -0400
Subject: [PATCH 08/19] add schema description for occurence.txt and
 multimedia.txt files

---
 docs/gbif-occurence-multimedia-txt-schema.md | 69 ++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 docs/gbif-occurence-multimedia-txt-schema.md

diff --git a/docs/gbif-occurence-multimedia-txt-schema.md b/docs/gbif-occurence-multimedia-txt-schema.md
new file mode 100644
index 0000000..5132e3f
--- /dev/null
+++ b/docs/gbif-occurence-multimedia-txt-schema.md
@@ -0,0 +1,69 @@
+In a Darwin Core Archive (DwC-A) downloaded from GBIF, the relationship between `occurrence.txt` and `multimedia.txt` follows a **star schema** layout.
+
+`occurrence.txt` acts as the **Core file** (the center of the star), while `multimedia.txt` acts as an **Extension file** linked back to the core.
+
+---
+
+### 1. The Linkage: How They Connect
+
+The two files are mapped together using a single relational key: **`gbifID`**.
+
+* **`occurrence.txt`**: Each row represents a unique biodiversity record and has a unique `gbifID`.
+* **`multimedia.txt`**: May contain zero, one, or multiple rows for a single `gbifID` (since one physical herbarium sheet might have multiple photos taken of it, or close-ups of its label).
+
+---
+
+### 2. Schema for `occurrence.txt` (The Core Metadata)
+
+This file contains the biological, geographical, and administrative data for the specimen. While it can feature over 200 columns of standardized Darwin Core terms, the most vital fields for a vascular plant herbaria project include:
+
+| Column Name | Description | Example Data |
+| --- | --- | --- |
+| **`gbifID`** | The unique numerical primary key assigned by GBIF. | `402391023` |
+| **`basisOfRecord`** | The physical nature of the record. For a herbarium download filtered as described above, every row has this value. | `PRESERVED_SPECIMEN` |
+| **`scientificName`** | The full, three-part or two-part taxon name with authorship. | *Quercus alba L.* |
+| **`taxonKey`** / **`speciesKey`** | Unique backbone taxonomic ID numbers used to group species regardless of spelling variations. | `2878688` |
+| **`institutionCode`** / **`collectionCode`** | Identifiers for the home museum or herbarium hosting the physical asset. | `NY` (New York Botanical Garden) |
+| **`catalogNumber`** | The barcode or physical filing number stamped on the sheet. | `NY00123456` |
+| **`recordedBy`** | The name of the original collector who found the plant. | `Asa Gray` |
+| **`eventDate`** | The ISO 8601 date the plant was harvested from the wild. | `1874-06-15` |
+| **`decimalLatitude`** / **`decimalLongitude`** | GPS/Coordinate mapping of where the specimen originally grew. | `42.3601`, `-71.0589` |
+
+---
+
+### 3. Schema for `multimedia.txt` (The Asset Ledger)
+
+This file is much narrower and strictly handles the digital representations of the specimen. It breaks down into media-specific fields:
+
+| Column Name | Description | Example Data |
+| --- | --- | --- |
+| **`gbifID`** | The foreign key pointing straight back to `occurrence.txt`. | `402391023` |
+| **`type`** | The type of media asset. For photos, this standard term is used. | `StillImage` |
+| **`format`** | The MIME type of the media file, which indicates its file format. | `image/jpeg` or `image/tiff` |
+| **`identifier`** | **The actual URL** where the high-resolution image asset is publicly hosted by the museum. | `https://sweetgum.nybg.org/images/v2/highres...jpg` |
+| **`references`** | A web URL directing to the museum’s interactive webpage for that specimen. | `https://word.nybg.org/detail.php?irn=4920` |
+| **`license`** | The text declaration or Creative Commons status of the photograph. | `CC BY 4.0` or `CC0` |
+| **`creator`** / **`rightsHolder`** | The photographer or the legal institution holding the copyright to the image. | `The New York Botanical Garden` |
+
+---
+
+### Practical Data Layout Example
+
+If a single white oak specimen (`gbifID: 101`) has a photo of the full sheet and a secondary close-up macro photo of its acorns, your files will structurally parse out like this:
+
+**`occurrence.txt`**
+
+```text
+gbifID   scientificName   basisOfRecord       institutionCode
+101      Quercus alba     PRESERVED_SPECIMEN  NY
+
+```
+
+**`multimedia.txt`**
+
+```text
+gbifID   type        format      identifier
+101      StillImage  image/jpeg  https://museum.org/specimen101_full.jpg
+101      StillImage  image/jpeg  https://museum.org/specimen101_acorn_zoom.jpg
+
+```

From 0197d99369d4fc3d2f9253c1a48f12f2472ab2a6 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Fri, 22 May 2026 12:17:21 -0400
Subject: [PATCH 09/19] document the location of the other processed_ids.txt
 file

---
 init_download_db.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/init_download_db.sh b/init_download_db.sh
index 1fde26b..f42c1bb 100755
--- a/init_download_db.sh
+++ b/init_download_db.sh
@@ -19,5 +19,8 @@ conda activate spring-2026-pyt
 python init_download_db.py --legacy-only \
     --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt
 
+# The other big initial run is tracked in
+# /projectnb/herbdl/workspaces/tsehou26/herbarium_project/utils/processed_ids.txt and .../failed_ids.txt
+
 ### The command below is used to submit the job to the cluster
 ### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh

From 2032781b49b0dca204cd4494d700a8d8097b098f Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Fri, 22 May 2026 12:35:26 -0400
Subject: [PATCH 10/19] dedupe images, capture non-image responses, keep raw
 DNG files

Rework the SQLite download tracking to count and download one file per
distinct image instead of one per multimedia.txt row.

- download_db: images table is now one row per distinct image
  (image_no, image_key, packed candidate urls). canonical_image_key()
  collapses a IIIF manifest and the resolution variants of one specimen
  photo to a single key.
- init_download_db: ingest groups multimedia rows into distinct images,
  ordering candidate URLs highest-resolution-first.
- image_install_db: downloads one file per distinct image; detects
  HTML/text responses (Content-Type header + body sniff) and captures
  the page text into error_detail for follow-up; keeps undecodable
  camera-raw DNG as <id>-NN.dng flagged raw_unprocessed instead of
  discarding it.
- status_report: distinct-image counts, non-image-responses-by-host
  section, raw_unprocessed count.
- init_download_db.sh: take the build mode (--reset / --legacy-only)
  from the qsub command line.

The images table schema changed, so this requires a --reset rebuild.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 DEPLOYMENT.md       |  34 +++-
 README.md           |  20 ++-
 download_db.py      | 101 ++++++++----
 image_install_db.py | 377 ++++++++++++++++++++++++++++++--------------
 init_download_db.py |  88 ++++++++---
 init_download_db.sh |  12 +-
 status_report.py    |  49 +++++-
 7 files changed, 482 insertions(+), 199 deletions(-)

diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md
index d7ad4c8..f9fce1c 100644
--- a/DEPLOYMENT.md
+++ b/DEPLOYMENT.md
@@ -14,9 +14,12 @@ environment.
 
 | Before | After |
 |---|---|
-| Progress in `processed_ids.txt` / `failed_ids.txt` (ID only, no reason) | Progress in `download_status.db` — every URL's outcome and *why* it failed |
+| Progress in `processed_ids.txt` / `failed_ids.txt` (ID only, no reason) | Progress in `download_status.db` — every image's outcome and *why* it failed |
 | `multimedia.txt` re-read and re-grouped with pandas every run | Ingested into the DB once; later runs query the work queue |
+| One file per multimedia row (a manifest + its resolution variants → 3 files) | One file per **distinct image** — IIIF manifest/resolution variants deduplicated |
 | Failed IDs all retried blindly (or skipped) | Only transient failures retried (timeout/rate-limit/5xx/dropped connection), capped at 4 attempts |
+| Camera-raw DNG silently discarded | DNG kept as `<id>-NN.dng`, flagged `raw_unprocessed` for a later conversion pass |
+| HTML "download not supported" pages saved as junk `.jpg` | Detected; the page text is captured in `error_detail` for follow-up |
 | `analyze_image_progress.py` (slow, loads ~180 MB of text) | `status_report.py` (instant SQL queries) |
 | ~1.4 GB run logs, ~134 MB warning spam | `WARNING`-level log only; warning spam suppressed |
 
@@ -49,6 +52,11 @@ never committed:
 This step ingests `multimedia.txt`, imports already-completed downloads from
 `processed_ids.txt`, and renames legacy `<id>.jpg` files to `<id>-00.jpg`.
 
+> **If a `download_status.db` already exists from before the distinct-image
+> change**, it has the old schema and must be rebuilt — run the builder with
+> `--reset`. Files already on disk are detected and re-used, so this re-discovers
+> existing progress; it does not re-download anything.
+
 It is heavy — it reads the ~59M-row `multimedia.txt` with pandas and renames up
 to ~13.5M files. **Run it as a batch job, not on a login node.**
 
@@ -146,8 +154,17 @@ sqlite3 /projectnb/herbdl/data/GBIF-F25h/download_status.db
 SELECT error_type, COUNT(*) FROM images
 WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
 
--- URLs still worth retrying
-SELECT gbif_id, url FROM images WHERE status='failed_transient' LIMIT 50;
+-- images still worth retrying
+SELECT gbif_id, image_no, urls FROM images
+WHERE status='failed_transient' LIMIT 50;
+
+-- one sample captured message per host that returned an HTML/text page
+SELECT host, error_detail FROM images
+WHERE error_type='invalid_content_type' GROUP BY host;
+
+-- raw files (DNG etc.) kept for a later conversion pass
+SELECT gbif_id, image_no, file_path FROM images
+WHERE error_type='raw_unprocessed';
 
 -- hosts currently in cooldown
 SELECT host, datetime(blocked_until,'unixepoch') FROM hosts
@@ -187,6 +204,17 @@ UPDATE gbif_ids SET status='partial' WHERE status='failed';
   The old downloader shuffled URLs, so the exact source URL is unknown. This is
   exact for the ~87% of gbifIDs that have only one image; for the rest it
   affects only metadata, not the image files.
+- **Non-IIIF duplicates are not deduplicated.** Two distinct non-IIIF URLs on
+  one gbifID stay separate images — metadata cannot tell whether they are the
+  same photo at different sizes. Only content hashing could, and that is not
+  done here.
+- **Raw DNG files are kept, not converted.** A camera-raw DNG is saved as
+  `<id>-NN.dng` with `error_type='raw_unprocessed'` (the row still counts as
+  `success`). Converting them to JPEG-1024 is a later pass and needs a raw
+  decoder (`rawpy`) added to the environment. Query them with
+  `WHERE error_type='raw_unprocessed'`.
+- **HTML/non-image responses** are recorded `invalid_content_type` with the page
+  text captured in `error_detail`; `status_report.py` lists them by host.
 - **Database size.** Expect ~10–15 GB. It sits in the data directory, not the
   repo. Ensure the `herbdl` project has the space.
 - **Single job at a time.** SQLite (WAL mode) is fine for one job with 5 worker
diff --git a/README.md b/README.md
index d01c343..fbee679 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,15 @@ This directory contains utility scripts for managing herbarium specimen images,
 ### Image Download & Installation
 
 #### `image_install_db.py`
-**Purpose**: Current script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. Downloads **all** images per gbifID, each saved as `<gbifID>-NN.jpg`, with status tracked in a SQLite database. See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure.
+**Purpose**: Current script for downloading herbarium specimen images from GBIF (Global Biodiversity Information Facility) multimedia datasets. Downloads one file per **distinct image** of each gbifID, saved as `<gbifID>-NN.jpg`, with status tracked in a SQLite database. See [DEPLOYMENT.md](DEPLOYMENT.md) for the full procedure.
 
 **Key Features**:
 - Parallel downloading with ThreadPoolExecutor (5 workers)
 - Host-based rate limiting and circuit breaker pattern
-- IIIF (International Image Interoperability Framework) manifest support — one file saved per source URL, highest resolution first
-- Automatic image resizing to 1024px max dimension
+- IIIF manifest support, with deduplication — a manifest plus the resolution variants of one specimen photo count as **one** image, so one file is saved, not three
+- Automatic image resizing to 1024px max-dimension JPEG
+- TIFF/PNG decoded and saved as JPEG; camera-raw **DNG kept as-is** (`<gbifID>-NN.dng`, flagged `raw_unprocessed`) rather than discarded
+- HTML/text responses (e.g. "direct download no longer supported") detected and the page text captured for follow-up
 - Atomic downloads (stream to `.tmp`, length-check, then rename) so a dropped connection never leaves a corrupt file
 - SQLite status database for resumable downloads and queryable, classified error tracking — see [`download_db.py`](download_db.py)
 - Hierarchical directory organization (3-digit prefix structure)
@@ -85,20 +87,22 @@ qsub -N image_install -l h_rt=48:00:00 -pe omp 16 -P herbdl -m beas -M your_emai
 #### `download_db.py`
 **Purpose**: SQLite-backed download-status tracking. Imported by the other download scripts — not run directly.
 
-**Why it exists**: replaces the flat `processed_ids.txt` / `failed_ids.txt` files, which recorded only an ID with no reason for failure. The database records, per image URL, whether it succeeded or failed and *why*, so failures are queryable and only transient ones get retried.
+**Why it exists**: replaces the flat `processed_ids.txt` / `failed_ids.txt` files, which recorded only an ID with no reason for failure. The database records, per distinct image, whether it succeeded or failed and *why*, so failures are queryable and only transient ones get retried.
 
 **Tables**:
-- `images` — one row per source image URL: `status`, `http_status`, `error_type`, `error_detail`, `file_path`, `file_size`, `attempts`
-- `gbif_ids` — one row per gbifID; the resumable work queue (`pending` / `partial` / `done` / `failed`)
+- `images` — one row per **distinct image** (the download unit): `image_no`, `image_key` (canonical identity), `urls` (candidate URLs), `status`, `http_status`, `error_type`, `error_detail`, `file_path`, `file_size`, `attempts`
+- `gbif_ids` — one row per gbifID; the resumable work queue (`pending` / `partial` / `done` / `failed`); `n_images` is the distinct-image count
 - `hosts` — per-host error tally and cooldown, so circuit-breaker state survives a restart
 
+`canonical_image_key()` collapses a IIIF manifest and the resolution variants of one specimen photo to a single key, so multiple `multimedia.txt` rows become one `images` row.
+
 #### `init_download_db.py`
 **Purpose**: One-time builder for the status database.
 
 **What it does**:
 1. Creates the schema
-2. Ingests `multimedia.txt` into `images` + `gbif_ids` (so later runs never re-read the 59M-row file)
-3. Imports `processed_ids.txt`: renames legacy `<id>.jpg` files to `<id>-00.jpg` for a consistent naming scheme and marks them done. Multi-image gbifIDs are left `partial` so the downloader fetches their remaining images. (`failed_ids.txt` is **not** imported — those IDs get a fresh, tracked retry.)
+2. Reads `multimedia.txt` once, groups its rows into **distinct images** (a manifest + resolution variants of one photo collapse to one), and loads `images` + `gbif_ids` — so later runs never re-read the 59M-row file
+3. Imports `processed_ids.txt`: renames legacy `<id>.jpg` files to `<id>-00.jpg` for a consistent naming scheme and marks image 0 done. Multi-image gbifIDs are left `partial` so the downloader fetches their remaining images. (`failed_ids.txt` is **not** imported — those IDs get a fresh, tracked retry.)
 
 **Usage**:
 ```bash
diff --git a/download_db.py b/download_db.py
index 3540a93..c0358f3 100644
--- a/download_db.py
+++ b/download_db.py
@@ -2,25 +2,29 @@
 SQLite-backed download-status tracking for image_install_db.py.
 
 Replaces the flat processed_ids.txt / failed_ids.txt checkpoint files with a
-queryable database that records, for every image URL, whether it succeeded or
-failed and *why*. That makes it possible to:
-  * resume a run without re-reading and re-grouping the 59M-row multimedia.txt,
-  * retry only transient failures (timeouts, rate limits, 5xx, dropped
-    connections) while leaving permanent ones (404/410/etc.) alone,
-  * answer questions like "how many 404s?" or "which hosts fail most?" with a
-    single SQL query (see status_report.py).
+queryable database that records, for every *distinct image*, whether it was
+downloaded and -- when it failed -- why.
+
+Distinct images vs. multimedia rows
+-----------------------------------
+A gbifID often has several rows in GBIF's multimedia.txt that all point at the
+SAME photo: a IIIF manifest plus the 300px / 1600px renderings of one specimen.
+canonical_image_key() collapses those to a single key, so one row in the
+`images` table = one distinct image = one downloaded file. Non-IIIF URLs key to
+themselves (metadata cannot tell whether two opaque URLs are the same photo --
+that needs content hashing, which this layer does not do).
 
 Tables
 ------
-images    one row per source image URL (a GBIF "identifier").
-gbif_ids  one row per gbifID; doubles as the resumable work queue.
-hosts     per-host error tally + cooldown timestamp, so circuit-breaker and
-          rate-limit state survive a job restart.
+images    one row per distinct image; the download/work unit.
+gbif_ids  one row per gbifID; the resumable work queue.
+hosts     per-host error tally + cooldown, surviving a restart.
 
-A gbifID is 'done' only when every one of its images has status 'success'.
+A gbifID is 'done' only when every one of its distinct images has succeeded.
 """
 
 import os
+import re
 import time
 import sqlite3
 import threading
@@ -32,28 +36,32 @@
 
 # ---- images.status -----------------------------------------------------------
 ST_PENDING = "pending"            # never attempted
-ST_SUCCESS = "success"            # downloaded (and resized) OK
+ST_SUCCESS = "success"            # image obtained (resized JPEG, or kept raw)
 ST_FAILED_PERMANENT = "failed_permanent"   # retrying will not help
 ST_FAILED_TRANSIENT = "failed_transient"   # may succeed on a later run
 
 # ---- gbif_ids.status ---------------------------------------------------------
 G_PENDING = "pending"             # no image attempted yet
 G_PARTIAL = "partial"             # some work still possible (in the work queue)
-G_DONE = "done"                   # every image succeeded
+G_DONE = "done"                   # every distinct image succeeded
 G_FAILED = "failed"               # all images terminal, not all succeeded
 
-# ---- error_type values -------------------------------------------------------
+# ---- error_type values (on failure rows) ------------------------------------
 ERR_RATE_LIMITED = "rate_limited"          # HTTP 429
 ERR_TIMEOUT = "timeout"                    # connect/read timeout, HTTP 408
 ERR_SERVER = "server_error"                # HTTP 5xx
 ERR_CONNECTION = "connection_broken"       # dropped connection / IncompleteRead
 ERR_TRUNCATED = "truncated"                # download shorter than Content-Length
 ERR_MANIFEST = "manifest_error"            # IIIF manifest could not be parsed
-ERR_INVALID_CONTENT = "invalid_content_type"   # server returned HTML/XML/text
-ERR_NOT_IMAGE = "not_an_image"             # bytes downloaded but not decodable
-ERR_NO_URL = "no_url"                      # no usable URL for this identifier
+ERR_INVALID_CONTENT = "invalid_content_type"   # URL returned HTML/text, not an image
+ERR_NOT_IMAGE = "not_an_image"             # bytes downloaded but undecodable junk
+ERR_NO_URL = "no_url"                      # no usable URL for this image
 ERR_OTHER = "other"                        # anything uncategorised
-ERR_LEGACY = "legacy_unverified_index"     # marker on imported processed_ids.txt
+
+# ---- flags carried on status='success' rows (not failures) ------------------
+ERR_LEGACY = "legacy_unverified_index"     # imported from processed_ids.txt
+ERR_RAW_UNPROCESSED = "raw_unprocessed"    # kept as a raw file (e.g. DNG); needs
+                                           # a later conversion pass to JPEG
 
 # Everything not in this set is treated as permanent (e.g. any "http_4xx").
 TRANSIENT_ERRORS = {
@@ -83,27 +91,56 @@ def status_for_error(error_type):
     return ST_FAILED_PERMANENT if is_permanent(error_type) else ST_FAILED_TRANSIENT
 
 
+# ---- canonical image identity -----------------------------------------------
+
+_IIIF_MANIFEST = re.compile(r"/manifest(?:\.json)?$", re.IGNORECASE)
+_IIIF_IMAGE_TAIL = re.compile(
+    r"/[^/]+/[^/]+/[-+0-9.!]+/(?:default|color|gray|bitonal)\.[A-Za-z0-9]+$",
+    re.IGNORECASE,
+)
+
+
+def canonical_image_key(url):
+    """
+    Return a canonical identity for the image a URL points at.
+
+    A IIIF Presentation manifest (".../E00699064/manifest") and every IIIF
+    Image-API rendering (".../E00699064/full/1600,/0/default.jpg") of one
+    specimen collapse to the same key -- the IIIF identifier ".../E00699064".
+    Non-IIIF URLs key to themselves.
+    """
+    u = (url or "").strip()
+    stripped = _IIIF_MANIFEST.sub("", u)
+    if stripped != u:
+        return stripped
+    stripped = _IIIF_IMAGE_TAIL.sub("", u)
+    if stripped != u:
+        return stripped
+    return u
+
+
 # ---- schema ------------------------------------------------------------------
 
 _TABLES = [
     """CREATE TABLE IF NOT EXISTS images (
         gbif_id         INTEGER NOT NULL,
-        img_index       INTEGER NOT NULL,   -- position in this ID's URL list
-        url             TEXT    NOT NULL,
+        image_no        INTEGER NOT NULL,   -- distinct-image ordinal in the gbifID
+        image_key       TEXT    NOT NULL,   -- canonical identity (debug/transparency)
+        urls            TEXT    NOT NULL,   -- newline-joined candidate URLs, best first
         host            TEXT,
         status          TEXT    NOT NULL DEFAULT 'pending',
         http_status     INTEGER,
         error_type      TEXT,
-        error_detail    TEXT,               -- truncated message, for debugging
+        error_detail    TEXT,               -- truncated message / captured page text
         file_path       TEXT,
-        file_size       INTEGER,            -- bytes on disk after resize
+        file_size       INTEGER,            -- bytes on disk
         attempts        INTEGER NOT NULL DEFAULT 0,
         last_attempt_at TEXT,
-        PRIMARY KEY (gbif_id, img_index)
+        PRIMARY KEY (gbif_id, image_no)
     )""",
     """CREATE TABLE IF NOT EXISTS gbif_ids (
         gbif_id      INTEGER PRIMARY KEY,
-        n_images     INTEGER NOT NULL DEFAULT 0,
+        n_images     INTEGER NOT NULL DEFAULT 0,   -- distinct images
         n_success    INTEGER NOT NULL DEFAULT 0,
         status       TEXT    NOT NULL DEFAULT 'pending',
         completed_at TEXT
@@ -181,23 +218,23 @@ def get_work_gbif_ids(self):
             return [row[0] for row in cur.fetchall()]
 
     def get_images_for(self, gbif_id):
-        """Return (img_index, url, host, status, attempts) rows for one gbifID."""
+        """Return (image_no, image_key, urls, host, status, attempts) per image."""
         with self.lock:
             cur = self.conn.execute(
-                "SELECT img_index, url, host, status, attempts "
-                "FROM images WHERE gbif_id=? ORDER BY img_index",
+                "SELECT image_no, image_key, urls, host, status, attempts "
+                "FROM images WHERE gbif_id=? ORDER BY image_no",
                 (gbif_id,),
             )
             return cur.fetchall()
 
     # -- recording results -----------------------------------------------------
 
-    def record_image_result(self, gbif_id, img_index, status, *, host=None,
+    def record_image_result(self, gbif_id, image_no, status, *, host=None,
                              http_status=None, error_type=None, error_detail=None,
                              file_path=None, file_size=None,
                              increment_attempts=True):
         """Write the outcome of one image attempt into the images table."""
-        detail = (error_detail or "")[:500] or None
+        detail = (error_detail or "")[:2000] or None
         delta = 1 if increment_attempts else 0
         with self.lock:
             self.conn.execute(
@@ -205,9 +242,9 @@ def record_image_result(self, gbif_id, img_index, status, *, host=None,
                 "  status=?, host=COALESCE(?, host), http_status=?, "
                 "  error_type=?, error_detail=?, file_path=?, file_size=?, "
                 "  attempts=attempts+?, last_attempt_at=datetime('now') "
-                "WHERE gbif_id=? AND img_index=?",
+                "WHERE gbif_id=? AND image_no=?",
                 (status, host, http_status, error_type, detail, file_path,
-                 file_size, delta, gbif_id, img_index),
+                 file_size, delta, gbif_id, image_no),
             )
             self.conn.commit()
 
diff --git a/image_install_db.py b/image_install_db.py
index a2fe443..0603b72 100644
--- a/image_install_db.py
+++ b/image_install_db.py
@@ -2,27 +2,38 @@
 Image install script: download herbarium specimen images from a GBIF
 multimedia.txt file.
 
-Downloads ALL images for each gbifID. Each source URL (a GBIF "identifier") is
-saved as one file with an index suffix: <gbifID>-00.jpg, <gbifID>-01.jpg, ...
-A gbifID is marked 'done' only once every one of its images has succeeded.
+Downloads one file per *distinct image* of each gbifID. A IIIF manifest and the
+resolution variants of one specimen photo are treated as a single image (see
+download_db.canonical_image_key) -- so a record listed in multimedia.txt as
+"manifest + 300px + 1600px" produces ONE file, not three. Files are named
+<gbifID>-00.jpg, <gbifID>-01.jpg, ... A gbifID is 'done' only once every one of
+its distinct images has succeeded.
 
 Status tracking
 ---------------
 Per-image and per-gbifID status lives in a SQLite database (download_status.db,
-see download_db.py) instead of the old processed_ids.txt / failed_ids.txt flat
-files. Build the database once with init_download_db.py before the first run.
+see download_db.py). Build it once with init_download_db.py before the first run.
 
 The database lets the script:
   * resume without re-reading the 59M-row multimedia.txt every run,
   * retry only transient failures (timeout / rate-limit / 5xx / dropped
     connection), capped at MAX_ATTEMPTS, and never re-hammer permanent 404s,
-  * record *why* each download failed so failures are queryable afterwards
-    (see status_report.py).
+  * record *why* each download failed so failures are queryable afterwards.
+
+Non-JPEG handling
+-----------------
+TIFF/PNG/etc. are decoded by Pillow and saved as resized JPEG like everything
+else. A file Pillow cannot decode but that is a real image format (camera-raw
+DNG) is kept as-is (<gbifID>-NN.dng) and flagged 'raw_unprocessed' for a later
+conversion pass -- it is not discarded. A URL that returns an HTML/text page
+(e.g. "direct download no longer supported") is recorded as
+'invalid_content_type' with the page text captured for follow-up.
 
 Accurate as of May 2026.
 """
 
 import os
+import re
 import sys
 import time
 import random
@@ -46,8 +57,7 @@
 from download_db import DownloadDB
 
 # verify=False is needed because many herbarium hosts have broken TLS certs.
-# Suppress the resulting per-request warning so it does not flood the .e log
-# (it previously produced ~134 MB of InsecureRequestWarning spam per run).
+# Suppress the resulting per-request warning so it does not flood the .e log.
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 # ---- configuration -----------------------------------------------------------
@@ -63,6 +73,10 @@
 HOST_COOLDOWN_TIMEOUT = 60 * 60
 HOST_ERROR_THRESHOLD = 500   # circuit breaker: skip a host after this many errors
 
+# Extensions under which an undecodable-but-real image is kept for later.
+RAW_EXTS = (".dng", ".nef", ".cr2", ".cr3", ".arw", ".raf", ".orf", ".rw2",
+            ".tif", ".raw")
+
 # ---- in-memory host circuit-breaker state (seeded from / saved to the DB) ----
 
 host_block_until = {}
@@ -107,18 +121,23 @@ def progress(msg):
 
 # ---- paths -------------------------------------------------------------------
 
-def get_hierarchical_path(base_dir, gbif_id, suffix, ext=".jpg"):
+def image_path(gbif_id, image_no, ext):
     """
-    Build a hierarchical storage path to avoid millions of files in one dir.
-    suffix: image index suffix, e.g. '-00', '-01'.
-    Example: gbifID=1057161997, suffix='-00' -> <base>/105/716/1057161997-00.jpg
+    Storage path for one image: <base>/<p1>/<p2>/<gbifID>-NN<ext> (no mkdir).
+    p1 = first 3 digits of the gbifID, p2 = digits 4-6.
     """
     stem = str(gbif_id)
     prefix1 = stem[:3] if len(stem) >= 3 else stem
     prefix2 = stem[3:6] if len(stem) >= 6 else "000"
-    dest_dir = os.path.join(base_dir, prefix1, prefix2)
-    os.makedirs(dest_dir, exist_ok=True)
-    return os.path.join(dest_dir, f"{stem}{suffix}{ext}")
+    return os.path.join(INSTALL_PATH, prefix1, prefix2,
+                        f"{stem}-{image_no:02d}{ext}")
+
+
+def _rm(path):
+    try:
+        os.remove(path)
+    except OSError:
+        pass
 
 
 # ---- host circuit breaker / cooldown -----------------------------------------
@@ -181,13 +200,17 @@ def block_host(url, retry_after=None, timeout_issue=False):
 
 # ---- IIIF manifests ----------------------------------------------------------
 
+def is_manifest_url(url):
+    low = url.lower()
+    return "/manifest" in low or low.endswith(".json")
+
+
 def extract_image_from_iiif_manifest(manifest_url, gbif_id):
     """
     Fetch a IIIF manifest and return (image_urls, error_type).
 
     image_urls is an ordered list of direct image URLs (highest resolution
-    first). On failure image_urls is empty and error_type explains why, so the
-    caller can decide whether the manifest is worth retrying.
+    first). On failure image_urls is empty and error_type explains why.
     """
     try:
         response = session.get(
@@ -233,30 +256,95 @@ def extract_image_from_iiif_manifest(manifest_url, gbif_id):
         return [], ddb.ERR_MANIFEST
 
 
-# ---- downloading -------------------------------------------------------------
+# ---- non-image response detection -------------------------------------------
 
-def _rm(path):
+_NON_IMAGE_CTYPES = ("text/html", "text/plain", "text/xml",
+                     "application/xml", "application/json", "ld+json")
+
+
+def _is_non_image_ctype(ctype):
+    return bool(ctype) and any(t in ctype for t in _NON_IMAGE_CTYPES)
+
+
+def _looks_like_text(data):
+    """Heuristic: do the first bytes look like an HTML / XML / JSON document?"""
+    if not data:
+        return False
+    head = data.lstrip()[:64].lower()
+    return head.startswith((b"<!doctype", b"<html", b"<head", b"<body",
+                            b"<?xml", b"{", b"["))
+
+
+def _read_bounded(resp, limit=16384):
+    """Read at most `limit` bytes of a streamed response body."""
+    raw = b""
+    for chunk in resp.iter_content(chunk_size=8192):
+        raw += chunk
+        if len(raw) >= limit:
+            break
+    return raw[:limit]
+
+
+def _join_bounded(stream, limit=16384):
+    """Drain at most `limit` bytes from an iter_content generator."""
+    raw = b""
+    for chunk in stream:
+        raw += chunk
+        if len(raw) >= limit:
+            break
+    return raw[:limit]
+
+
+def _html_to_text(raw):
+    """Strip an HTML/text body down to readable text for capture in the DB."""
     try:
-        os.remove(path)
-    except OSError:
-        pass
+        text = raw.decode("utf-8", errors="replace")
+    except Exception:
+        text = str(raw)
+    text = re.sub(r"(?is)<(script|style)[^>]*>.*?</\1>", " ", text)
+    text = re.sub(r"(?s)<[^>]+>", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text[:1800]
 
 
-def download_one_url(gbif_id, image_url, local_path):
+def raw_keep_extension(content_type, url):
     """
-    Download a single URL to local_path, atomically.
+    Extension under which to keep an image Pillow could not decode, or None to
+    discard it. Camera-raw DNG is the main case -- kept for a later conversion
+    pass rather than lost.
+    """
+    ct = (content_type or "").lower()
+    low = (url or "").lower().split("?")[0]
+    if "dng" in ct or low.endswith(".dng"):
+        return ".dng"
+    for ext in (".nef", ".cr2", ".cr3", ".arw", ".raf", ".orf", ".rw2"):
+        if low.endswith(ext):
+            return ext
+    if "tiff" in ct or "tif" in ct or low.endswith((".tif", ".tiff")):
+        return ".tif"
+    if ct.startswith("image/"):
+        return ".raw"        # an image/* type Pillow cannot read -- keep it anyway
+    return None
+
 
-    Bytes are streamed to a .tmp file, length-checked against Content-Length,
-    then renamed into place -- so a dropped connection never leaves a corrupt
-    file behind. Returns a result dict with keys: ok, size, http_status,
-    error_type, error_detail, host.
+# ---- downloading -------------------------------------------------------------
+
+def download_one_url(gbif_id, image_url, tmp_path):
+    """
+    Download one URL to tmp_path. Returns a dict:
+      success -> {ok: True, host, http_status, content_type, size}
+      failure -> {ok: False, host, http_status, content_type, size,
+                  error_type, error_detail}
+
+    Detects HTML/text responses -- including ones disguised with an image
+    Content-Type -- and captures the page text into error_detail.
     """
     host = _host_from_url(image_url)
-    tmp_path = local_path + ".tmp"
 
-    def fail(error_type, detail, http_status=None):
-        return {"ok": False, "size": None, "http_status": http_status,
-                "error_type": error_type, "error_detail": detail, "host": host}
+    def fail(error_type, detail, http_status=None, content_type=None):
+        return {"ok": False, "host": host, "http_status": http_status,
+                "content_type": content_type, "size": None,
+                "error_type": error_type, "error_detail": detail}
 
     try:
         time.sleep(random.uniform(0.2, 0.8))
@@ -283,15 +371,36 @@ def fail(error_type, detail, http_status=None):
                 return fail(ddb.http_error_type(status), f"HTTP {status}", status)
 
             ctype = (resp.headers.get("Content-Type") or "").lower()
-            if ctype and any(bad in ctype for bad in
-                             ("text/html", "text/plain", "application/xml")):
+
+            # Content-Type clearly says this is not an image -> capture the text.
+            if _is_non_image_ctype(ctype):
                 increment_host_errors(image_url)
-                return fail(ddb.ERR_INVALID_CONTENT, f"Content-Type: {ctype}", status)
+                snippet = _html_to_text(_read_bounded(resp))
+                return fail(ddb.ERR_INVALID_CONTENT,
+                            f"[{ctype}] {snippet}", status, ctype)
+
+            # Sniff the first chunk: some hosts serve an HTML notice ("direct
+            # download no longer supported ...") with an image/* Content-Type.
+            stream = resp.iter_content(chunk_size=65536)
+            first = b""
+            for chunk in stream:
+                if chunk:
+                    first = chunk
+                    break
+            if _looks_like_text(first):
+                increment_host_errors(image_url)
+                snippet = _html_to_text(first + _join_bounded(stream))
+                return fail(ddb.ERR_INVALID_CONTENT,
+                            f"[non-image body, {ctype or 'no Content-Type'}] "
+                            f"{snippet}", status, ctype)
 
             expected = resp.headers.get("Content-Length")
             written = 0
             with open(tmp_path, "wb") as out:
-                for chunk in resp.iter_content(chunk_size=65536):
+                if first:
+                    out.write(first)
+                    written += len(first)
+                for chunk in stream:
                     if chunk:
                         out.write(chunk)
                         written += len(chunk)
@@ -302,17 +411,17 @@ def fail(error_type, detail, http_status=None):
                         _rm(tmp_path)
                         return fail(ddb.ERR_TRUNCATED,
                                     f"expected {expected} bytes, got {written}",
-                                    status)
+                                    status, ctype)
                 except ValueError:
                     pass
 
             if written < 1024:
                 _rm(tmp_path)
-                return fail(ddb.ERR_TRUNCATED, f"only {written} bytes", status)
+                return fail(ddb.ERR_TRUNCATED, f"only {written} bytes",
+                            status, ctype)
 
-            os.replace(tmp_path, local_path)
-            return {"ok": True, "size": written, "http_status": status,
-                    "error_type": None, "error_detail": None, "host": host}
+            return {"ok": True, "host": host, "http_status": status,
+                    "content_type": ctype, "size": written}
 
     except (ConnectTimeout, ReadTimeout, Timeout) as e:
         _rm(tmp_path)
@@ -330,79 +439,99 @@ def fail(error_type, detail, http_status=None):
         return fail(ddb.ERR_OTHER, str(e))
 
 
-def resize_image(gbif_id, local_path):
-    changed, new_size = resize_with_aspect_ratio(
-        local_path, local_path, max_size=1024, format="JPEG", quality=85)
-    if changed:
-        logger.info(f"Resized {gbif_id} to {new_size} at {local_path}")
+def _finalize_download(gbif_id, image_no, image_url, res, tmp_path):
+    """Turn a downloaded temp file into the final image, or keep it raw."""
+    try:
+        # Decode + resize as a normal image (JPEG/TIFF/PNG/...), in place.
+        resize_with_aspect_ratio(tmp_path, tmp_path, max_size=1024,
+                                 format="JPEG", quality=85)
+    except (OSError, UnidentifiedImageError) as e:
+        # Pillow cannot decode it. If it is a real image format (DNG etc.),
+        # keep the raw file for a later conversion pass; otherwise discard.
+        ext = raw_keep_extension(res["content_type"], image_url)
+        if ext:
+            raw_path = image_path(gbif_id, image_no, ext)
+            os.replace(tmp_path, raw_path)
+            try:
+                size = os.path.getsize(raw_path)
+            except OSError:
+                size = res["size"]
+            logger.warning(f"Kept raw image {gbif_id} #{image_no} "
+                           f"({res['content_type']}) at {raw_path}")
+            return {"outcome": "success", "db_status": ddb.ST_SUCCESS,
+                    "http_status": 200, "error_type": ddb.ERR_RAW_UNPROCESSED,
+                    "error_detail": f"kept raw: {res['content_type'] or 'unknown'}",
+                    "host": res["host"], "file_path": raw_path, "file_size": size}
+        _rm(tmp_path)
+        return {"outcome": "failed", "db_status": ddb.ST_FAILED_PERMANENT,
+                "http_status": 200, "error_type": ddb.ERR_NOT_IMAGE,
+                "error_detail": str(e), "host": res["host"],
+                "file_path": None, "file_size": None}
+
+    jpg_path = image_path(gbif_id, image_no, ".jpg")
+    os.replace(tmp_path, jpg_path)
+    try:
+        size = os.path.getsize(jpg_path)
+    except OSError:
+        size = res["size"]
+    return {"outcome": "success", "db_status": ddb.ST_SUCCESS, "http_status": 200,
+            "error_type": None, "error_detail": None, "host": res["host"],
+            "file_path": jpg_path, "file_size": size}
 
 
-def resolve_and_download(gbif_id, identifier_url, local_path):
+def resolve_and_download(gbif_id, image_no, candidate_urls):
     """
-    Download the image for one source identifier (one img_index) and save it as
-    exactly one file at local_path.
-
-    For a plain URL there is one candidate. For a IIIF manifest the manifest is
-    expanded into resolution variants and tried highest-first; the first success
-    wins, so still only one file is saved per identifier.
+    Fetch one distinct image and save it as exactly one file.
 
-    Returns a result dict with keys: outcome ('success' | 'failed' |
-    'deferred'), db_status, http_status, error_type, error_detail, host,
-    file_size. 'deferred' means every candidate host was blocked/circuit-broken,
-    so the image was not really attempted and should stay 'pending'.
+    candidate_urls are this image's URLs from the database, best-resolution
+    first. IIIF manifests among them are expanded into image URLs. The first
+    URL that yields a usable image wins. Returns an outcome dict; outcome is
+    'success', 'failed', or 'deferred' (every candidate host was blocked, so
+    the image was not really attempted and should stay 'pending').
     """
-    if "/manifest" in identifier_url or identifier_url.endswith(".json"):
-        candidates, manifest_err = extract_image_from_iiif_manifest(
-            identifier_url, gbif_id)
-        if not candidates:
-            return {"outcome": "failed",
-                    "db_status": ddb.status_for_error(manifest_err),
-                    "http_status": None, "error_type": manifest_err,
-                    "error_detail": "IIIF manifest yielded no image URLs",
-                    "host": _host_from_url(identifier_url), "file_size": None}
-    else:
-        candidates = [identifier_url]
+    resolved, manifest_err = [], None
+    for url in candidate_urls:
+        if is_manifest_url(url):
+            extracted, err = extract_image_from_iiif_manifest(url, gbif_id)
+            if extracted:
+                resolved.extend(extracted)
+            elif err:
+                manifest_err = err
+        else:
+            resolved.append(url)
 
-    # Deduplicate while preserving the highest-resolution-first order.
     seen, ordered = set(), []
-    for url in candidates:
+    for url in resolved:
         if url not in seen:
             seen.add(url)
             ordered.append(url)
 
-    failures = []
-    attempted_any = False
+    if not ordered:
+        et = manifest_err or ddb.ERR_NO_URL
+        return {"outcome": "failed", "db_status": ddb.status_for_error(et),
+                "http_status": None, "error_type": et,
+                "error_detail": "no downloadable image URL for this image",
+                "host": None, "file_path": None, "file_size": None}
+
+    tmp_path = image_path(gbif_id, image_no, ".tmp")
+    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
+
+    failures, attempted = [], False
     for url in ordered:
         if is_host_circuit_broken(url) or is_host_blocked(url):
             continue
-        attempted_any = True
-        result = download_one_url(gbif_id, url, local_path)
-        if result["ok"]:
-            try:
-                resize_image(gbif_id, local_path)
-            except (OSError, UnidentifiedImageError) as e:
-                _rm(local_path)
-                return {"outcome": "failed", "db_status": ddb.ST_FAILED_PERMANENT,
-                        "http_status": result["http_status"],
-                        "error_type": ddb.ERR_NOT_IMAGE,
-                        "error_detail": str(e), "host": result["host"],
-                        "file_size": None}
-            try:
-                size = os.path.getsize(local_path)
-            except OSError:
-                size = result["size"]
-            return {"outcome": "success", "db_status": ddb.ST_SUCCESS,
-                    "http_status": 200, "error_type": None,
-                    "error_detail": None, "host": result["host"],
-                    "file_size": size}
-        failures.append(result)
+        attempted = True
+        res = download_one_url(gbif_id, url, tmp_path)
+        if res["ok"]:
+            return _finalize_download(gbif_id, image_no, url, res, tmp_path)
+        failures.append(res)
 
-    if not attempted_any:
+    if not attempted:
         # Every candidate's host was blocked -- leave the image 'pending'.
         return {"outcome": "deferred"}
 
     # Prefer a transient failure as the recorded reason: if any candidate could
-    # still succeed later, the whole identifier is worth retrying.
+    # still succeed later, the whole image is worth retrying.
     transient = [f for f in failures if not ddb.is_permanent(f["error_type"])]
     chosen = transient[0] if transient else failures[0]
     db_status = ddb.ST_FAILED_TRANSIENT if transient else ddb.ST_FAILED_PERMANENT
@@ -410,17 +539,30 @@ def resolve_and_download(gbif_id, identifier_url, local_path):
             "http_status": chosen["http_status"],
             "error_type": chosen["error_type"],
             "error_detail": chosen["error_detail"],
-            "host": chosen["host"], "file_size": None}
+            "host": chosen["host"], "file_path": None, "file_size": None}
 
 
 # ---- per-gbifID processing ---------------------------------------------------
 
+def _existing_file(gbif_id, image_no):
+    """Return (path, is_raw) if a valid file is already on disk, else None."""
+    for ext in (".jpg",) + RAW_EXTS:
+        path = image_path(gbif_id, image_no, ext)
+        if os.path.exists(path):
+            try:
+                if get_file_size_in_mb(path) >= MIN_IMAGE_MB:
+                    return path, (ext != ".jpg")
+            except OSError:
+                pass
+    return None
+
+
 def process_id(db, gbif_id, total_to_install):
-    """Download every not-yet-done image for one gbifID and update the DB."""
+    """Download every not-yet-done distinct image for one gbifID."""
     global n_installed
     images = db.get_images_for(gbif_id)
 
-    for img_index, url, _host, status, attempts in images:
+    for image_no, image_key, urls_str, host, status, attempts in images:
         # Skip images that are already finished or have exhausted their retries.
         if status == ddb.ST_SUCCESS:
             continue
@@ -429,33 +571,33 @@ def process_id(db, gbif_id, total_to_install):
         if status == ddb.ST_FAILED_TRANSIENT and attempts >= db.max_attempts:
             continue
 
-        suffix = f"-{img_index:02d}"
-        local_path = get_hierarchical_path(INSTALL_PATH, gbif_id, suffix)
-
-        # If a valid file is already on disk, record it without downloading.
-        if os.path.exists(local_path):
+        # If a valid file is already on disk (a previous run, or the legacy
+        # import), record it without downloading.
+        existing = _existing_file(gbif_id, image_no)
+        if existing:
+            path, is_raw = existing
             try:
-                size_mb = get_file_size_in_mb(local_path)
+                size = os.path.getsize(path)
             except OSError:
-                size_mb = 0.0
-            if size_mb >= MIN_IMAGE_MB:
-                db.record_image_result(
-                    gbif_id, img_index, ddb.ST_SUCCESS,
-                    host=_host_from_url(url), http_status=200,
-                    file_path=local_path, file_size=int(size_mb * 1024 * 1024),
-                    increment_attempts=False)
-                continue
+                size = None
+            db.record_image_result(
+                gbif_id, image_no, ddb.ST_SUCCESS, host=host, http_status=200,
+                error_type=ddb.ERR_RAW_UNPROCESSED if is_raw else None,
+                file_path=path, file_size=size, increment_attempts=False)
+            continue
 
-        result = resolve_and_download(gbif_id, url, local_path)
+        candidate_urls = [u for u in urls_str.split("\n") if u]
+        result = resolve_and_download(gbif_id, image_no, candidate_urls)
         if result["outcome"] == "deferred":
             continue  # host blocked; leave 'pending' for a later run
 
         db.record_image_result(
-            gbif_id, img_index, result["db_status"],
-            host=result.get("host"), http_status=result.get("http_status"),
+            gbif_id, image_no, result["db_status"],
+            host=result.get("host") or host,
+            http_status=result.get("http_status"),
             error_type=result.get("error_type"),
             error_detail=result.get("error_detail"),
-            file_path=local_path if result["outcome"] == "success" else None,
+            file_path=result.get("file_path"),
             file_size=result.get("file_size"))
 
         if result["outcome"] == "success":
@@ -525,6 +667,7 @@ def main():
     send_notification("Image Installation",
                       f"Starting run: {total_to_install} gbifIDs to process.")
 
+    counts = {}
     try:
         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             for start in range(0, total_to_install, WORK_CHUNK):
diff --git a/init_download_db.py b/init_download_db.py
index dac6214..899f980 100644
--- a/init_download_db.py
+++ b/init_download_db.py
@@ -5,27 +5,27 @@
 What it does
 ------------
 1. Creates the SQLite schema (see download_db.py).
-2. Reads multimedia.txt once and loads every (gbifID, image URL) pair into the
-   `images` table and every gbifID into `gbif_ids`. After this, runs of
-   image_install_db.py no longer need to re-read and re-group the 59M-row
+2. Reads multimedia.txt once and groups its rows into *distinct images*: a IIIF
+   manifest plus the resolution variants of one specimen collapse to a single
+   image (see download_db.canonical_image_key). Each distinct image becomes one
+   row in `images`, carrying all of its candidate URLs; `gbif_ids` gets one row
+   per gbifID. After this, runs of image_install_db.py never re-read the 59M-row
    multimedia.txt -- the work queue lives in the database.
 3. Imports processed_ids.txt: for each already-finished gbifID it locates the
    downloaded file, renames legacy `<id>.jpg` to `<id>-00.jpg` so the dataset
-   uses one consistent naming scheme, and marks image index 0 as 'success'.
-   gbifIDs with more than one image are left 'partial' so the multi-image
-   downloader goes back and fetches their remaining images.
+   uses one consistent naming scheme, and marks image 0 as 'success'. gbifIDs
+   with more than one distinct image are left 'partial' so the downloader goes
+   back and fetches their remaining images.
 
    NOTE: the old one-image-per-ID downloader shuffled candidate URLs, so for a
-   multi-image gbifID we cannot know which URL the existing file came from. It
-   is recorded against img_index 0 with error_type 'legacy_unverified_index'.
-   ~87% of gbifIDs have only one image, where this assignment is exact.
+   multi-image gbifID we cannot know which image the existing file is. It is
+   recorded against image 0 with error_type 'legacy_unverified_index'.
 
 failed_ids.txt is intentionally NOT imported: those IDs stay 'pending' and get
 a fresh, fully-tracked retry.
 
 This script is destructive-ish (it renames files and can drop an existing DB
-with --reset). It does not download anything. Run it once before the first
-run of image_install_db.py.
+with --reset). It does not download anything.
 
 The legacy import is idempotent and resumable: if it is interrupted, re-run
 with --legacy-only to finish it without redoing the multimedia ingest.
@@ -69,7 +69,7 @@ def progress(msg):
 
 
 def hierarchical_path(base_dir, gbif_id, suffix=""):
-    """Mirror image_install_db.get_hierarchical_path (without makedirs)."""
+    """Path for a stored image (mirrors image_install_db.image_path, no mkdir)."""
     stem = str(gbif_id)
     prefix1 = stem[:3] if len(stem) >= 3 else stem
     prefix2 = stem[3:6] if len(stem) >= 6 else "000"
@@ -98,7 +98,7 @@ def parse_args():
 
 
 def ingest_multimedia(conn, multimedia_path):
-    """Load every image URL from multimedia.txt into images + gbif_ids."""
+    """Group multimedia.txt rows into distinct images and load images + gbif_ids."""
     print(f"Reading {multimedia_path} ...")
     df = pd.read_csv(
         multimedia_path,
@@ -111,37 +111,73 @@ def ingest_multimedia(conn, multimedia_path):
     df["identifier"] = df["identifier"].astype("string")
     print(f"  {len(df):,} (gbifID, URL) rows")
 
-    # Sort so each gbifID's rows are contiguous, then number them 0,1,2,...
+    # Stable sort -> each gbifID's rows keep their multimedia.txt order.
     df = df.sort_values("gbifID", kind="stable").reset_index(drop=True)
-    df["img_index"] = df.groupby("gbifID").cumcount()
+
+    print("  Computing canonical image keys ...")
+    df["image_key"] = df["identifier"].map(ddb.canonical_image_key).astype("string")
+
+    # image_no: dense rank of image_key within each gbifID, by first appearance.
+    df["is_new"] = ~df.duplicated(["gbifID", "image_key"])
+    df["image_no"] = df.groupby("gbifID")["is_new"].cumsum().astype("int32") - 1
+
+    # Order a distinct image's candidate URLs: largest effective size first.
+    # IIIF manifests rank as 1600 (expanded at download time) and lose ties.
+    df["is_manifest"] = df["identifier"].str.contains(
+        "/manifest", case=False, na=False).astype("int8")
+    size = pd.to_numeric(
+        df["identifier"].str.extract(r"/full/(\d+),", expand=False),
+        errors="coerce")
+    big = df["identifier"].str.contains(
+        r"/full/(?:max|full)/", case=False, na=False, regex=True)
+    size = size.where(~big, 100000)
+    size = size.mask(df["is_manifest"] == 1, 1600)   # manifest expands to ~1600
+    df["eff_size"] = size.fillna(0).astype("int32")
+
     df["host"] = (
         df["identifier"].str.extract(r"^[a-zA-Z][a-zA-Z0-9+.-]*://([^/:]+)",
                                      expand=False)
-        .fillna("")
+        .fillna("").astype("string")
+    )
+
+    df = df.sort_values(
+        ["gbifID", "image_no", "eff_size", "is_manifest"],
+        ascending=[True, True, False, True], kind="stable")
+
+    print("  Grouping rows into distinct images ...")
+    images = (
+        df.groupby(["gbifID", "image_no"], sort=True)
+        .agg(urls=("identifier", "\n".join),
+             image_key=("image_key", "first"),
+             host=("host", "first"))
+        .reset_index()
     )
+    del df
+    print(f"    {len(images):,} distinct images")
 
     print("  Inserting image rows ...")
     inserted = 0
-    for start in range(0, len(df), INSERT_BATCH):
-        sub = df.iloc[start:start + INSERT_BATCH]
+    for start in range(0, len(images), INSERT_BATCH):
+        sub = images.iloc[start:start + INSERT_BATCH]
         rows = list(zip(
             sub["gbifID"].tolist(),
-            sub["img_index"].tolist(),
-            sub["identifier"].tolist(),
+            sub["image_no"].tolist(),
+            sub["image_key"].tolist(),
+            sub["urls"].tolist(),
             sub["host"].tolist(),
         ))
         conn.executemany(
-            "INSERT OR IGNORE INTO images(gbif_id, img_index, url, host) "
-            "VALUES(?,?,?,?)",
+            "INSERT OR IGNORE INTO images"
+            "(gbif_id, image_no, image_key, urls, host) VALUES(?,?,?,?,?)",
             rows,
         )
         conn.commit()
         inserted += len(rows)
-        progress(f"    {inserted:,}/{len(df):,} image rows")
-    print(f"    {inserted:,} image rows inserted        ")
+        progress(f"    {inserted:,}/{len(images):,} image rows")
+    print(f"    {inserted:,} distinct images inserted        ")
 
     print("  Inserting gbifID rows ...")
-    sizes = df.groupby("gbifID").size()
+    sizes = images.groupby("gbifID").size()
     gid_rows = list(zip(sizes.index.tolist(), sizes.tolist()))
     for start in range(0, len(gid_rows), INSERT_BATCH):
         conn.executemany(
@@ -173,7 +209,7 @@ def flush(batch):
                     "UPDATE images SET status='success', "
                     "  error_type=?, file_path=?, file_size=?, "
                     "  last_attempt_at=datetime('now') "
-                    "WHERE gbif_id=? AND img_index=0",
+                    "WHERE gbif_id=? AND image_no=0",
                     batch,
                 )
                 conn.commit()
diff --git a/init_download_db.sh b/init_download_db.sh
index f42c1bb..80872d1 100755
--- a/init_download_db.sh
+++ b/init_download_db.sh
@@ -12,15 +12,17 @@ module load academic-ml/spring-2026
 
 conda activate spring-2026-pyt
 
+# The build mode is taken from the qsub command line. Submit with one of:
+#   qsub ... init_download_db.sh --reset         # full (re)build from scratch
+#   qsub ... init_download_db.sh --legacy-only   # only (re-)run the legacy import
 # --processed-file points at the production processed_ids.txt, which lives in
 # ljhao's working directory, not this repo.
-# --legacy-only means only process the legacy images without creating the database.
-#    Run this on subsequent times after db was created.
-python init_download_db.py --legacy-only \
+python init_download_db.py "$@" \
     --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt
 
 # The other big initial run is tracked in
 # /projectnb/herbdl/workspaces/tsehou26/herbarium_project/utils/processed_ids.txt and .../failed_ids.txt
 
-### The command below is used to submit the job to the cluster
-### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh
+### The command below is used to submit the job to the cluster. Use --reset for
+### the first build on the new distinct-image schema; --legacy-only for later top-ups:
+### qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl -m beas -M your_email@bu.edu init_download_db.sh --reset
diff --git a/status_report.py b/status_report.py
index 9c8e4b5..2e71767 100644
--- a/status_report.py
+++ b/status_report.py
@@ -6,18 +6,25 @@
 checkpoint files and re-grouping the 59M-row multimedia.txt with pandas, every
 number here is a single indexed SQL query, so the report returns in seconds.
 
+Counts are over *distinct images* (a IIIF manifest plus its resolution variants
+count once -- see download_db.canonical_image_key).
+
 The same numbers are available ad hoc -- a few useful queries:
 
     -- how many of each kind of failure?
     SELECT error_type, COUNT(*) FROM images
     WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
 
-    -- every URL still worth retrying
-    SELECT gbif_id, url FROM images WHERE status='failed_transient';
+    -- every image still worth retrying
+    SELECT gbif_id, image_no, urls FROM images WHERE status='failed_transient';
+
+    -- hosts that returned an HTML/text page, with one sample captured message
+    SELECT host, error_detail FROM images
+    WHERE error_type='invalid_content_type' GROUP BY host;
 
-    -- worst hosts
-    SELECT host, COUNT(*) FROM images WHERE status LIKE 'failed%'
-    GROUP BY host ORDER BY 2 DESC LIMIT 20;
+    -- raw files (DNG etc.) kept for a later conversion pass
+    SELECT gbif_id, image_no, file_path FROM images
+    WHERE error_type='raw_unprocessed';
 
 Usage:
     python status_report.py [--db PATH] [--output-dir DIR]
@@ -80,16 +87,21 @@ def section(title):
         write(f"Still in the work queue:        {remaining:,}")
 
         # -- per-image progress ----------------------------------------------
-        section("IMAGE (URL) PROGRESS")
+        section("DISTINCT-IMAGE PROGRESS")
         img_counts = dict(conn.execute(
             "SELECT status, COUNT(*) FROM images GROUP BY status").fetchall())
         total_imgs = sum(img_counts.values())
-        write(f"Total image URLs:               {total_imgs:,}")
+        write(f"Total distinct images:          {total_imgs:,}")
         for status in (ddb.ST_SUCCESS, ddb.ST_PENDING,
                        ddb.ST_FAILED_TRANSIENT, ddb.ST_FAILED_PERMANENT):
             count = img_counts.get(status, 0)
             pct = (count / total_imgs * 100) if total_imgs else 0.0
             write(f"  {status:18s}          {count:>14,}  ({pct:5.2f}%)")
+        raw_kept = conn.execute(
+            "SELECT COUNT(*) FROM images WHERE error_type=?",
+            (ddb.ERR_RAW_UNPROCESSED,)).fetchone()[0]
+        write(f"  (of 'success', kept raw -- DNG etc., need conversion: "
+              f"{raw_kept:,})")
 
         # -- failure breakdown ------------------------------------------------
         section("FAILURES BY TYPE")
@@ -117,6 +129,23 @@ def section(title):
         if not rows:
             write("(none)")
 
+        # -- non-image (HTML/text) responses ---------------------------------
+        section("NON-IMAGE RESPONSES BY HOST (for follow-up)")
+        write("Hosts whose URLs returned an HTML/text page instead of an image")
+        write("(e.g. 'direct download no longer supported'). Sample message shown.")
+        write("-" * 70)
+        rows = conn.execute(
+            "SELECT host, COUNT(*) AS n, MIN(error_detail) "
+            "FROM images WHERE error_type=? "
+            "GROUP BY host ORDER BY n DESC LIMIT 20",
+            (ddb.ERR_INVALID_CONTENT,)).fetchall()
+        for host, count, sample in rows:
+            write(f"{(host or '?')[:45]:45s} {count:>10,}")
+            if sample:
+                write(f"    {sample[:200].strip()}")
+        if not rows:
+            write("(none recorded yet)")
+
         # -- worst hosts ------------------------------------------------------
         section("TOP 20 HOSTS BY FAILED IMAGES")
         write(f"{'host':40s} {'failed':>10s} {'success':>10s}")
@@ -142,11 +171,15 @@ def section(title):
         write(f"Hosts currently in cooldown:                          {blocked:,}")
 
         section("NOTES")
-        write("- 'done'    = every image URL for the gbifID succeeded.")
+        write("- Counts are over distinct images: a IIIF manifest plus its")
+        write("  resolution variants count as one image.")
+        write("- 'done'    = every distinct image for the gbifID succeeded.")
         write("- 'partial' = still has retryable work; stays in the queue.")
         write("- 'failed'  = all images terminal, not all succeeded; no retries left.")
         write("- failed_transient images are retried until "
               f"{ddb.MAX_ATTEMPTS} attempts, then count toward 'failed'.")
+        write("- 'raw_unprocessed' images ARE downloaded (status success); they")
+        write("  are raw files (DNG etc.) awaiting a separate conversion pass.")
         write()
         write(f"Summary written to: {os.path.abspath(output_file)}")
 

From 138974c92cac9c26190776f2f5fe81610b4043ac Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Sun, 24 May 2026 15:57:13 -0400
Subject: [PATCH 11/19] stream the distinct-image groupby so ingest fits in
 wall-clock

The previous ingest used df.groupby([gbifID, image_no]).agg("\n".join,
image_key="first", host="first") to build one row per distinct image.
With ~52M groups and a Python callable, that one statement did not
finish inside the 12 h qsub limit -- the job died before ever reaching
the first INSERT, so nothing was committed.

Replace it with a single-pass streaming groupby that walks the already-
sorted rows in native Python lists and emits one INSERT batch per chunk
of distinct images (commit every 200k images, so partial progress is
saved on a future kill too). Replace the pandas gbif_ids build with one
SQL `INSERT INTO gbif_ids SELECT gbif_id, COUNT(*) FROM images
GROUP BY gbif_id` which rides the (gbif_id, image_no) primary-key index.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 init_download_db.py | 95 +++++++++++++++++++++++++++------------------
 1 file changed, 57 insertions(+), 38 deletions(-)

diff --git a/init_download_db.py b/init_download_db.py
index 899f980..15e1a7e 100644
--- a/init_download_db.py
+++ b/init_download_db.py
@@ -40,6 +40,7 @@
 
 import os
 import sys
+import gc
 import time
 import sqlite3
 import argparse
@@ -144,48 +145,66 @@ def ingest_multimedia(conn, multimedia_path):
         ["gbifID", "image_no", "eff_size", "is_manifest"],
         ascending=[True, True, False, True], kind="stable")
 
-    print("  Grouping rows into distinct images ...")
-    images = (
-        df.groupby(["gbifID", "image_no"], sort=True)
-        .agg(urls=("identifier", "\n".join),
-             image_key=("image_key", "first"),
-             host=("host", "first"))
-        .reset_index()
-    )
-    del df
-    print(f"    {len(images):,} distinct images")
-
-    print("  Inserting image rows ...")
-    inserted = 0
-    for start in range(0, len(images), INSERT_BATCH):
-        sub = images.iloc[start:start + INSERT_BATCH]
-        rows = list(zip(
-            sub["gbifID"].tolist(),
-            sub["image_no"].tolist(),
-            sub["image_key"].tolist(),
-            sub["urls"].tolist(),
-            sub["host"].tolist(),
-        ))
+    print("  Streaming distinct images into the database ...")
+    # Single-pass streaming groupby. The previous pandas
+    # .agg('\n'.join, ...) over ~52M groups was the bottleneck (did not
+    # finish inside a 12 h job). Pull the sorted columns into native
+    # Python lists and emit one INSERT batch per chunk of distinct images.
+    gid_l  = df["gbifID"].tolist();     del df["gbifID"]
+    no_l   = df["image_no"].tolist();   del df["image_no"]
+    url_l  = df["identifier"].tolist(); del df["identifier"]
+    key_l  = df["image_key"].tolist();  del df["image_key"]
+    host_l = df["host"].tolist();       del df
+    gc.collect()
+    n_rows = len(gid_l)
+
+    img_batch = []
+    distinct = 0
+    cur_gid = cur_no = None
+    cur_key = cur_host = None
+    cur_urls = []
+
+    def flush():
+        if not img_batch:
+            return
         conn.executemany(
             "INSERT OR IGNORE INTO images"
             "(gbif_id, image_no, image_key, urls, host) VALUES(?,?,?,?,?)",
-            rows,
-        )
-        conn.commit()
-        inserted += len(rows)
-        progress(f"    {inserted:,}/{len(images):,} image rows")
-    print(f"    {inserted:,} distinct images inserted        ")
-
-    print("  Inserting gbifID rows ...")
-    sizes = images.groupby("gbifID").size()
-    gid_rows = list(zip(sizes.index.tolist(), sizes.tolist()))
-    for start in range(0, len(gid_rows), INSERT_BATCH):
-        conn.executemany(
-            "INSERT OR IGNORE INTO gbif_ids(gbif_id, n_images) VALUES(?,?)",
-            gid_rows[start:start + INSERT_BATCH],
-        )
+            img_batch)
         conn.commit()
-    print(f"    {len(gid_rows):,} gbifIDs inserted")
+        img_batch.clear()
+
+    for i in range(n_rows):
+        g = gid_l[i]
+        n = no_l[i]
+        if g != cur_gid or n != cur_no:
+            if cur_gid is not None:
+                img_batch.append((cur_gid, cur_no, cur_key,
+                                  "\n".join(cur_urls), cur_host))
+                distinct += 1
+                if len(img_batch) >= INSERT_BATCH:
+                    flush()
+                    progress(f"    {distinct:,} images inserted "
+                             f"({100 * i / n_rows:.1f}% through input)")
+            cur_gid, cur_no = g, n
+            cur_key, cur_host = key_l[i], host_l[i]
+            cur_urls = [url_l[i]]
+        else:
+            cur_urls.append(url_l[i])
+    if cur_gid is not None:
+        img_batch.append((cur_gid, cur_no, cur_key,
+                          "\n".join(cur_urls), cur_host))
+        distinct += 1
+    flush()
+    print(f"    {distinct:,} distinct images inserted        ")
+
+    print("  Populating gbif_ids from images ...")
+    conn.execute(
+        "INSERT INTO gbif_ids(gbif_id, n_images) "
+        "SELECT gbif_id, COUNT(*) FROM images GROUP BY gbif_id")
+    conn.commit()
+    n_gbif = conn.execute("SELECT COUNT(*) FROM gbif_ids").fetchone()[0]
+    print(f"    {n_gbif:,} gbifIDs inserted")
 
 
 def import_legacy(conn, processed_file, install_path):

From 54fb3e31de6aa741f7da0cb5835565dfb84cbbf1 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Tue, 26 May 2026 15:17:55 -0400
Subject: [PATCH 12/19] chunk the gbif_ids recompute and add --finalize-only
 resume mode

The final step of the legacy import rolled images.status up into
gbif_ids.status via one giant UPDATE with three correlated COUNT
subqueries plus an `IN (SELECT DISTINCT gbif_id FROM images ...)`
filter. SQLite's planner pessimised it badly enough that it did not
finish in 24 h, and because the whole thing was a single transaction,
nothing was committed and no progress was visible.

Replace it with _finalize_gbif_ids_status(): one
  SELECT gbif_id, COUNT(*) FROM images WHERE status='success' GROUP BY gbif_id
to materialise per-gbifID success counts, an in-memory n_images map
from gbif_ids, then chunked UPDATEs of 50k rows each with a commit and
a progress print per batch. A kill mid-finalize now leaves a partially
updated table and re-running is a safe idempotent retry.

Add --finalize-only so the finalize can be re-run without redoing the
disk re-scan -- useful when import_legacy completed its per-image
flushes but the recompute did not.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 init_download_db.py | 93 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/init_download_db.py b/init_download_db.py
index 15e1a7e..0b7ed1e 100644
--- a/init_download_db.py
+++ b/init_download_db.py
@@ -29,12 +29,15 @@
 
 The legacy import is idempotent and resumable: if it is interrupted, re-run
 with --legacy-only to finish it without redoing the multimedia ingest.
+The final "roll up images.status into gbif_ids" step is also resumable on its
+own -- re-run with --finalize-only to redo just that, no disk re-scan.
 
 Usage
 -----
     python init_download_db.py                 # build DB + import legacy
     python init_download_db.py --skip-legacy   # build DB only
     python init_download_db.py --legacy-only   # (re-)run only the legacy import
+    python init_download_db.py --finalize-only # only roll images -> gbif_ids
     python init_download_db.py --reset         # rebuild from scratch
 """
 
@@ -93,6 +96,9 @@ def parse_args():
     p.add_argument("--legacy-only", action="store_true",
                    help="Skip the ingest; only (re-)run the legacy import on an "
                         "existing database (use this to finish an interrupted import)")
+    p.add_argument("--finalize-only", action="store_true",
+                   help="Skip ingest and the disk re-scan; only (re-)roll "
+                        "images.status up into gbif_ids.status on an existing DB")
     p.add_argument("--reset", action="store_true",
                    help="Delete an existing database before building")
     return p.parse_args()
@@ -281,27 +287,61 @@ def flush(batch):
     print(f"    renamed={renamed:,}  already-suffixed={relabeled:,}  "
           f"file-missing={missing:,}")
 
-    # Roll the per-image success flags up into gbif_ids statuses in one pass.
-    print("  Recomputing gbifID statuses ...")
-    conn.execute(
-        "UPDATE gbif_ids SET "
-        "  n_success=(SELECT COUNT(*) FROM images i "
-        "             WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success'), "
-        "  status=CASE "
-        "    WHEN n_images>0 AND n_images=(SELECT COUNT(*) FROM images i "
-        "         WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success') "
-        "      THEN 'done' "
-        "    WHEN (SELECT COUNT(*) FROM images i "
-        "         WHERE i.gbif_id=gbif_ids.gbif_id AND i.status='success')>0 "
-        "      THEN 'partial' "
-        "    ELSE 'pending' END "
-        "WHERE gbif_id IN (SELECT DISTINCT gbif_id FROM images "
-        "                  WHERE status='success')"
-    )
+    # Roll the per-image success flags up into gbif_ids statuses.
+    _finalize_gbif_ids_status(conn)
+
+
+def _finalize_gbif_ids_status(conn):
+    """
+    Roll images.status up into gbif_ids (n_success; status done / partial) for
+    gbifIDs with >=1 success image; stamp completed_at on freshly-done IDs.
+
+    Chunked and committed per batch so progress is visible and a kill mid-run
+    is resumable -- a re-run just re-applies the same UPDATEs.
+
+    A previous one-statement UPDATE with three correlated COUNT subqueries
+    and an `IN (SELECT DISTINCT ...)` filter did not finish in 24 h; this
+    Python-side pass takes well under an hour.
+    """
+    print("  Counting success images per gbifID ...")
+    success_counts = conn.execute(
+        "SELECT gbif_id, COUNT(*) FROM images "
+        "WHERE status='success' GROUP BY gbif_id ORDER BY gbif_id"
+    ).fetchall()
+    print(f"    {len(success_counts):,} gbifIDs have at least one success image")
+
+    print("  Loading n_images per gbifID ...")
+    n_images_map = dict(conn.execute(
+        "SELECT gbif_id, n_images FROM gbif_ids"
+    ).fetchall())
+    print(f"    {len(n_images_map):,} gbifIDs total")
+
+    print(f"  Updating gbif_ids statuses ...")
+    BATCH = 50_000
+    updated = 0
+    for start in range(0, len(success_counts), BATCH):
+        chunk = success_counts[start:start + BATCH]
+        rows = []
+        for gid, n_success in chunk:
+            n_total = n_images_map.get(gid, 0)
+            if n_total > 0 and n_success >= n_total:
+                status = "done"
+            elif n_success > 0:
+                status = "partial"
+            else:
+                status = "pending"
+            rows.append((n_success, status, gid))
+        conn.executemany(
+            "UPDATE gbif_ids SET n_success=?, status=? WHERE gbif_id=?", rows)
+        conn.commit()
+        updated += len(rows)
+        progress(f"    {updated:,}/{len(success_counts):,} gbif_ids updated")
+    print(f"    {updated:,} gbif_ids updated        ")
+
+    print("  Setting completed_at for done gbifIDs ...")
     conn.execute(
         "UPDATE gbif_ids SET completed_at=datetime('now') "
-        "WHERE status='done' AND completed_at IS NULL"
-    )
+        "WHERE status='done' AND completed_at IS NULL")
     conn.commit()
 
 
@@ -358,6 +398,21 @@ def main():
         print(f"\nDone in {time.time() - start:.0f}s.")
         return
 
+    # --finalize-only: just (re-)roll images.status into gbif_ids.status. Use
+    # this to finish a run that was killed during the recompute step.
+    if args.finalize_only:
+        if not os.path.exists(args.db):
+            sys.exit(f"--finalize-only needs an existing database, but none was "
+                     f"found at: {args.db}\nRun the full build first.")
+        print(f"--finalize-only: rolling images.status up into gbif_ids on "
+              f"{args.db}")
+        conn = connect(args.db, bulk_load=False)
+        _finalize_gbif_ids_status(conn)
+        report_status(conn)
+        conn.close()
+        print(f"\nDone in {time.time() - start:.0f}s.")
+        return
+
     if os.path.exists(args.db):
         if args.reset:
             print(f"Removing existing database {args.db}")

From f72673cf81159e8d09989bc1da4b3d7ac4c06d91 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Tue, 26 May 2026 15:24:27 -0400
Subject: [PATCH 13/19] document --finalize-only and the three resumable build
 stages

Update DEPLOYMENT.md for the recent init_download_db.py changes:
- Phase 1 qsub example now passes --reset and explains the wrapper
  forwards "$@" through, so --legacy-only and --finalize-only work the
  same way.
- Options table gains a --finalize-only row.
- "Re-running is safe" rewritten as three resumable stages: ingest
  (--reset), legacy import (--legacy-only), finalize (--finalize-only).
- Troubleshooting gets a row for jobs killed during "Recomputing gbifID
  statuses".

Cross-link gbif-metadata-download.md to DEPLOYMENT.md so the GBIF query
how-to ends pointing at the project-specific build/download pipeline.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 DEPLOYMENT.md                  | 26 +++++++++++++++++++-------
 docs/gbif-metadata-download.md | 15 ++++++++++++++-
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md
index f9fce1c..2f675e2 100644
--- a/DEPLOYMENT.md
+++ b/DEPLOYMENT.md
@@ -61,17 +61,21 @@ It is heavy — it reads the ~59M-row `multimedia.txt` with pandas and renames u
 to ~13.5M files. **Run it as a batch job, not on a login node.**
 
 ```bash
+# fresh build (use --reset if a DB already exists at the destination):
 qsub -N init_download_db -l h_rt=12:00:00 -pe omp 16 -P herbdl \
-     -m beas -M your_email@bu.edu init_download_db.sh
+     -m beas -M your_email@bu.edu init_download_db.sh --reset
 ```
 
-`init_download_db.sh` runs:
+`init_download_db.sh` forwards its arguments through to the python script:
 
 ```bash
-python init_download_db.py \
+python init_download_db.py "$@" \
     --processed-file /projectnb/herbdl/workspaces/ljhao/herbdl/utils/processed_ids.txt
 ```
 
+So `qsub ... init_download_db.sh --reset`, `... --legacy-only`, and
+`... --finalize-only` all work without editing the wrapper.
+
 > **Important:** the production `processed_ids.txt` (~13.5M IDs) lives in
 > ljhao's working directory, not in this repo. The wrapper already points there.
 > If you build the DB by hand, pass that `--processed-file` path explicitly, or
@@ -84,6 +88,7 @@ python init_download_db.py \
 | `python init_download_db.py` | Build DB + import legacy progress |
 | `python init_download_db.py --skip-legacy` | Build DB only (everything starts `pending`) |
 | `python init_download_db.py --legacy-only` | Skip the ingest; only (re-)run the legacy import on an existing DB |
+| `python init_download_db.py --finalize-only` | Skip ingest and disk re-scan; only (re-)roll `images.status` up into `gbif_ids.status` |
 | `python init_download_db.py --reset` | Delete an existing DB and rebuild from scratch |
 
 **Expected output** — a status breakdown, e.g.:
@@ -100,10 +105,16 @@ Final gbifID status counts:
 - `pending` — never attempted
 
 Re-running is safe: file renames and database updates are idempotent
-(already-renamed files are detected and reused). If the **ingest** fails partway,
-re-run with `--reset`. If only the **legacy import** fails partway (e.g. it was
-interrupted), re-run with `--legacy-only` — that finishes the import without
-redoing the hour-long ingest.
+(already-renamed files are detected and reused). The build has three resumable
+stages:
+
+1. **Ingest** (`multimedia.txt` → `images`/`gbif_ids`). If it fails partway,
+   re-run with `--reset`.
+2. **Legacy import** (disk re-scan + mark image 0 success per gbifID). If
+   killed partway, re-run with `--legacy-only` — skips the hour-long ingest.
+3. **Finalize** (roll `images.status` up into `gbif_ids.status` in 50k-row
+   batches). If killed partway, re-run with `--finalize-only` — skips ingest
+   AND the disk re-scan; takes well under an hour.
 
 ---
 
@@ -241,5 +252,6 @@ must be re-run.
 | `Database already exists` from the builder | Intended guard — `--reset` to rebuild, or `--legacy-only` to just (re-)run the legacy import. |
 | `database is locked` | The builder now uses WAL mode (readers do not block the writer) and a 120 s busy timeout, so this should not recur. If the legacy import was interrupted by it, finish it with `init_download_db.py --legacy-only`. Still avoid running two writers against one DB. |
 | Legacy import interrupted partway | Re-run `init_download_db.py --legacy-only` — it is idempotent and skips the hour-long ingest. |
+| Killed during the finalize step ("Updating gbif_ids statuses"; "Recomputing gbifID statuses" in older builds) | Re-run with `--finalize-only` — it skips ingest and the disk re-scan, commits every 50k gbif_ids, and finishes in under an hour. The previous one-shot `UPDATE` (correlated subqueries + `IN (SELECT DISTINCT …)`) didn't finish in 24h; the chunked replacement does. |
 | Builder runs out of memory | `multimedia.txt` is large; request more memory (e.g. a larger `-pe omp` slot count). |
 | Legacy progress not imported | `--processed-file` was not pointed at ljhao's `processed_ids.txt`. |
diff --git a/docs/gbif-metadata-download.md b/docs/gbif-metadata-download.md
index 003f695..1e5a9d3 100644
--- a/docs/gbif-metadata-download.md
+++ b/docs/gbif-metadata-download.md
@@ -68,4 +68,17 @@ You can send a JSON object to `https://api.gbif.org/v1/occurrence/download/reque
 
 ### Pro-Tip for Data Handling
 
-Once your download is processed and unzipped, the `multimedia.txt` file will serve as your master list for image links, which you can link back to the metadata in `occurrence.txt` using the shared `gbifID` column. If you are using Python, you can also use the [`plantnet/gbif-dl`](https://www.google.com/search?q=%5Bhttps://github.com/plantnet/gbif-dl%5D(https://github.com/plantnet/gbif-dl)) library specifically designed to parse these queries and download the image assets efficiently.
\ No newline at end of file
+Once your download is processed and unzipped, the `multimedia.txt` file will serve as your master list for image links, which you can link back to the metadata in `occurrence.txt` using the shared `gbifID` column. If you are using Python, you can also use the [`plantnet/gbif-dl`](https://github.com/plantnet/gbif-dl) library specifically designed to parse these queries and download the image assets efficiently.
+
+---
+
+### Next: download the images for this project
+
+Once you have a fresh `multimedia.txt`, place it at
+`/projectnb/herbdl/data/GBIF-F25/multimedia.txt` and follow
+[DEPLOYMENT.md](../DEPLOYMENT.md) — it builds a SQLite status database from the
+file (one row per *distinct image*, with IIIF manifest + resolution variants
+deduplicated), imports any prior `processed_ids.txt` progress, and runs the
+parallel downloader. The build has three resumable stages (ingest, legacy
+import, finalize), each with its own `--reset` / `--legacy-only` /
+`--finalize-only` flag.
\ No newline at end of file

From 11c4ac3ec4e02d7cd7ea64e8b7d3016235893f07 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Thu, 28 May 2026 17:25:38 -0400
Subject: [PATCH 14/19] open status_report.py read-only; add
 db_integrity_check.sh

status_report.py was opening the DB read-write with no busy timeout, so
a brief WAL contention window from the running downloader could surface
as `sqlite3.OperationalError: disk I/O error` mid-report. Open the
connection read-only via the URI form and set a 60 s busy timeout so
SQLite waits through a writer checkpoint instead of erroring out.

Add db_integrity_check.sh: a qsub wrapper that runs PRAGMA
integrity_check (read-only against the live DB, safe alongside an
active writer in WAL mode) and prints fresh row counts at the end so
results can be cross-checked against status_report.py.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 db_integrity_check.sh | 40 ++++++++++++++++++++++++++++++++++++++++
 status_report.py      |  5 ++++-
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100755 db_integrity_check.sh

diff --git a/db_integrity_check.sh b/db_integrity_check.sh
new file mode 100755
index 0000000..2a9d07c
--- /dev/null
+++ b/db_integrity_check.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -l
+
+# Run SQLite's PRAGMA integrity_check against download_status.db.
+#
+# Read-only against the live DB -- in WAL mode the running downloader can
+# keep writing while this reads a consistent snapshot. On a ~19 GB DB this
+# typically takes ~10-60 minutes; h_rt below is set to 4 h for headroom.
+#
+# Also prints fresh row counts at the end so we can cross-check against the
+# numbers status_report.py was seeing.
+
+module load miniconda
+module load academic-ml/spring-2026
+
+conda activate spring-2026-pyt
+
+DB=/projectnb/herbdl/data/GBIF-F25h/download_status.db
+
+echo "=== PRAGMA integrity_check on $DB ==="
+echo "started: $(date)"
+python3 -u -c "
+import sqlite3
+conn = sqlite3.connect('file:$DB?mode=ro', uri=True, timeout=300)
+conn.execute('PRAGMA busy_timeout=300000')
+print('  running PRAGMA integrity_check ...', flush=True)
+for row in conn.execute('PRAGMA integrity_check'):
+    print(f'    {row[0]}', flush=True)
+print()
+print('  row counts (live snapshot):', flush=True)
+for t in ('images', 'gbif_ids', 'hosts'):
+    n = conn.execute('SELECT COUNT(*) FROM ' + t).fetchone()[0]
+    print(f'    {t}: {n:,} rows', flush=True)
+print('  gbif_ids by status:', flush=True)
+for status, n in conn.execute('SELECT status, COUNT(*) FROM gbif_ids GROUP BY status'):
+    print(f'    {status}: {n:,}', flush=True)
+"
+echo "finished: $(date)"
+
+### The command below is used to submit the job to the cluster:
+### qsub -N db_integrity -l h_rt=4:00:00 -pe omp 4 -P herbdl -j y -o db_integrity.out db_integrity_check.sh
diff --git a/status_report.py b/status_report.py
index 2e71767..3f738fd 100644
--- a/status_report.py
+++ b/status_report.py
@@ -53,7 +53,10 @@ def main():
     if not os.path.exists(args.db):
         raise SystemExit(f"Status database not found: {args.db}")
 
-    conn = sqlite3.connect(args.db)
+    # Read-only with a generous busy timeout so a brief WAL contention
+    # window from a running downloader can't surface as "disk I/O error".
+    conn = sqlite3.connect(f"file:{args.db}?mode=ro", uri=True, timeout=60)
+    conn.execute("PRAGMA busy_timeout=60000")
     run_time = datetime.now()
     output_file = os.path.join(
         args.output_dir, f"summary_{run_time:%Y%m%d%H%M}.txt")

From 482cc010ee40b85335e85a5666238822c19e0233 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Fri, 29 May 2026 16:55:11 -0400
Subject: [PATCH 15/19] retry SELECTs in status_report.py to ride out WAL
 hiccups

Brief WAL checkpoint races on GPFS occasionally surface as transient
`disk I/O error` or `file is not a database` when reading the live DB
while image_install_db is writing. PRAGMA integrity_check confirmed the
DB itself is fine; the read is just unlucky.

Add _q_all() / _q_one() helpers that wrap conn.execute().fetchall() /
.fetchone() with up to three attempts, sleeping 0.5 s and 2 s between
tries on sqlite3.OperationalError / DatabaseError. Switch every query
call site in main() to go through them. If all three attempts still
fail, the original exception propagates.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 status_report.py | 64 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/status_report.py b/status_report.py
index 3f738fd..6055a45 100644
--- a/status_report.py
+++ b/status_report.py
@@ -31,6 +31,7 @@
 """
 
 import os
+import time
 import sqlite3
 import argparse
 from datetime import datetime
@@ -38,6 +39,32 @@
 import download_db as ddb
 
 
+# Transient WAL/checkpoint races on networked filesystems (GPFS) can briefly
+# surface as `disk I/O error` or `file is not a database`. The data is fine;
+# the read is just unlucky. Retry once or twice and the next attempt succeeds.
+_RETRY_DELAYS = (0.5, 2.0)
+
+
+def _q_all(conn, sql, params=()):
+    """Run a SELECT and fetchall() with brief retries on transient I/O errors."""
+    for delay in _RETRY_DELAYS:
+        try:
+            return conn.execute(sql, params).fetchall()
+        except (sqlite3.OperationalError, sqlite3.DatabaseError):
+            time.sleep(delay)
+    return conn.execute(sql, params).fetchall()
+
+
+def _q_one(conn, sql, params=()):
+    """Run a SELECT and fetchone() with brief retries on transient I/O errors."""
+    for delay in _RETRY_DELAYS:
+        try:
+            return conn.execute(sql, params).fetchone()
+        except (sqlite3.OperationalError, sqlite3.DatabaseError):
+            time.sleep(delay)
+    return conn.execute(sql, params).fetchone()
+
+
 def parse_args():
     p = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
@@ -78,8 +105,8 @@ def section(title):
 
         # -- gbifID progress --------------------------------------------------
         section("GBIFID PROGRESS")
-        gbif_counts = dict(conn.execute(
-            "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status").fetchall())
+        gbif_counts = dict(_q_all(conn,
+            "SELECT status, COUNT(*) FROM gbif_ids GROUP BY status"))
         total_ids = sum(gbif_counts.values())
         write(f"Total gbifIDs:                  {total_ids:,}")
         for status in (ddb.G_DONE, ddb.G_PARTIAL, ddb.G_PENDING, ddb.G_FAILED):
@@ -91,8 +118,8 @@ def section(title):
 
         # -- per-image progress ----------------------------------------------
         section("DISTINCT-IMAGE PROGRESS")
-        img_counts = dict(conn.execute(
-            "SELECT status, COUNT(*) FROM images GROUP BY status").fetchall())
+        img_counts = dict(_q_all(conn,
+            "SELECT status, COUNT(*) FROM images GROUP BY status"))
         total_imgs = sum(img_counts.values())
         write(f"Total distinct images:          {total_imgs:,}")
         for status in (ddb.ST_SUCCESS, ddb.ST_PENDING,
@@ -100,9 +127,9 @@ def section(title):
             count = img_counts.get(status, 0)
             pct = (count / total_imgs * 100) if total_imgs else 0.0
             write(f"  {status:18s}          {count:>14,}  ({pct:5.2f}%)")
-        raw_kept = conn.execute(
+        raw_kept = _q_one(conn,
             "SELECT COUNT(*) FROM images WHERE error_type=?",
-            (ddb.ERR_RAW_UNPROCESSED,)).fetchone()[0]
+            (ddb.ERR_RAW_UNPROCESSED,))[0]
         write(f"  (of 'success', kept raw -- DNG etc., need conversion: "
               f"{raw_kept:,})")
 
@@ -110,10 +137,10 @@ def section(title):
         section("FAILURES BY TYPE")
         write(f"{'error_type':24s} {'count':>14s}  {'verdict':s}")
         write("-" * 60)
-        rows = conn.execute(
+        rows = _q_all(conn,
             "SELECT error_type, COUNT(*) FROM images "
             "WHERE status LIKE 'failed%' AND error_type IS NOT NULL "
-            "GROUP BY error_type ORDER BY 2 DESC").fetchall()
+            "GROUP BY error_type ORDER BY 2 DESC")
         for error_type, count in rows:
             verdict = "permanent" if ddb.is_permanent(error_type) else "retryable"
             write(f"{error_type:24s} {count:>14,}  {verdict}")
@@ -122,10 +149,9 @@ def section(title):
 
         # -- retry attempt distribution --------------------------------------
         section("RETRY ATTEMPTS (failed_transient images)")
-        rows = conn.execute(
+        rows = _q_all(conn,
             "SELECT attempts, COUNT(*) FROM images "
-            "WHERE status='failed_transient' GROUP BY attempts ORDER BY attempts"
-        ).fetchall()
+            "WHERE status='failed_transient' GROUP BY attempts ORDER BY attempts")
         for attempts, count in rows:
             note = "  <- retry budget exhausted" if attempts >= ddb.MAX_ATTEMPTS else ""
             write(f"  {attempts} attempt(s): {count:,}{note}")
@@ -137,11 +163,11 @@ def section(title):
         write("Hosts whose URLs returned an HTML/text page instead of an image")
         write("(e.g. 'direct download no longer supported'). Sample message shown.")
         write("-" * 70)
-        rows = conn.execute(
+        rows = _q_all(conn,
             "SELECT host, COUNT(*) AS n, MIN(error_detail) "
             "FROM images WHERE error_type=? "
             "GROUP BY host ORDER BY n DESC LIMIT 20",
-            (ddb.ERR_INVALID_CONTENT,)).fetchall()
+            (ddb.ERR_INVALID_CONTENT,))
         for host, count, sample in rows:
             write(f"{(host or '?')[:45]:45s} {count:>10,}")
             if sample:
@@ -153,23 +179,23 @@ def section(title):
         section("TOP 20 HOSTS BY FAILED IMAGES")
         write(f"{'host':40s} {'failed':>10s} {'success':>10s}")
         write("-" * 64)
-        rows = conn.execute(
+        rows = _q_all(conn,
             "SELECT host, "
             "  SUM(CASE WHEN status LIKE 'failed%' THEN 1 ELSE 0 END) AS failed, "
             "  SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) AS ok "
             "FROM images WHERE host IS NOT NULL AND host != '' "
-            "GROUP BY host ORDER BY failed DESC LIMIT 20").fetchall()
+            "GROUP BY host ORDER BY failed DESC LIMIT 20")
         for host, failed, ok in rows:
             write(f"{host[:40]:40s} {failed or 0:>10,} {ok or 0:>10,}")
 
         # -- circuit-breaker state -------------------------------------------
         section("CIRCUIT BREAKER / COOLDOWNS")
-        broken = conn.execute(
-            "SELECT COUNT(*) FROM hosts WHERE error_count >= 500").fetchone()[0]
-        blocked = conn.execute(
+        broken = _q_one(conn,
+            "SELECT COUNT(*) FROM hosts WHERE error_count >= 500")[0]
+        blocked = _q_one(conn,
             "SELECT COUNT(*) FROM hosts "
             "WHERE blocked_until IS NOT NULL "
-            "AND blocked_until > strftime('%s','now')").fetchone()[0]
+            "AND blocked_until > strftime('%s','now')")[0]
         write(f"Hosts past the circuit-breaker threshold (500 errors): {broken:,}")
         write(f"Hosts currently in cooldown:                          {blocked:,}")
 

From 4b229c3640c58d1e5a767d328c4b7e31048e4bce Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Mon, 1 Jun 2026 10:43:04 -0400
Subject: [PATCH 16/19] fix README example query: url -> urls (and add LIMIT)

The `images` column is `urls` (plural -- newline-joined candidate URLs
for a distinct image, since IIIF manifest + resolution variants collapse
into one row). The example was carrying over the old per-URL column name
and would have failed with "no such column: url". Also add LIMIT 50 so
the example does not dump every retryable row.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index fbee679..ce878f5 100644
--- a/README.md
+++ b/README.md
@@ -125,11 +125,15 @@ python status_report.py [--db PATH] [--output-dir DIR]
 Ad hoc queries against the database, e.g.:
 ```sql
 -- count each kind of failure
-SELECT error_type, COUNT(*) FROM images
-WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
+SELECT error_type, COUNT(*) FROM images WHERE status LIKE 'failed%' GROUP BY error_type ORDER BY 2 DESC;
 
--- every URL still worth retrying
-SELECT gbif_id, url FROM images WHERE status='failed_transient';
+-- every image still worth retrying (urls is the newline-joined candidate list)
+SELECT gbif_id, urls FROM images WHERE status='failed_transient' LIMIT 50;
+```
+
+Count the images downloaded successfully since a given date and time:
+```bash
+sqlite3 -readonly /projectnb/herbdl/data/GBIF-F25h/download_status.db "SELECT COUNT(*) FROM images WHERE status='success' AND last_attempt_at > '2026-05-31 20:00:00';"
 ```
 
 ### Image Processing

From 9ec6b412f0b40af6ffba643e59c4f69811fbd6ff Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:47:53 -0400
Subject: [PATCH 17/19] add documentation on db_integrity_check.sh

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index ce878f5..c32aafe 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,30 @@ Count of how many downloads since a certain date and time.
 sqlite3 -readonly /projectnb/herbdl/data/GBIF-F25h/download_status.db "SELECT COUNT(*) FROM images WHERE status='success' AND last_attempt_at > '2026-05-31 20:00:00';"
 ```
 
+#### `db_integrity_check.sh`
+**Purpose**: Run SQLite's `PRAGMA integrity_check` against `download_status.db` and print a fresh row-count snapshot. Use it whenever something looks off — e.g. transient `file is not a database` / `disk I/O error` from concurrent readers, anomalous row counts, or just for periodic verification.
+
+**Best practice — pause the downloader first**. Concurrent WAL writes from `image_install_db.py` on networked filesystems (GPFS) can race with the integrity scan and either slow it down or surface as false positives. The downloader is fully resumable, so stopping it costs nothing.
+
+**Usage**:
+```bash
+# 1. pause the downloader (find its job-ID with `qstat -u $USER`)
+qdel <image_install_db_jobid>
+
+# 2. give SQLite a few seconds to checkpoint the WAL
+sleep 30
+
+# 3. submit the integrity check
+qsub -N db_integrity -l h_rt=4:00:00 -pe omp 4 -P herbdl -j y \
+     -o db_integrity.out db_integrity_check.sh
+
+# 4. when db_integrity.out shows 'ok', restart the downloader
+qsub -N image_install_db -l h_rt=48:00:00 -pe omp 16 -P herbdl \
+     -m beas -M your_email@bu.edu image_install_db.sh
+```
+
+On a clean DB the check prints `ok` (takes ~10–20 minutes on a ~20 GB DB), followed by current row counts for `images`, `gbif_ids`, `hosts`, and a `gbif_ids`-by-status breakdown. Any other output indicates real corruption — capture it from `db_integrity.out`.
+
 ### Image Processing
 
 #### `image_utils.py`

From 889cc0854f6c49531f06a408096966f49de08333 Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Wed, 3 Jun 2026 14:06:26 -0400
Subject: [PATCH 18/19] switch notifications.py from Pushover to Slack webhook

Replace the Pushover POST in send_notification() with a Slack incoming
webhook POST. SLACK_WEBHOOK_URL is read from .env; without it set the
function logs a one-line warning and silently no-ops so the downloader
keeps working without notifications. A try/except wraps the POST so a
Slack hiccup never interrupts the caller.

Update the README:
- Add a "Push notifications (optional)" callout in the image_install_db.py
  section that points at the notifications.py setup.
- Rewrite the notifications.py setup as a five-step Slack walkthrough
  (create Slack app -> enable Incoming Webhooks -> add to channel ->
   paste URL into .env -> chmod 600 + verify with a one-liner).
- Add a db_integrity_check.sh section describing when/how to run the
  integrity check (pause downloader -> qdel -> qsub -> resume).
- Fix the example query to use `urls` (plural) and add LIMIT.

Add `.env` to .gitignore so the Slack webhook URL never gets committed.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .gitignore       |  3 +++
 README.md        | 33 +++++++++++++++++++++++++--------
 notifications.py | 40 ++++++++++++++++++++++------------------
 3 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/.gitignore b/.gitignore
index 68a21d2..1ea641e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 .venv/
 __pycache__/
 
+# secrets (Slack incoming webhook URL for notifications.py)
+.env
+
 # ignore generated summary files
 summary*.txt
 
diff --git a/README.md b/README.md
index c32aafe..a3e9605 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,8 @@ python image_install_db.py [--db PATH]
 - Failures are classified (404, 401, timeout, rate-limited, dropped connection, …); only transient failures are retried, capped at 4 attempts
 - Retry strategy with backoff for 500-level errors
 
+**Push notifications (optional)**: every 50,000 images downloaded in a run, `image_install_db.py` calls `send_notification(...)` from `notifications.py`, which posts a message to a Slack channel via an incoming webhook. Without `SLACK_WEBHOOK_URL` set, it logs a one-line warning and silently no-ops — the downloader works either way. See the [`notifications.py`](#notificationspy) section below for the one-time setup.
+
 #### `image_install_db.sh`
 **Purpose**: SCC job submission wrapper for `image_install_db.py`.
 
@@ -300,14 +302,29 @@ python link_check.py
 ```
 
 #### `notifications.py`
-**Purpose**: Send push notifications via Pushover API for long-running job monitoring.
-
-**Setup**:
-1. Create a `.env` file with:
-```
-PUSHOVER_API_TOKEN=your_token_here
-PUSHOVER_USER_KEY=your_user_key_here
-```
+**Purpose**: Post a message to a Slack channel via an incoming webhook, for long-running job monitoring. `image_install_db.py` calls it every 50,000 images successfully downloaded in a run.
+
+**Setup (one-time)**:
+
+1. Open <https://api.slack.com/apps> and click **Create New App** → **From scratch**. Pick the workspace you want notifications in and give the app a name (e.g. *Herbarium downloader*).
+2. In the new app, open **Incoming Webhooks** (left sidebar) and toggle it **On**.
+3. Click **Add New Webhook to Workspace**, pick the channel that should receive the notifications, and authorise.
+4. Copy the resulting webhook URL (looks like `https://hooks.slack.com/services/T…/B…/…`) into a `.env` file in the repo root:
+    ```
+    SLACK_WEBHOOK_URL=https://hooks.slack.com/services/your/webhook/url
+    ```
+    Lock it down so others can't read the URL:
+    ```bash
+    chmod 600 .env
+    ```
+    `.env` is already in `.gitignore`, so the URL won't get committed.
+5. Verify it works:
+    ```bash
+    python -c "from notifications import send_notification; send_notification('Test', 'Slack is working')"
+    ```
+    You should see the message in the channel within a second or two. If you see `SLACK_WEBHOOK_URL not set; skipping notification.` instead, the env var is empty — re-check `.env`.
+
+If `SLACK_WEBHOOK_URL` is not set (for example, there is no `.env`), each notification is skipped with a one-line notice — the downloader runs fine, just without Slack updates. Treat the webhook URL like a secret: anyone with it can post to the channel.
 
 **Function**:
 ```python
diff --git a/notifications.py b/notifications.py
index 8cc01c9..e844e6c 100644
--- a/notifications.py
+++ b/notifications.py
@@ -1,25 +1,29 @@
-import requests
-import json
+"""
+Push notifications via a Slack incoming webhook.
+
+The webhook URL is read from `.env` (SLACK_WEBHOOK_URL). Without it
+configured, send_notification() prints a one-line notice and returns, so the
+caller (e.g. image_install_db.py) keeps working without notifications.
+"""
+
 import os
+import requests
 from dotenv import load_dotenv
 
 load_dotenv()
 
-def send_notification(title, message):
 
-    URL = "https://api.pushover.net/1/messages.json"
-
-    api_token = os.getenv("PUSHOVER_API_TOKEN")
-    user_key = os.getenv("PUSHOVER_USER_KEY")
-
-    if not api_token or not user_key:
-        print("Pushover API token or user key not set")
+def send_notification(title, message):
+    url = os.getenv("SLACK_WEBHOOK_URL")
+    if not url:
+        print("SLACK_WEBHOOK_URL not set; skipping notification.")
         return
-
-    data = {
-        "token": os.getenv("PUSHOVER_API_TOKEN"),
-        "user": os.getenv("PUSHOVER_USER_KEY"),
-        "title": title,
-        "message": message
-    }
-    requests.post(URL, data=data)
+    try:
+        requests.post(
+            url,
+            json={"text": f"*{title}*\n{message}"},
+            timeout=10,
+        )
+    except requests.RequestException as e:
+        # Never let a notification failure interrupt the caller.
+        print(f"Slack notification failed: {e}")

From 7811b9727d62f8157f69965b9869f11a2d61390c Mon Sep 17 00:00:00 2001
From: Thomas Gardos <3973626+trgardos@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:28:47 -0400
Subject: [PATCH 19/19] change slack notification interval to 10000

---
 image_install_db.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/image_install_db.py b/image_install_db.py
index 0603b72..cb3a803 100644
--- a/image_install_db.py
+++ b/image_install_db.py
@@ -604,7 +604,7 @@ def process_id(db, gbif_id, total_to_install):
             with counter_lock:
                 n_installed += 1
                 current = n_installed
-            if current % 50000 == 0:
+            if current % 10000 == 0:
                 send_notification(
                     "Image Installation",
                     f"Installed {current} images this run "