From 92dcffa575e9175c6b7d29cc30cd063c10f724ab Mon Sep 17 00:00:00 2001 From: alexlogs Date: Tue, 12 May 2026 14:34:56 -0400 Subject: [PATCH] =?UTF-8?q?Cecil=20data=20analysis=20with=20xarray-sql=20?= =?UTF-8?q?=E2=80=94=20skill=20+=20tutorial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Structural restructure of #4 (jayendra13/add-cecil-data-analysis-xql-skill). The original PR is a single skill containing 4 runnable Python scripts and 3 reference documents (~1,200 LOC). That shape is closer to a tutorial than a skill — skills are short, focused text loaded into an agent's context at inference time, not multi-file CLI projects. Splitting into two artifacts: - skills/cecil-data-analysis-xql/SKILL.md (~135 lines) Just what an agent needs in-context: the subscription gate (wallet safety), the golden-rule output format, the SQL idioms, the PascalCase quoting rule, vector-vs-raster routing, and failure modes. Links out to the tutorial for everything else. - tutorials/cecil-data-analysis-xql/ ├── README.md full Step 0–5 walkthrough + worked example ├── references/ datasets.md, sdk.md, xarray_sql.md (Jayendra's) └── scripts/ _env.py, list_subscriptions.py, inspect_dataset.py, run_analysis.py (Jayendra's, with three small fixes) Three review fixes applied to the script: 1. run_analysis.py: added --vector flag using load_dataframe + XarrayContext.from_pandas. Previously the runner unconditionally called load_xarray, which would fail on IBAT vector datasets the description says are in scope ("threatened species ranges intersect this AOI"). 2. run_analysis.py: replaced fig.autofmt_xdate() in the bar-chart branch with rotation of categorical xticks. autofmt_xdate is a date-axis helper applied to a non-date axis. 3. README.md: pinned xarray-sql to a sub-0.1 range (pre-1.0 package; the XarrayContext().from_dataset() API is the kind that drifts in 0.x). Also dropped the bash-specific `set -a && source .env && set +a` syntax from the prerequisites — fish/zsh-without-posix users would hit it. If this merges, #4 should close as superseded. Co-Authored-By: jayendra13 Co-Authored-By: Claude Opus 4.7 (1M context) --- skills/cecil-data-analysis-xql/SKILL.md | 123 ++++++++ tutorials/cecil-data-analysis-xql/README.md | 180 +++++++++++ .../references/datasets.md | 78 +++++ .../cecil-data-analysis-xql/references/sdk.md | 78 +++++ .../references/xarray_sql.md | 162 ++++++++++ .../cecil-data-analysis-xql/scripts/_env.py | 27 ++ .../scripts/inspect_dataset.py | 145 +++++++++ .../scripts/list_subscriptions.py | 95 ++++++ .../scripts/run_analysis.py | 289 ++++++++++++++++++ 9 files changed, 1177 insertions(+) create mode 100644 skills/cecil-data-analysis-xql/SKILL.md create mode 100644 tutorials/cecil-data-analysis-xql/README.md create mode 100644 tutorials/cecil-data-analysis-xql/references/datasets.md create mode 100644 tutorials/cecil-data-analysis-xql/references/sdk.md create mode 100644 tutorials/cecil-data-analysis-xql/references/xarray_sql.md create mode 100644 tutorials/cecil-data-analysis-xql/scripts/_env.py create mode 100644 tutorials/cecil-data-analysis-xql/scripts/inspect_dataset.py create mode 100644 tutorials/cecil-data-analysis-xql/scripts/list_subscriptions.py create mode 100644 tutorials/cecil-data-analysis-xql/scripts/run_analysis.py diff --git a/skills/cecil-data-analysis-xql/SKILL.md b/skills/cecil-data-analysis-xql/SKILL.md new file mode 100644 index 0000000..f5edd02 --- /dev/null +++ b/skills/cecil-data-analysis-xql/SKILL.md @@ -0,0 +1,123 @@ +--- +name: cecil-data-analysis-xql +description: Use this skill when the user asks an analysis question about Cecil earth-observation data — forest carbon, biomass, deforestation, land cover, soil moisture, biodiversity, protected areas, ecosystem integrity. The skill writes SQL with xarray-sql (xql) against the user's existing subscription and presents query → result table → plain-English interpretation as one compact block. +license: MIT +--- + +# Cecil data analysis with xarray-sql + +Use this skill for questions like *"how has tree cover changed in my AOI?"* or *"what's the dominant land cover here?"* — anything that wants to **query** an existing Cecil subscription with SQL. + +For setup, installation, and the runnable scripts that drive these queries, see [`tutorials/cecil-data-analysis-xql/`](../../tutorials/cecil-data-analysis-xql/). This SKILL.md covers only what an agent needs to know in-context. + +## Related skills + +- [`subscribe-and-load`](../subscribe-and-load/SKILL.md) — sets up the subscription + AOI that the queries below assume. +- [`land-cover-baseline-and-change`](../land-cover-baseline-and-change/SKILL.md) — the hand-written numpy/xarray counterpart for readers who want to skip the SQL layer. + +## When this skill is in scope + +**Yes:** dominant-class, time-series, change-detection, area, or overlap questions on Cecil rasters (biomass, LULC, soil moisture, BII, FLII) or vector datasets (IBAT family — protected areas, KBA, species ranges). + +**No:** weather forecasts, atmospheric chemistry / GHGs, ocean / marine data, generic Python help, anything without a spatial AOI. Cecil doesn't carry that data — say so plainly and stop. Don't force-fit a dataset. + +## Subscription gate (the rule that protects the wallet) + +**Default: read-only on existing subscriptions.** Run `scripts/list_subscriptions.py` first. If a subscription for the chosen dataset already exists, go straight to the SQL. + +If no matching subscription exists: + +1. Summarise what you'd subscribe to: dataset name, provider, coverage, relevant variables. +2. Show the exact calls you'd make: `client.create_aoi(geometry=...)` (if no AOI) and `client.create_subscription(aoi_id=..., dataset_id=...)`. +3. **Stop and ask the user to confirm** before calling either method. Subscriptions are billed. Get explicit confirmation for *this* dataset *now*, even if the user agreed to a different one earlier in the conversation. + +If the user declines, say so and stop. Don't try to find a free substitute that might silently answer a different question. + +## Golden rule: query → table → insight + +Whenever this skill is in scope, present every analysis as one compact block, in this order: + +1. **SQL** — fenced `sql` block. Applies at every stage: `DESCRIBE`, exploratory queries, final analysis. +2. **Result table** — Markdown, **capped at 10 rows**. If longer, end with *"… N more in result.csv"*. Never dump 200 rows inline. +3. **Interpretation** — two or three sentences of plain-English insight. No header, no "here are the results:" preamble. + +For exploratory queries with nothing to interpret (`DESCRIBE`, etc.), just show 1 + 2. + +## Table naming + +The xarray Dataset is registered under a name derived from the dataset (provider prefix stripped, slugified): + +| Dataset name | Table name | +|---|---| +| Impact Observatory — Land Cover 9-Class | `land_cover_9_class` | +| Chloris — Aboveground Biomass Stock and Change 10m | `aboveground_biomass_stock_and_change_10m` | +| Lobelia Earth — Surface Soil Moisture 30m | `surface_soil_moisture_30m` | + +Use the same name in your SQL (`FROM land_cover_9_class`, not `FROM cecil`). `scripts/run_analysis.py` does the derivation automatically and prints the table name. + +## SQL idioms + +**Always start with `DESCRIBE`** to confirm column names before writing real SQL: + +```sql +DESCRIBE land_cover_9_class; +``` + +**Top class per timestamp** — `ROW_NUMBER OVER`: + +```sql +SELECT time, ref."Name", cnt +FROM ( + SELECT time, land_cover_class, COUNT(*) AS cnt, + ROW_NUMBER() OVER (PARTITION BY time ORDER BY COUNT(*) DESC) AS rn + FROM land_cover_9_class + GROUP BY time, land_cover_class +) r +JOIN ref_land_cover_class ref ON r.land_cover_class = ref."Index" +WHERE rn = 1 +ORDER BY time; +``` + +**Change detection** — `LAG OVER` per pixel: + +```sql +SELECT x, y, time, + land_cover_class AS new_class, + LAG(land_cover_class) OVER (PARTITION BY x, y ORDER BY time) AS prev_class +FROM land_cover_9_class; +``` + +Filter with `WHERE prev_class IS DISTINCT FROM new_class AND prev_class IS NOT NULL` to keep only changed pixels. + +**Time-series aggregation** — plain `GROUP BY time`: + +```sql +SELECT time, AVG(aboveground_biomass) AS mean_biomass +FROM aboveground_biomass_stock_and_change_10m +WHERE aboveground_biomass IS NOT NULL +GROUP BY time +ORDER BY time; +``` + +## Quoting rule (datafusion gotcha) + +Reference-table columns from the API are **PascalCase** (`"Index"`, `"Name"`). **Double-quote them in SQL** — datafusion lower-cases unquoted identifiers and you'll see *"Schema error: no field named index"*. Get this right and joins work. + +## Vector vs raster datasets + +Most Cecil datasets are raster (`load_xarray`). The **IBAT family** (IUCN Red List, KBA, WDPA, STAR) is vector — use `load_dataframe`. `run_analysis.py` accepts `--vector` for these: + +```bash +uv run python scripts/run_analysis.py --vector \ + --subscription --sql 'SELECT * FROM wdpa_protected_areas LIMIT 5' +``` + +The full SDK gotchas (auth, `subscription_id` vs `dataset_id`, AOI geometry rules) live in [`tutorials/cecil-data-analysis-xql/references/sdk.md`](../../tutorials/cecil-data-analysis-xql/references/sdk.md). + +## Failure modes — say so plainly + +- **No subscriptions and no clear dataset match** → say what you looked for, list candidates, stop. +- **SQL fails twice with the same error** → stop and show the user. Often the fix is the `DESCRIBE` you skipped. +- **Question isn't really about Cecil data** → decline. Better than silently producing a confident-sounding irrelevant answer. + +For the full worked end-to-end example (dominant land cover 2020 vs 2023 with chart), see the tutorial. diff --git a/tutorials/cecil-data-analysis-xql/README.md b/tutorials/cecil-data-analysis-xql/README.md new file mode 100644 index 0000000..2120453 --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/README.md @@ -0,0 +1,180 @@ +# Cecil data analysis with xarray-sql — tutorial + +End-to-end walkthrough for querying Cecil earth-observation data with SQL via [xarray-sql](https://github.com/xarray-contrib/xarray-sql) (`xql`). This is the human-facing setup and worked example; the agent-facing condensed version lives in [`skills/cecil-data-analysis-xql/SKILL.md`](../../skills/cecil-data-analysis-xql/SKILL.md). + +``` +tutorials/cecil-data-analysis-xql/ +├── README.md (this file) +├── references/ +│ ├── datasets.md catalog pointers — read first when picking a dataset +│ ├── sdk.md condensed Cecil SDK reference +│ └── xarray_sql.md XarrayContext patterns and SQL gotchas +└── scripts/ + ├── _env.py auto-loads .env so you don't have to + ├── list_subscriptions.py what's already paid for + ├── inspect_dataset.py variables, units, reference_table, constraints + └── run_analysis.py load → register → run SQL → save outputs +``` + +## Step 0 — Prerequisites + +1. **Install Python dependencies** with `uv`: + + ```bash + uv add cecil 'xarray-sql>=0.0.1,<0.1' matplotlib pandas tabulate + ``` + + `xarray-sql` is pre-1.0; pin to a sub-`0.1` range so `XarrayContext().from_dataset()` keeps working when the next minor lands. + +2. **Create a `.env` file** in the working directory with your API key: + + ``` + CECIL_API_KEY=your-key-here + ``` + + The scripts auto-load `.env` via `scripts/_env.py` — no manual `source` needed. + +3. **Verify the key works**. Open a shell with `.env` loaded and run: + + ```bash + uv run python -c "import cecil; cecil.Client().list_datasets(); print('authenticated')" + ``` + + If this returns 401, the key is invalid or expired. Fix and re-run before proceeding. + +## Step 1 — Pick a dataset + +1. Call the live catalog from the SDK: + + ```python + import cecil + for d in cecil.Client().list_datasets(): + print(d.id, d.name, d.categories, + d.spatial_resolution.nominal, d.temporal_coverage.nominal) + ``` + + See [`references/datasets.md`](references/datasets.md) for selection guidance — which dataset answers which question, plus the cross-cutting gotchas. + +2. **See what's already paid for.** Strongly prefer existing subscriptions; they're free at the margin and the data is already provisioned: + + ```bash + uv run python scripts/list_subscriptions.py + ``` + +3. If two datasets are genuinely plausible for the question (e.g. Chloris vs Sylvera for biomass), present 2–3 candidates with one-line tradeoffs and let the user pick. + +4. Once chosen, see its variables, units, and any reference tables you'll need to join: + + ```bash + uv run python scripts/inspect_dataset.py + ``` + +## Step 2 — Subscription gate + +**Default: read-only on existing subscriptions.** If `list_subscriptions.py` already shows the dataset, skip to Step 3. + +If no matching subscription exists, **stop and confirm with the user before creating one** — subscriptions are billed. Print the exact calls you'd make (`client.create_aoi(geometry=...)` then `client.create_subscription(aoi_id=..., dataset_id=...)`) and wait for explicit confirmation. + +## Step 3 — Load and register + +The canonical pattern (wrapped by `scripts/run_analysis.py`): + +```python +import cecil +import pandas as pd +import xarray_sql as xql + +client = cecil.Client() +ds = client.load_xarray(subscription_id) # raster +# ds = client.load_dataframe(subscription_id) # vector (IBAT) + +table_name = ... # derived from ds.attrs["dataset_name"] +ctx = xql.XarrayContext().from_dataset(table_name, ds) + +dataset = client.get_dataset(ds.attrs["dataset_id"]) +for var in dataset.variables: + if var.reference_table: + ctx.from_pandas(pd.DataFrame(var.reference_table), f"ref_{var.name}") +``` + +**Critical gotcha:** `load_xarray` and `load_dataframe` take a `subscription_id`, **not** a `dataset_id`. If you get a 404, you almost certainly passed a dataset id. See [`references/sdk.md`](references/sdk.md) for the other SDK gotchas. + +**Reference tables** turn integer class codes into human names. Without them, a result of "class 7 has 14921 pixels" means nothing; with them it becomes "Built area covers 14921 pixels". + +## Step 4 — Write the SQL + +1. **Always `DESCRIBE` first** to confirm column names. Saves a round-trip vs hallucinating them: + + ```sql + DESCRIBE land_cover_9_class; + ``` + +2. **Quote PascalCase columns** (`"Index"`, `"Name"`) from reference tables — datafusion lower-cases unquoted identifiers and you'll get *"Schema error: no field named index"*. + +3. **Narrow exploratory queries** with `WHERE time = ...` or `LIMIT` — the xarray Dataset is dask-backed and `SELECT *` on a multi-year sub-meter raster pulls a lot of data. + +4. **Idioms** (`ROW_NUMBER`, `LAG`, time-series aggregation) are in [`references/xarray_sql.md`](references/xarray_sql.md). + +## Step 5 — Run, present, save + +```bash +uv run python scripts/run_analysis.py \ + --subscription \ + --sql 'SELECT DISTINCT land_cover_class FROM land_cover_9_class' +``` + +For vector datasets (IBAT — protected areas, KBA, species), add `--vector`: + +```bash +uv run python scripts/run_analysis.py --vector \ + --subscription \ + --sql 'SELECT * FROM wdpa_protected_areas LIMIT 5' +``` + +Outputs land in the current directory (or `--output-dir`): + +- `result.csv` — raw rows +- `result.md` — markdown table + the SQL that was run +- `result.png` — chart, when the result has a plottable shape + +Plus a markdown summary on stdout so the answer is visible immediately. + +**Present the output using the golden rule** (the skill says the same): SQL → table (≤10 rows) → 2-3 sentence interpretation, in one compact block. No filler, no preamble. + +## Worked example end to end + +User: *"For our project AOI, what's the dominant land cover class in 2023 and how does it compare to 2020?"* + +1. `list_subscriptions.py` shows a sub for "Impact Observatory — Land Cover 9-Class". Use it. Table name → `land_cover_9_class`. +2. `inspect_dataset.py ` confirms the variable is `land_cover_class` with a 9-row `reference_table`. +3. Write the SQL, run it via `run_analysis.py`, and present: + + ```sql + WITH yearly AS ( + SELECT + date_part('year', time) AS year, + land_cover_class, + COUNT(*) AS px, + ROW_NUMBER() OVER ( + PARTITION BY date_part('year', time) + ORDER BY COUNT(*) DESC + ) AS rn + FROM land_cover_9_class + WHERE date_part('year', time) IN (2020, 2023) + GROUP BY 1, 2 + ) + SELECT y.year, ref."Name" AS dominant_class, y.px + FROM yearly y + JOIN ref_land_cover_class ref ON y.land_cover_class = ref."Index" + WHERE y.rn = 1 + ORDER BY y.year; + ``` + + | year | dominant_class | px | + |------|----------------|------:| + | 2020 | Forest | 52014 | + | 2023 | Forest | 48831 | + + Forest dominated the AOI in both years (52k pixels in 2020, 49k in 2023). The 6% drop is offset by an increase in built area, suggesting modest expansion at the edges. Chart saved to `result.png`. + +That's the loop. Read references when you need them, run scripts instead of re-implementing them, gate paid actions on explicit confirmation, and present query → data → insight as one block. diff --git a/tutorials/cecil-data-analysis-xql/references/datasets.md b/tutorials/cecil-data-analysis-xql/references/datasets.md new file mode 100644 index 0000000..07d0af7 --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/references/datasets.md @@ -0,0 +1,78 @@ +# Picking a Cecil dataset + +For the live catalog of every published dataset, **call the SDK** — don't +trust a static list (this file used to be one and went stale within weeks): + +```python +import cecil +for d in cecil.Client().list_datasets(): + print(d.id, d.name, d.categories, + d.spatial_resolution.nominal, d.temporal_coverage.nominal) +``` + +Every `Dataset` carries `id, name, type, categories, providers, variables, +description, usage_notes, constraints, pricing, spatial_resolution, +temporal_coverage, …`. This file only adds the editorial layer the SDK +*can't* give you — which dataset to pick for which question — plus the +cross-cutting gotchas that affect SQL. + +## Selection guidance + +**Plant biomass / forest carbon** — Chloris (project-scale: 10 m from 2017+, +30 m from 2000+), Kanop (project monitoring; the Screening tier is the +cheap pre-flight pass), Planet Forest Carbon (Diligence 30 m, Monitoring +3.5 m quarterly), Sylvera (long-term atlas), UMD Hansen (industry standard +for tree loss / gain). + +**Land use & land cover** — Impact Observatory 9-/15-/17-Class +(simplified → detailed → very-high-res; pick by class count + pixel size), +JRC Global Forest Cover / Type (2020 snapshots), USDA Cropland Data Layer +(US-only; 10 m for recent, 30 m for long history), USGS Annual NLCD +(40 years of US LULC change), WRI SBTN Natural Lands (SBTN-aligned, 2020), +WRI Tropical Tree Cover (tropics-only). + +**Soil moisture** — Lobelia Earth 30 m (field-scale), Planet Soil Water +Content 20 m (frequent, very-high-res) or 1 km (regional, history back to +2002). + +**Biodiversity** — IBAT family (IUCN Red List, KBA, WDPA, STAR) for +species / KBA / protected-area overlaps, NHM BII 1 km / 10 km for +biodiversity-intactness scoring. + +**Ecosystem integrity** — WCS Forest Landscape Integrity Index +(landscape fragmentation, 2019 snapshot). + +## Cross-cutting gotchas + +- **LULC integer codes.** `land_cover_class` (and similar) are integer + codes that mean nothing without joining the variable's `reference_table`. + See `xarray_sql.md` for the join pattern. +- **IBAT is vector.** Use `client.load_dataframe(...)`, **not** + `load_xarray`. IBAT datasets are pandas DataFrames, not gridded rasters. +- **US-only datasets.** USDA Cropland Data Layer and USGS NLCD don't cover + non-US AOIs. WRI Tropical Tree Cover is tropics-only. +- **BII history stops in 2021.** If the question is about 2022+, BII + won't help. + +## Quick "what dataset answers what question" map + +| User asks about... | Try... | +|---|---| +| Forest carbon stock or change | Chloris, Kanop, Planet Forest Carbon, Sylvera | +| Tree-cover loss / deforestation | UMD Hansen, JRC Forest Cover | +| Land cover class history | Impact Observatory (global) or USGS NLCD (US-only) | +| Crop type (US) | USDA Cropland Data Layer | +| Tropical forest extent | WRI Tropical Tree Cover, JRC Forest Cover | +| Soil moisture / drought | Lobelia Earth, Planet Soil Water Content | +| Threatened species overlap | IBAT IUCN Red List (vector — `load_dataframe`) | +| Protected area overlap | IBAT WDPA (vector) | +| Key Biodiversity Areas | IBAT KBA (vector) | +| Biodiversity intactness | NHM BII 1km / 10km | +| Forest landscape integrity | WCS Forest Landscape Integrity Index | + +## Out of scope for this skill + +Cecil does not cover weather forecasts, atmospheric chemistry / air +quality, GHG concentrations, or ocean / marine datasets. If a user asks +for one of these, tell them plainly that Cecil doesn't carry that data — +don't force-fit a dataset that doesn't really answer the question. diff --git a/tutorials/cecil-data-analysis-xql/references/sdk.md b/tutorials/cecil-data-analysis-xql/references/sdk.md new file mode 100644 index 0000000..3113237 --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/references/sdk.md @@ -0,0 +1,78 @@ +# Cecil Python SDK gotchas + +The Cecil SDK has **no docstrings** — every method's `__doc__` is `None`. +So: + +- **Signatures**: `inspect.signature(client.method_name)` — discoverable live. +- **Method list**: `dir(client)` — covers subscriptions, AOIs, datasets, + settings, webhooks. + +This file only documents the things `inspect.signature` and `list_datasets` +/ `get_dataset` *can't* tell you — the gotchas you'll trip over. + +## Auth + +```python +import os, cecil +os.environ["CECIL_API_KEY"] = "..." # required +client = cecil.Client() +``` + +There is **no constructor argument for the API key** — env-var only. The +Client raises `SDKError("Environment variable CECIL_API_KEY not set")` if +the variable is missing. + +## Loading data + +```python +ds = client.load_xarray(subscription_id) # raster datasets +df = client.load_dataframe(subscription_id) # vector datasets (IBAT) +``` + +**Critical gotcha:** these take a **`subscription_id`**, *not* a +`dataset_id`. If you get a 404 from either method, you almost certainly +passed a dataset id by mistake. + +- Use `load_xarray` for **raster** datasets — the vast majority (biomass, + LULC, soil moisture, BII, FLII, etc.). +- Use `load_dataframe` for **vector** datasets — currently the IBAT + family (IUCN Red List, KBA, WDPA, STAR). + +The returned xarray Dataset has dims `(time, y, x)` and attrs +`provider_name`, `dataset_name`, `dataset_id`, `aoi_id`, `subscription_id`. +Backing arrays are lazy dask arrays. + +## AOIs + +```python +client.create_aoi(geometry={...}, external_ref=None) +``` + +`geometry` must be **GeoJSON `Polygon` or `MultiPolygon` in EPSG:4326**. +Subject to each dataset's `Constraints.aoi_min_hectares` / +`aoi_max_hectares` / `aoi_geometry_types` / `aoi_max_vertices` — read +those from `client.get_dataset(...).constraints` before creating the AOI. + +## Datasets and reference tables + +```python +ds = client.get_dataset(dataset_id) +for var in ds.variables: + if var.reference_table: # list[dict], may be empty + ... +``` + +`reference_table` is the lookup that turns integer class codes (e.g. an +Impact Observatory Land Cover code of `7`) into human names (e.g. +`"Built area"`). When non-empty, register it as a sidecar pandas table +and JOIN to it in SQL — see `xarray_sql.md`. + +`Subscription` objects carry `id, dataset_id, aoi_id, external_ref, +status, timestamps` — they do **not** include the dataset's variables. +Follow up with `client.get_dataset(sub.dataset_id)` if you need the schema. + +## What's deliberately not covered + +Webhooks, user/org management, and `_load_xarray_v2` (an internal +preview) aren't used by this skill. Read `client.py` directly if you +need them. diff --git a/tutorials/cecil-data-analysis-xql/references/xarray_sql.md b/tutorials/cecil-data-analysis-xql/references/xarray_sql.md new file mode 100644 index 0000000..880c557 --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/references/xarray_sql.md @@ -0,0 +1,162 @@ +# xarray-sql patterns for Cecil data + +`xarray_sql.XarrayContext` is a `datafusion.SessionContext` subclass that +treats an `xarray.Dataset` as a queryable SQL table. Source: +`.venv/lib/python3.14/site-packages/xarray_sql/sql.py`. + +## Setup pattern + +```python +import re, cecil +import pandas as pd +import xarray_sql as xql + +client = cecil.Client() +ds = client.load_xarray(subscription_id) + +# Derive table name from the dataset (e.g. "land_cover_9_class") +raw = ds.attrs.get("dataset_name", "cecil") +if "—" in raw: + raw = raw.split("—", 1)[1] +table_name = re.sub(r"[^a-z0-9]+", "_", raw.lower().strip()).strip("_") + +ctx = xql.XarrayContext().from_dataset(table_name, ds) + +# Register every variable's reference_table as a sidecar SQL table +dataset = client.get_dataset(ds.attrs["dataset_id"]) +for var in dataset.variables: + if var.reference_table: + ctx.from_pandas(pd.DataFrame(var.reference_table), f"ref_{var.name}") +``` + +After this you can `ctx.sql("SELECT ... FROM ")` and join to +`ref_`. Use `ctx.tables()` to inspect what's registered. + +`scripts/run_analysis.py` does all of this automatically and prints the +table name in its output. + +## Inspect first + +Always start by checking what columns exist before writing real SQL. Column +names are not always obvious from the dataset description: + +```python +ctx.sql("DESCRIBE ").to_pandas() +``` + +This is much faster than trial-and-error and avoids hallucinating column +names. Each Cecil xarray Dataset's coords (`x`, `y`, `time`) become SQL +columns, plus one column per data variable. + +## Quoting rules + +- Bareword identifiers like `land_cover_9_class`, `time`, `land_cover_class` + work without quotes. +- Reference-table columns from the API often use **PascalCase** (e.g. + `"Index"`, `"Name"`) — these **must** be double-quoted in SQL because + datafusion lower-cases unquoted identifiers. The demo gets this right: + + ```sql + JOIN ref_table ref ON r.land_cover_class = ref."Index" + ``` + + Get this wrong and you'll see `Schema error: No field named index`. + +## Idioms + +### Top class per timestamp (`ROW_NUMBER OVER`) + +The dominant value of a categorical variable, per time slice. +Example assumes the "Impact Observatory — Land Cover 9-Class" dataset +(table `land_cover_9_class`): + +```sql +SELECT r.time, r.land_cover_class, ref."Name", r.cnt +FROM ( + SELECT + time, + land_cover_class, + COUNT(*) AS cnt, + ROW_NUMBER() OVER (PARTITION BY time ORDER BY COUNT(*) DESC) AS rn + FROM land_cover_9_class + GROUP BY time, land_cover_class +) r +JOIN ref_land_cover_class ref ON r.land_cover_class = ref."Index" +WHERE r.rn = 1 +ORDER BY r.time; +``` + +### Change detection (`LAG OVER`) + +Where did a pixel's class change between consecutive time steps: + +```sql +WITH changes AS ( + SELECT + x, y, time, + land_cover_class AS new_class, + LAG(land_cover_class) OVER (PARTITION BY x, y ORDER BY time) AS prev_class, + LAG(time) OVER (PARTITION BY x, y ORDER BY time) AS prev_time + FROM land_cover_9_class +) +SELECT c.x, c.y, c.prev_time, c.time AS change_time, + old."Name" AS prev_class_name, + new."Name" AS new_class_name +FROM changes c +JOIN ref_land_cover_class old ON c.prev_class = old."Index" +JOIN ref_land_cover_class new ON c.new_class = new."Index" +WHERE c.new_class IS DISTINCT FROM c.prev_class + AND c.prev_class IS NOT NULL +ORDER BY c.time, c.x, c.y; +``` + +### Time series aggregation + +Mean of a continuous variable over the AOI per timestamp. +Example assumes a biomass dataset (table `aboveground_biomass_stock_and_change_10m`): + +```sql +SELECT time, AVG(aboveground_biomass) AS mean_biomass +FROM aboveground_biomass_stock_and_change_10m +WHERE aboveground_biomass IS NOT NULL +GROUP BY time +ORDER BY time; +``` + +### Spatial summary at a single time + +```sql +SELECT x, y, aboveground_biomass +FROM aboveground_biomass_stock_and_change_10m +WHERE time = TIMESTAMP '2023-01-01'; +``` + +## Time filtering and the `cftime` UDF + +For datasets that use **non-Gregorian** calendars (`360_day`, `julian`, etc.), +`from_dataset` automatically registers a `cftime()` scalar UDF so you can +write date strings: + +```sql +SELECT * FROM WHERE time >= cftime('2020-01-01'); +``` + +For ordinary Gregorian time axes (the common case for Cecil rasters), use +`TIMESTAMP '...'` literals or compare against ISO strings — the auto-UDF is +not registered and won't be available. + +**Caveat from the source:** only one `cftime()` UDF is registered per context, +based on the calendar of the *first* non-Gregorian coordinate seen. If you +ever register two datasets with two different non-Gregorian calendars in the +same context, the second one's filters may be wrong. Use a fresh +`XarrayContext` per calendar in that case. + +## Performance notes + +- The xarray Dataset is dask-backed; SQL aggregations stream through chunks, + but `SELECT *` on a multi-year, sub-meter dataset will pull a lot of data. + Always narrow with `WHERE time = ...` or `LIMIT` while exploring. +- `ctx.tables()` lists all registered tables — use it to confirm the table + name if you're unsure. +- Datafusion plan inspection: `ctx.sql(query).explain()` shows the physical + plan if a query is unexpectedly slow. diff --git a/tutorials/cecil-data-analysis-xql/scripts/_env.py b/tutorials/cecil-data-analysis-xql/scripts/_env.py new file mode 100644 index 0000000..b3ace87 --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/scripts/_env.py @@ -0,0 +1,27 @@ +"""Auto-load .env file from the working directory into os.environ. + +Looks for .env in the current directory and walks up to the repo root. +No external dependencies. Imported at the top of each skill script so +the user never has to remember `source .env` — it just works. +""" +from __future__ import annotations + +import os +from pathlib import Path + + +def load_dotenv() -> None: + """Find and load the nearest .env file.""" + cwd = Path.cwd() + for directory in (cwd, *cwd.parents): + env_file = directory / ".env" + if env_file.is_file(): + for line in env_file.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, val = line.partition("=") + key = key.strip() + val = val.strip().strip("'\"") + os.environ.setdefault(key, val) + return diff --git a/tutorials/cecil-data-analysis-xql/scripts/inspect_dataset.py b/tutorials/cecil-data-analysis-xql/scripts/inspect_dataset.py new file mode 100644 index 0000000..653877c --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/scripts/inspect_dataset.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python +"""Inspect a Cecil dataset by ID: variables, units, reference tables, constraints. + +Used by the cecil-data-analysis-xql skill when narrowing down which dataset to use +or before writing SQL against an unfamiliar dataset. Reference tables are +truncated to a preview by default. + +Usage: + uv run python scripts/inspect_dataset.py + uv run python scripts/inspect_dataset.py --json + uv run python scripts/inspect_dataset.py --full-reference-table +""" +from __future__ import annotations + +import argparse +import json +import os +import sys + +import cecil + +from _env import load_dotenv +load_dotenv() + + +REF_TABLE_PREVIEW_ROWS = 10 + + +def _dataset_to_dict(ds, include_full_ref: bool) -> dict: + return { + "id": ds.id, + "name": ds.name, + "type": ds.type, + "crs": ds.crs, + "categories": ds.categories, + "providers": [p.name for p in ds.providers], + "spatial_coverage": ds.spatial_coverage.nominal, + "spatial_resolution": { + "nominal": ds.spatial_resolution.nominal, + "x": ds.spatial_resolution.x, + "y": ds.spatial_resolution.y, + "units": ds.spatial_resolution.units, + }, + "temporal_coverage": ds.temporal_coverage.nominal, + "temporal_resolution": ds.temporal_resolution.nominal, + "constraints": { + "aoi_min_hectares": ds.constraints.aoi_min_hectares, + "aoi_max_hectares": ds.constraints.aoi_max_hectares, + "aoi_min_latitude": ds.constraints.aoi_min_latitude, + "aoi_max_latitude": ds.constraints.aoi_max_latitude, + "aoi_max_vertices": ds.constraints.aoi_max_vertices, + "aoi_geometry_types": ds.constraints.aoi_geometry_types, + "organisation_verified_only": ds.constraints.organisation_verified_only, + }, + "variables": [ + { + "name": v.name, + "type": v.type, + "units": v.units, + "no_data": v.no_data, + "description": v.description, + "usage_notes": v.usage_notes, + "reference_table": ( + v.reference_table + if include_full_ref + else v.reference_table[:REF_TABLE_PREVIEW_ROWS] + ), + "reference_table_total_rows": len(v.reference_table), + } + for v in ds.variables + ], + "description": ds.description, + "usage_notes": ds.usage_notes, + } + + +def _print_text(d: dict) -> None: + print(f"{d['name']} ({d['id']})") + print(f" type: {d['type']} crs: {d['crs']}") + print(f" providers: {', '.join(d['providers']) or '(none)'}") + print(f" categories: {', '.join(d['categories']) or '(none)'}") + print(f" spatial: {d['spatial_coverage']} @ {d['spatial_resolution']['nominal']}") + print(f" temporal: {d['temporal_coverage']} ({d['temporal_resolution']})") + + c = d["constraints"] + aoi_bits = [] + if c["aoi_min_hectares"] is not None: + aoi_bits.append(f"min {c['aoi_min_hectares']} ha") + if c["aoi_max_hectares"] is not None: + aoi_bits.append(f"max {c['aoi_max_hectares']} ha") + if c["aoi_geometry_types"]: + aoi_bits.append("geom: " + ",".join(c["aoi_geometry_types"])) + if aoi_bits: + print(f" aoi limits: {'; '.join(aoi_bits)}") + + print() + print(" variables:") + for v in d["variables"]: + units = f" [{v['units']}]" if v["units"] else "" + print(f" - {v['name']}: {v['type']}{units}") + if v["description"]: + print(f" {' '.join(v['description'])[:200]}") + if v["reference_table_total_rows"]: + print( + f" reference_table: {v['reference_table_total_rows']} rows; preview:" + ) + for row in v["reference_table"]: + print(f" {row}") + + if d["description"]: + print() + print(" description:") + for line in d["description"]: + print(f" {line}") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("dataset_id", help="Cecil dataset UUID") + parser.add_argument("--json", action="store_true", help="emit JSON") + parser.add_argument( + "--full-reference-table", + action="store_true", + help="include all reference_table rows (default: preview only)", + ) + args = parser.parse_args() + + if not os.environ.get("CECIL_API_KEY"): + print("error: CECIL_API_KEY is not set", file=sys.stderr) + return 2 + + client = cecil.Client() + ds = client.get_dataset(args.dataset_id) + payload = _dataset_to_dict(ds, args.full_reference_table) + + if args.json: + json.dump(payload, sys.stdout, indent=2, default=str) + sys.stdout.write("\n") + else: + _print_text(payload) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tutorials/cecil-data-analysis-xql/scripts/list_subscriptions.py b/tutorials/cecil-data-analysis-xql/scripts/list_subscriptions.py new file mode 100644 index 0000000..78f914a --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/scripts/list_subscriptions.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +"""List the user's Cecil subscriptions with the dataset behind each one. + +Used by the cecil-data-analysis-xql skill to figure out which datasets are +already paid for before suggesting analysis. Output is intentionally compact +and machine-readable so the calling model can scan it quickly. + +Usage: + uv run python scripts/list_subscriptions.py # text output + uv run python scripts/list_subscriptions.py --json # JSON output +""" +from __future__ import annotations + +import argparse +import json +import os +import sys + +import cecil + +from _env import load_dotenv +load_dotenv() + + +def _row(sub, ds) -> dict: + return { + "subscription_id": sub.id, + "dataset_id": sub.dataset_id, + "dataset_name": ds.name, + "type": ds.type, + "categories": ds.categories, + "aoi_id": getattr(sub, "aoi_id", None), + "external_ref": getattr(sub, "external_ref", None), + "variables": [ + {"name": v.name, "units": v.units, "categorical": bool(v.reference_table)} + for v in ds.variables + ], + "spatial_resolution": ds.spatial_resolution.nominal, + "temporal_coverage": ds.temporal_coverage.nominal, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--json", action="store_true", help="emit JSON") + parser.add_argument( + "--include-archived", action="store_true", help="include archived subs" + ) + args = parser.parse_args() + + if not os.environ.get("CECIL_API_KEY"): + print("error: CECIL_API_KEY is not set", file=sys.stderr) + return 2 + + client = cecil.Client() + subs = client.list_subscriptions(archived=args.include_archived) + if not subs: + print("no subscriptions found") + return 0 + + rows = [] + for sub in subs: + try: + ds = client.get_dataset(sub.dataset_id) + except Exception as e: # network / auth issues shouldn't kill the whole list + print( + f"warn: could not fetch dataset {sub.dataset_id}: {e}", + file=sys.stderr, + ) + continue + rows.append(_row(sub, ds)) + + if args.json: + json.dump(rows, sys.stdout, indent=2, default=str) + sys.stdout.write("\n") + return 0 + + for r in rows: + print(f"- subscription: {r['subscription_id']}") + print(f" dataset: {r['dataset_name']} ({r['dataset_id']})") + print(f" type: {r['type']} categories: {', '.join(r['categories'])}") + print(f" resolution: {r['spatial_resolution']} coverage: {r['temporal_coverage']}") + if r["aoi_id"]: + print(f" aoi: {r['aoi_id']}") + var_names = [v["name"] for v in r["variables"]] + cat_names = [v["name"] for v in r["variables"] if v["categorical"]] + print(f" variables: {', '.join(var_names) or '(none)'}") + if cat_names: + print(f" categorical: {', '.join(cat_names)}") + print() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tutorials/cecil-data-analysis-xql/scripts/run_analysis.py b/tutorials/cecil-data-analysis-xql/scripts/run_analysis.py new file mode 100644 index 0000000..bdf095a --- /dev/null +++ b/tutorials/cecil-data-analysis-xql/scripts/run_analysis.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python +"""Run an xarray-sql query against a Cecil subscription. + +Loads the subscription's xarray Dataset, registers it under a table name +derived from the dataset name, registers every variable's reference_table +as `ref_`, runs the SQL, and writes: + + - result.csv raw rows + - result.md Markdown table + the SQL that was run + - result.png chart, when the result has a plottable shape + +Also prints a short Markdown summary on stdout so the calling skill can show +the answer immediately without re-reading files. + +The main table is named after the dataset (e.g. ``land_cover_9_class`` +for "Impact Observatory — Land Cover 9-Class"). Pass ``--table-name`` +to override. + +Usage: + uv run python scripts/run_analysis.py \\ + --subscription \\ + --sql 'SELECT DISTINCT land_cover_class FROM land_cover_9_class' + + # Vector datasets (IBAT family — protected areas, KBA, species ranges): + uv run python scripts/run_analysis.py --vector \\ + --subscription \\ + --sql 'SELECT * FROM wdpa_protected_areas LIMIT 5' + + uv run python scripts/run_analysis.py \\ + --subscription \\ + --sql-file query.sql \\ + --output-dir ./out +""" +from __future__ import annotations + +import argparse +import os +import re +import sys +from pathlib import Path + +import cecil +import pandas as pd +import xarray_sql as xql + +from _env import load_dotenv +load_dotenv() + + +def _to_table_name(dataset_name: str) -> str: + """Turn a dataset name like 'Impact Observatory — Land Cover 9-Class' + into a valid SQL identifier like 'land_cover_9_class'.""" + # Strip provider prefix before the em-dash / double-hyphen + if "—" in dataset_name: + dataset_name = dataset_name.split("—", 1)[1] + elif " - " in dataset_name: + dataset_name = dataset_name.split(" - ", 1)[1] + name = dataset_name.lower().strip() + name = re.sub(r"[^a-z0-9]+", "_", name) + return name.strip("_") + + +def _read_sql(args) -> str: + if args.sql and args.sql_file: + raise SystemExit("error: pass either --sql or --sql-file, not both") + if args.sql: + return args.sql + if args.sql_file: + return Path(args.sql_file).read_text() + raise SystemExit("error: --sql or --sql-file is required") + + +def _build_context( + client: cecil.Client, + subscription_id: str, + table_name_override: str | None = None, + vector: bool = False, +): + """Load the subscription, register the dataset table and every reference_table. + + raster (default): load_xarray + XarrayContext.from_dataset + vector (--vector): load_dataframe + XarrayContext.from_pandas + """ + if vector: + df = client.load_dataframe(subscription_id) + dataset_id = df.attrs.get("dataset_id") + dataset_name = df.attrs.get("dataset_name", "") + else: + ds = client.load_xarray(subscription_id) + dataset_id = ds.attrs.get("dataset_id") + dataset_name = ds.attrs.get("dataset_name", "") + + # Derive table name: explicit override > dataset name > fallback + if table_name_override: + tbl = table_name_override + elif dataset_name: + tbl = _to_table_name(dataset_name) + else: + tbl = "cecil" + + ctx = xql.XarrayContext() + if vector: + ctx.from_pandas(df, tbl) + data = df + else: + ctx.from_dataset(tbl, ds) + data = ds + + registered_refs: list[str] = [] + if dataset_id: + try: + dataset = client.get_dataset(dataset_id) + except Exception as e: + print( + f"warn: could not fetch dataset metadata ({e}); reference tables not registered", + file=sys.stderr, + ) + else: + for var in dataset.variables: + if var.reference_table: + ref_name = f"ref_{var.name}" + ctx.from_pandas(pd.DataFrame(var.reference_table), ref_name) + registered_refs.append(ref_name) + return data, ctx, tbl, registered_refs + + +def _save_outputs(df: pd.DataFrame, sql: str, refs: list[str], out_dir: Path) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + + csv_path = out_dir / "result.csv" + df.to_csv(csv_path, index=False) + + md_path = out_dir / "result.md" + with md_path.open("w") as f: + f.write("# Cecil analysis result\n\n") + f.write("## SQL\n\n```sql\n") + f.write(sql.strip()) + f.write("\n```\n\n") + if refs: + f.write(f"Reference tables registered: `{'`, `'.join(refs)}`\n\n") + f.write(f"Rows: **{len(df)}**\n\n") + if len(df) > 0: + preview = df.head(50) + f.write(preview.to_markdown(index=False)) + f.write("\n") + if len(df) > 50: + f.write(f"\n_… {len(df) - 50} more rows in result.csv_\n") + else: + f.write("_(empty result)_\n") + + +def _maybe_chart(df: pd.DataFrame, out_dir: Path) -> Path | None: + """Best-effort chart. Returns the path written, or None if nothing plotted.""" + if df.empty: + return None + + # Lazy import — matplotlib pulls in a lot of state + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + cols = list(df.columns) + numeric_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])] + chart_path = out_dir / "result.png" + + # 1) Time series: a `time`-ish column + at least one numeric measure + time_col = next( + (c for c in cols if c.lower() in {"time", "timestamp", "date", "year"}), None + ) + if time_col and numeric_cols: + measure_cols = [c for c in numeric_cols if c != time_col][:5] + if measure_cols: + fig, ax = plt.subplots(figsize=(8, 4)) + for c in measure_cols: + ax.plot(df[time_col], df[c], marker="o", label=c) + ax.set_xlabel(time_col) + ax.set_ylabel(", ".join(measure_cols)) + ax.legend() + ax.set_title("Cecil time series") + fig.autofmt_xdate() + fig.tight_layout() + fig.savefig(chart_path, dpi=120) + plt.close(fig) + return chart_path + + # 2) Spatial scatter / heatmap: x + y + numeric measure + if "x" in cols and "y" in cols and numeric_cols: + measure = next((c for c in numeric_cols if c not in ("x", "y")), None) + if measure is not None: + fig, ax = plt.subplots(figsize=(6, 5)) + sc = ax.scatter(df["x"], df["y"], c=df[measure], s=4, cmap="viridis") + ax.set_xlabel("x") + ax.set_ylabel("y") + ax.set_title(f"Cecil spatial: {measure}") + fig.colorbar(sc, ax=ax, label=measure) + fig.tight_layout() + fig.savefig(chart_path, dpi=120) + plt.close(fig) + return chart_path + + # 3) Bar chart: a single label column + a single numeric column, small N + if len(cols) == 2 and len(numeric_cols) == 1 and len(df) <= 30: + label_col = next(c for c in cols if c not in numeric_cols) + measure = numeric_cols[0] + fig, ax = plt.subplots(figsize=(8, 4)) + ax.bar(df[label_col].astype(str), df[measure]) + ax.set_xlabel(label_col) + ax.set_ylabel(measure) + ax.set_title(f"Cecil: {measure} by {label_col}") + # Rotate categorical labels so long names don't collide + for label in ax.get_xticklabels(): + label.set_rotation(30) + label.set_horizontalalignment("right") + fig.tight_layout() + fig.savefig(chart_path, dpi=120) + plt.close(fig) + return chart_path + + return None + + +def _print_summary(df: pd.DataFrame, sql: str, table_name: str, chart: Path | None, out_dir: Path) -> None: + print("## Cecil analysis") + print() + print(f"**Table:** `{table_name}`") + print() + print("```sql") + print(sql.strip()) + print("```") + print() + print(f"**Rows:** {len(df)}") + print() + if not df.empty: + preview = df.head(20) + print(preview.to_markdown(index=False)) + if len(df) > 20: + print() + print(f"_…{len(df) - 20} more rows in {out_dir / 'result.csv'}_") + else: + print("_(empty result)_") + print() + print(f"Saved: `{out_dir / 'result.csv'}`, `{out_dir / 'result.md'}`" + + (f", `{chart}`" if chart else " (no chart generated)")) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--subscription", required=True, help="Cecil subscription_id") + parser.add_argument("--sql", help="SQL string to run against the dataset table") + parser.add_argument("--sql-file", help="File containing the SQL to run") + parser.add_argument( + "--table-name", + default=None, + help="SQL table name for the dataset (default: derived from dataset name)", + ) + parser.add_argument( + "--output-dir", + default=".", + help="Directory to write result.csv / result.md / result.png (default: cwd)", + ) + parser.add_argument( + "--vector", + action="store_true", + help="Treat this subscription as vector (use load_dataframe instead of load_xarray)", + ) + args = parser.parse_args() + + if not os.environ.get("CECIL_API_KEY"): + print("error: CECIL_API_KEY is not set", file=sys.stderr) + return 2 + + sql = _read_sql(args) + out_dir = Path(args.output_dir) + + client = cecil.Client() + _, ctx, tbl, refs = _build_context(client, args.subscription, args.table_name, args.vector) + + result = ctx.sql(sql) + df = result.to_pandas() + + _save_outputs(df, sql, refs, out_dir) + chart = _maybe_chart(df, out_dir) + _print_summary(df, sql, tbl, chart, out_dir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())