From 418a055ac870f91b7981fdc276f04c2f03fe3c96 Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 13:28:07 -0400 Subject: [PATCH 01/33] feat(loader): package skeleton, exceptions, lightweight packaging Co-Authored-By: Claude Signed-off-by: mprammer --- pyproject.toml | 59 +++++++++++++++--------------------- raincloud/__init__.py | 15 +++++++++ raincloud/exceptions.py | 34 +++++++++++++++++++++ tests/test_loader_package.py | 23 ++++++++++++++ 4 files changed, 97 insertions(+), 34 deletions(-) create mode 100644 raincloud/__init__.py create mode 100644 raincloud/exceptions.py create mode 100644 tests/test_loader_package.py diff --git a/pyproject.toml b/pyproject.toml index 42fb770..9ad4aa8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "raincloud" -version = "0.1.5" +version = "0.2.0" description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files." readme = "README.md" requires-python = ">=3.11" @@ -9,44 +9,31 @@ license = "Apache-2.0" license-files = ["LICENSE"] dependencies = [ - # Core Arrow / Parquet handling — used by every stage. "pyarrow>=23.0", "numpy>=2.0", - "duckdb>=1.5.0", # VARIANT type requires storage_compatibility_version ≥ v1.5.0 - # Vortex conversion. The package was renamed from `vortex-array` to - # `vortex-data` in the 0.32 → 0.69 move and the API isn't yet stable - # across minor versions, so we cap at the next minor. Loosen once the - # public API (`vxio.write(pa_table, path)` is what we depend on) is - # backed by a stability guarantee. "vortex-data>=0.69.0,<0.70.0", - # Streaming decompression formats surfaced in specific upstreams. - "zstandard>=0.25.0", # lichess .pgn.zst monthly dumps - "py7zr>=1.1.0", # 7z archives - "unlzw3>=0.2.0", # UCI diabetes .tar.Z (Unix compress / LZW) - # Format-specific parsers. - "pandas>=2.0", # xlsx_parse (online-retail-ii) - "openpyxl>=3.1", # pandas xlsx backend - "pyreadstat>=1.3", # sas_xpt_parse (CDC BRFSS .xpt) - "osmium>=4.3", # osm_pbf_split (OSM Germany Geofabrik extract) - "jsonschema>=4.0", # validate_manifest.py — sources.json structural checks + "fsspec>=2024.0", ] [project.optional-dependencies] -# Only needed for fetch.type = "kaggle" (≈9 slugs). Requires ~/.kaggle/kaggle.json -# credentials to be set up once per machine. +s3 = ["s3fs>=2024.0"] +http = ["aiohttp>=3.9"] +duckdb = ["duckdb>=1.5.0"] +pandas = ["pandas>=2.0"] +# Full build pipeline (enables the local-build fallback). Mirrors the +# pre-split heavy dependency set. +build = [ + "duckdb>=1.5.0", "zstandard>=0.25.0", "py7zr>=1.1.0", "unlzw3>=0.2.0", + "pandas>=2.0", "openpyxl>=3.1", "pyreadstat>=1.3", "osmium>=4.3", + "jsonschema>=4.0", +] kaggle = ["kaggle>=2.0"] -# Only needed for fetch.type = "huggingface" (1 slug: dbpedia-embeddings). huggingface = ["huggingface-hub>=0.25"] -# Read-only TUI browser for sources.json — `python -m scripts.pipeline.browse`. tui = ["textual>=0.80"] -# Convenience alias. +dev = ["pytest>=8.0", "ruff>=0.13"] all = [ - "kaggle>=2.0", - "huggingface-hub>=0.25", - "textual>=0.80", + "raincloud[s3,http,duckdb,pandas,build,kaggle,huggingface,tui]", ] -# Test runner + linter — install with `uv sync --extra dev`. -dev = ["pytest>=8.0", "ruff>=0.13"] [tool.ruff] line-length = 120 @@ -64,9 +51,13 @@ select = ["E", "F", "W", "I"] # better as one-liners than split across multiple lines. ignore = ["E501", "E701", "E702"] -[tool.uv] -# Currently consumed in-place from a clone, but the dependency posture -# (loose lower-bound constraints, no committed uv.lock) is set up for -# library publication. Flip `package = false` to `true` and add a build -# backend ([build-system]) when you're ready to ship. -package = false +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["raincloud", "scripts"] + +[tool.hatch.build.targets.wheel.force-include] +"docs/v1/snapshot.json" = "raincloud/_data/snapshot.json" +"sources.json" = "raincloud/_data/sources.json" diff --git a/raincloud/__init__.py b/raincloud/__init__.py new file mode 100644 index 0000000..617607b --- /dev/null +++ b/raincloud/__init__.py @@ -0,0 +1,15 @@ +"""Raincloud loader: datasets-style access to prepared Vortex/Parquet files.""" +from __future__ import annotations + +from .exceptions import ( # noqa: F401 + ArtifactNotFound, + BuildToolingMissing, + ChecksumMismatch, + FormatUnavailable, + MissingDependency, + OfflineMiss, + RaincloudError, + UnknownSlug, +) + +__version__ = "0.2.0" diff --git a/raincloud/exceptions.py b/raincloud/exceptions.py new file mode 100644 index 0000000..0e447c7 --- /dev/null +++ b/raincloud/exceptions.py @@ -0,0 +1,34 @@ +"""Typed error hierarchy for the raincloud loader.""" +from __future__ import annotations + + +class RaincloudError(Exception): + """Base class for all loader errors.""" + + +class UnknownSlug(RaincloudError): + """Requested slug is not in the catalog.""" + + +class FormatUnavailable(RaincloudError): + """Requested format is not available for this slug.""" + + +class ArtifactNotFound(RaincloudError): + """Artifact key was not present at the transport source (clean miss).""" + + +class ChecksumMismatch(RaincloudError): + """Downloaded artifact's sha256 did not match the catalog.""" + + +class BuildToolingMissing(RaincloudError): + """Local build was needed but `raincloud[build]` is not installed.""" + + +class OfflineMiss(RaincloudError): + """Offline mode is on and the artifact is not in the local cache.""" + + +class MissingDependency(RaincloudError): + """An optional convenience dependency (duckdb/pandas) is not installed.""" diff --git a/tests/test_loader_package.py b/tests/test_loader_package.py new file mode 100644 index 0000000..53c7fee --- /dev/null +++ b/tests/test_loader_package.py @@ -0,0 +1,23 @@ +def test_import_and_version(): + import raincloud + assert isinstance(raincloud.__version__, str) + assert raincloud.__version__ # non-empty + # top-level re-exports are present and wired to the hierarchy + from raincloud import RaincloudError, UnknownSlug + assert issubclass(UnknownSlug, RaincloudError) + + +def test_exceptions_hierarchy(): + from raincloud.exceptions import ( + ArtifactNotFound, + BuildToolingMissing, + ChecksumMismatch, + FormatUnavailable, + MissingDependency, + OfflineMiss, + RaincloudError, + UnknownSlug, + ) + for exc in (UnknownSlug, FormatUnavailable, ArtifactNotFound, ChecksumMismatch, + BuildToolingMissing, OfflineMiss, MissingDependency): + assert issubclass(exc, RaincloudError) From 3a7d57ef38ae1bf3f0682a857dd3e262ba110345 Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 13:37:04 -0400 Subject: [PATCH 02/33] feat(loader): catalog module reading snapshot + manifest Co-Authored-By: Claude Signed-off-by: mprammer --- raincloud/_catalog.py | 105 +++++++++++++++++++++++++++++++++++ tests/test_loader_catalog.py | 70 +++++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 raincloud/_catalog.py create mode 100644 tests/test_loader_catalog.py diff --git a/raincloud/_catalog.py b/raincloud/_catalog.py new file mode 100644 index 0000000..2d0edca --- /dev/null +++ b/raincloud/_catalog.py @@ -0,0 +1,105 @@ +"""Catalog: per-slug metadata + checksums from the shipped snapshot + manifest.""" +from __future__ import annotations + +import json +import os +from dataclasses import dataclass, field +from functools import lru_cache +from importlib import resources +from pathlib import Path + +from .exceptions import UnknownSlug + + +def _repo_root() -> Path: + # raincloud/_catalog.py -> repo root is two parents up in a source checkout. + return Path(__file__).resolve().parent.parent + + +def _data_file(kind: str) -> Path: + """Locate a data file. kind in {"snapshot", "manifest"}. + + Precedence: env override -> wheel-packaged copy -> repo source fallback. + """ + env = {"snapshot": "RAINCLOUD_SNAPSHOT", "manifest": "RAINCLOUD_MANIFEST"}[kind] + override = os.environ.get(env) + if override: + return Path(override) + packaged_name = {"snapshot": "snapshot.json", "manifest": "sources.json"}[kind] + try: + p = resources.files("raincloud").joinpath("_data", packaged_name) + if p.is_file(): + return Path(str(p)) + except FileNotFoundError: + pass + repo_name = {"snapshot": "docs/v1/snapshot.json", "manifest": "sources.json"}[kind] + return _repo_root() / repo_name + + +@dataclass(frozen=True) +class FormatInfo: + sha256: str | None + nbytes: int | None + + +@dataclass(frozen=True) +class Entry: + slug: str + rows: int | None + columns: list[dict] = field(default_factory=list) + formats: dict[str, FormatInfo] = field(default_factory=dict) + info: dict = field(default_factory=dict) + + @property + def column_names(self) -> list[str]: + return [c["name"] for c in self.columns] + + +class Catalog: + def __init__(self, snapshot: dict, manifest: dict): + self._slugs = snapshot.get("slugs", {}) + self._specs = {d["slug"]: d for d in manifest.get("datasets", [])} + + def __contains__(self, slug: str) -> bool: + return slug in self._slugs or slug in self._specs + + def slugs(self) -> list[str]: + return sorted(set(self._slugs) | set(self._specs)) + + def entry(self, slug: str) -> Entry: + if slug not in self: + raise UnknownSlug(slug) + snap = self._slugs.get(slug, {}) + spec = self._specs.get(slug, {}) + formats: dict[str, FormatInfo] = {} + for fmt in ("parquet", "vortex"): + nbytes = snap.get(f"{fmt}_bytes") + if nbytes is None: + continue + formats[fmt] = FormatInfo(sha256=snap.get(f"{fmt}_sha256"), nbytes=nbytes) + lic = spec.get("license", {}) or {} + urls = (spec.get("fetch", {}) or {}).get("urls") or [] + info = { + "short_name": spec.get("short_name"), + "full_name": spec.get("full_name"), + "description": spec.get("description"), + "license": lic, + "source_url": urls[0] if urls else lic.get("source_url"), + } + return Entry( + slug=slug, + # Prefer last_built_rows, fall back to expected_rows. Truthiness + # means rows==0 falls through to expected_rows — benign: no slug + # has last_built_rows==0. + rows=snap.get("last_built_rows") or snap.get("expected_rows"), + columns=snap.get("columns") or [], + formats=formats, + info=info, + ) + + +@lru_cache(maxsize=1) +def load_catalog() -> Catalog: + snapshot = json.loads(_data_file("snapshot").read_text()) + manifest = json.loads(_data_file("manifest").read_text()) + return Catalog(snapshot, manifest) diff --git a/tests/test_loader_catalog.py b/tests/test_loader_catalog.py new file mode 100644 index 0000000..2ccfd7c --- /dev/null +++ b/tests/test_loader_catalog.py @@ -0,0 +1,70 @@ +import json + +import pytest + + +@pytest.fixture +def fake_catalog(tmp_path, monkeypatch): + snapshot = { + "schema_version": 1, + "slugs": { + "tiny": { + "expected_rows": 3, "last_built_rows": 3, + "parquet_bytes": 100, "vortex_bytes": 120, + "parquet_sha256": "aa", "vortex_sha256": "bb", + "columns": [{"name": "x", "type": "int64"}], + }, + "pq_only": { + "expected_rows": 5, "last_built_rows": 5, + "parquet_bytes": 50, "vortex_bytes": None, + "parquet_sha256": "cc", "vortex_sha256": None, + "columns": [{"name": "y", "type": "string"}], + }, + }, + } + manifest = {"schema_version": 1, "datasets": [ + {"slug": "tiny", "short_name": "Tiny", "full_name": "Tiny set", + "description": "a tiny set", + "license": {"spdx": "CC0-1.0", "source_url": "http://x", + "redistribution_permitted": True, "attribution_required": False}, + "fetch": {"urls": ["http://src/tiny.csv"]}}, + {"slug": "pq_only", "short_name": "PQ", "full_name": "PQ only", + "description": "p", "license": {"spdx": "MIT", "source_url": "http://y"}, + "fetch": {"urls": ["http://src/pq.csv"]}}, + ]} + sp = tmp_path / "snapshot.json"; sp.write_text(json.dumps(snapshot)) + mp = tmp_path / "sources.json"; mp.write_text(json.dumps(manifest)) + monkeypatch.setenv("RAINCLOUD_SNAPSHOT", str(sp)) + monkeypatch.setenv("RAINCLOUD_MANIFEST", str(mp)) + from raincloud import _catalog + _catalog.load_catalog.cache_clear() + yield _catalog.load_catalog() + _catalog.load_catalog.cache_clear() + + +def test_entry_formats_and_checksums(fake_catalog): + e = fake_catalog.entry("tiny") + assert e.rows == 3 + assert set(e.formats) == {"parquet", "vortex"} + assert e.formats["vortex"].sha256 == "bb" + assert e.formats["parquet"].nbytes == 100 + + +def test_format_availability_excludes_missing(fake_catalog): + e = fake_catalog.entry("pq_only") + assert set(e.formats) == {"parquet"} # vortex_bytes was None + + +def test_info_fields(fake_catalog): + e = fake_catalog.entry("tiny") + assert e.info["short_name"] == "Tiny" + assert e.info["license"]["spdx"] == "CC0-1.0" + assert e.info["source_url"] == "http://src/tiny.csv" + assert e.column_names == ["x"] + + +def test_unknown_slug(fake_catalog): + from raincloud.exceptions import UnknownSlug + assert "nope" not in fake_catalog + with pytest.raises(UnknownSlug): + fake_catalog.entry("nope") From aee0652428b79c8e313fa9ac89b18ed1613a2385 Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 13:44:57 -0400 Subject: [PATCH 03/33] feat(loader): checksum-gated local cache with atomic adopt Co-Authored-By: Claude Signed-off-by: mprammer --- raincloud/_cache.py | 64 ++++++++++++++++++++++++++++++++++++++ tests/test_loader_cache.py | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 raincloud/_cache.py create mode 100644 tests/test_loader_cache.py diff --git a/raincloud/_cache.py b/raincloud/_cache.py new file mode 100644 index 0000000..3d9a4ba --- /dev/null +++ b/raincloud/_cache.py @@ -0,0 +1,64 @@ +"""Local artifact cache: paths, sha256 verification, atomic adoption.""" +from __future__ import annotations + +import hashlib +import os +from pathlib import Path + +from .exceptions import ChecksumMismatch + +# Format -> on-disk file extension. Identity for today's two formats, but kept +# as a map so a future format whose extension differs from its name (e.g. a +# "parquet-hydrated" tier -> "parquet") slots in without touching call sites. +EXT = {"parquet": "parquet", "vortex": "vortex"} + +_TRUTHY = {"1", "true", "yes", "on"} + + +def cache_root() -> Path: + env = os.environ.get("RAINCLOUD_CACHE") + if env: + return Path(env) + xdg = os.environ.get("XDG_CACHE_HOME") + base = Path(xdg) if xdg else Path.home() / ".cache" + return base / "raincloud" + + +def cache_path(slug: str, fmt: str) -> Path: + return cache_root() / "v1" / slug / fmt / f"{slug}.{EXT[fmt]}" + + +def sha256_file(path: Path) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): # 1 MiB chunks + h.update(chunk) + return h.hexdigest() + + +def is_offline() -> bool: + return os.environ.get("RAINCLOUD_OFFLINE", "").lower() in _TRUTHY + + +def verify(path: Path, expected_sha256: str) -> None: + actual = sha256_file(path) + if actual != expected_sha256: + raise ChecksumMismatch( + f"{path}: expected sha256 {expected_sha256}, got {actual}" + ) + + +def adopt(tmp: Path, dest: Path, expected_sha256: str | None) -> Path: + """Verify (if a checksum is known) then atomically move tmp -> dest. + + On any failure, the tmp file is removed and dest is left untouched. + """ + try: + if expected_sha256 is not None: + verify(tmp, expected_sha256) + dest.parent.mkdir(parents=True, exist_ok=True) + os.replace(tmp, dest) # atomic within a filesystem + return dest + finally: + if tmp.exists(): + tmp.unlink() diff --git a/tests/test_loader_cache.py b/tests/test_loader_cache.py new file mode 100644 index 0000000..47ad0e7 --- /dev/null +++ b/tests/test_loader_cache.py @@ -0,0 +1,61 @@ +import hashlib + +import pytest + + +def _sha(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +def test_cache_path_honors_env(tmp_path, monkeypatch): + monkeypatch.setenv("RAINCLOUD_CACHE", str(tmp_path)) + from raincloud import _cache + p = _cache.cache_path("foo", "vortex") + assert p == tmp_path / "v1" / "foo" / "vortex" / "foo.vortex" + + +def test_sha256_file(tmp_path): + from raincloud import _cache + f = tmp_path / "a.bin"; f.write_bytes(b"hello") + assert _cache.sha256_file(f) == _sha(b"hello") + + +def test_adopt_verifies_and_renames(tmp_path): + from raincloud import _cache + src = tmp_path / "t.part"; src.write_bytes(b"data") + dest = tmp_path / "v1" / "s" / "parquet" / "s.parquet" + out = _cache.adopt(src, dest, _sha(b"data")) + assert out == dest and dest.read_bytes() == b"data" + assert not src.exists() + + +def test_adopt_mismatch_raises_and_cleans(tmp_path): + from raincloud import _cache + from raincloud.exceptions import ChecksumMismatch + src = tmp_path / "t.part"; src.write_bytes(b"data") + dest = tmp_path / "out.parquet" + with pytest.raises(ChecksumMismatch): + _cache.adopt(src, dest, "deadbeef") + assert not src.exists() and not dest.exists() + + +def test_is_offline(monkeypatch): + from raincloud import _cache + monkeypatch.setenv("RAINCLOUD_OFFLINE", "1") + assert _cache.is_offline() is True + monkeypatch.setenv("RAINCLOUD_OFFLINE", "0") + assert _cache.is_offline() is False + monkeypatch.setenv("RAINCLOUD_OFFLINE", "TRUE") + assert _cache.is_offline() is True + monkeypatch.delenv("RAINCLOUD_OFFLINE", raising=False) + assert _cache.is_offline() is False + + +def test_adopt_none_checksum_skips_verification(tmp_path): + from raincloud import _cache + src = tmp_path / "t.part"; src.write_bytes(b"locally-built") + dest = tmp_path / "v1" / "s" / "vortex" / "s.vortex" + # None checksum => trusted local build, no verification, must still adopt. + out = _cache.adopt(src, dest, None) + assert out == dest and dest.read_bytes() == b"locally-built" + assert not src.exists() From 1a6c0bef72fb7f0de3d536fbf16b84220f21e6c1 Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 13:50:12 -0400 Subject: [PATCH 04/33] feat(loader): fsspec transport-only fetch Co-Authored-By: Claude Signed-off-by: mprammer --- raincloud/_transport.py | 31 +++++++++++++++++++++++++++++++ tests/test_loader_transport.py | 18 ++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 raincloud/_transport.py create mode 100644 tests/test_loader_transport.py diff --git a/raincloud/_transport.py b/raincloud/_transport.py new file mode 100644 index 0000000..38ef1fa --- /dev/null +++ b/raincloud/_transport.py @@ -0,0 +1,31 @@ +"""fsspec transport-only: copy a remote URL to a local temp file.""" +from __future__ import annotations + +from pathlib import Path + +import fsspec + +from .exceptions import ArtifactNotFound + + +def fetch(url: str, dest: Path) -> None: + """Stream the object at `url` into `dest`. Raise ArtifactNotFound on a miss. + + `url` is any fsspec-understood URL (file://, s3://, https://, ...). The + matching backend extra (raincloud[s3]/[http]) must be installed for + non-local schemes; fsspec raises ImportError otherwise, which we let + propagate as an actionable message. + """ + dest.parent.mkdir(parents=True, exist_ok=True) + try: + with fsspec.open(url, "rb") as src, open(dest, "wb") as out: + for chunk in iter(lambda: src.read(1024 * 1024), b""): + out.write(chunk) + except FileNotFoundError as e: + if dest.exists(): + dest.unlink() + raise ArtifactNotFound(url) from e + except Exception: + if dest.exists(): + dest.unlink() + raise diff --git a/tests/test_loader_transport.py b/tests/test_loader_transport.py new file mode 100644 index 0000000..3ac047c --- /dev/null +++ b/tests/test_loader_transport.py @@ -0,0 +1,18 @@ +import pytest + + +def test_fetch_copies_bytes(tmp_path): + from raincloud import _transport + src = tmp_path / "remote.bin"; src.write_bytes(b"payload") + dest = tmp_path / "local.part" + _transport.fetch(f"file://{src}", dest) + assert dest.read_bytes() == b"payload" + + +def test_fetch_missing_raises_artifact_not_found(tmp_path): + from raincloud import _transport + from raincloud.exceptions import ArtifactNotFound + dest = tmp_path / "local.part" + with pytest.raises(ArtifactNotFound): + _transport.fetch(f"file://{tmp_path}/does-not-exist.bin", dest) + assert not dest.exists() # cleanup contract: no partial file left on a miss From 04abeed52307110dc04839fd5a65b9e5eeb15a2d Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 13:55:42 -0400 Subject: [PATCH 05/33] feat(loader): cache->mirror->build resolver Co-Authored-By: Claude Signed-off-by: mprammer --- raincloud/_resolve.py | 94 +++++++++++++++++++++++++++++ tests/test_loader_resolve.py | 111 +++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 raincloud/_resolve.py create mode 100644 tests/test_loader_resolve.py diff --git a/raincloud/_resolve.py b/raincloud/_resolve.py new file mode 100644 index 0000000..c477742 --- /dev/null +++ b/raincloud/_resolve.py @@ -0,0 +1,94 @@ +"""Resolution order: local cache -> mirror -> local build.""" +from __future__ import annotations + +import importlib.util +import os +import shutil +import subprocess +import sys +from pathlib import Path + +from . import _cache, _transport +from ._catalog import load_catalog +from .exceptions import ( + ArtifactNotFound, + BuildToolingMissing, + FormatUnavailable, + OfflineMiss, +) + + +def artifact_key(slug: str, fmt: str) -> str: + return f"v1/{slug}/{fmt}/{slug}.{_cache.EXT[fmt]}" + + +def _mirror_base(mirror: str | None) -> str | None: + base = mirror if mirror is not None else os.environ.get("RAINCLOUD_MIRROR") + return base.rstrip("/") if base else None + + +def _build_available() -> bool: + return importlib.util.find_spec("scripts.pipeline.build") is not None + + +def resolve( + slug: str, + fmt: str, + *, + mirror: str | None = None, + offline: bool | None = None, + allow_build: bool = True, +) -> Path: + cat = load_catalog() + entry = cat.entry(slug) # raises UnknownSlug + if fmt not in entry.formats: + raise FormatUnavailable( + f"{slug}: format {fmt!r} not available; have {sorted(entry.formats)}" + ) + dest = _cache.cache_path(slug, fmt) + expected = entry.formats[fmt].sha256 + + # 1) cache hit + if dest.exists() and (expected is None or _cache.sha256_file(dest) == expected): + return dest + + is_offline = _cache.is_offline() if offline is None else offline + if is_offline: + raise OfflineMiss(f"{slug}/{fmt} not cached and offline mode is on") + + # 2) mirror + base = _mirror_base(mirror) + if base is not None: + url = f"{base}/{artifact_key(slug, fmt)}" + tmp = dest.parent / f".{dest.name}.part" + dest.parent.mkdir(parents=True, exist_ok=True) + try: + _transport.fetch(url, tmp) + return _cache.adopt(tmp, dest, expected) + except ArtifactNotFound: + pass # fall through to build + + # 3) local build + if allow_build and _build_available(): + subprocess.run( + [sys.executable, "-m", "scripts.pipeline.build", slug], check=True + ) + from scripts.pipeline.spec import output_format_dir # type: ignore + + built = output_format_dir(slug, fmt) / f"{slug}.{_cache.EXT[fmt]}" + if not built.exists(): + raise ArtifactNotFound(f"build produced no {fmt} for {slug}") + tmp = dest.parent / f".{dest.name}.part" + dest.parent.mkdir(parents=True, exist_ok=True) + try: + shutil.copyfile(built, tmp) + return _cache.adopt(tmp, dest, None) # locally built: trusted + except Exception: + if tmp.exists(): + tmp.unlink() + raise + + raise BuildToolingMissing( + f"{slug}/{fmt} not cached and not in mirror; " + f"install `raincloud[build]` or set RAINCLOUD_MIRROR" + ) diff --git a/tests/test_loader_resolve.py b/tests/test_loader_resolve.py new file mode 100644 index 0000000..64c48d7 --- /dev/null +++ b/tests/test_loader_resolve.py @@ -0,0 +1,111 @@ +import hashlib +import json + +import pytest + + +def _sha(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +@pytest.fixture +def env(tmp_path, monkeypatch): + """A file:// mirror with one artifact + a catalog describing it.""" + payload = b"PARQUETBYTES" + mirror = tmp_path / "mirror" + key = mirror / "v1" / "tiny" / "parquet" / "tiny.parquet" + key.parent.mkdir(parents=True) + key.write_bytes(payload) + snapshot = { + "schema_version": 1, + "slugs": { + "tiny": { + "expected_rows": 3, + "last_built_rows": 3, + "parquet_bytes": len(payload), + "vortex_bytes": None, + "parquet_sha256": _sha(payload), + "vortex_sha256": None, + "columns": [{"name": "x", "type": "int64"}], + } + }, + } + manifest = { + "schema_version": 1, + "datasets": [ + {"slug": "tiny", "short_name": "T", "license": {}, "fetch": {"urls": []}} + ], + } + sp = tmp_path / "snapshot.json" + sp.write_text(json.dumps(snapshot)) + mp = tmp_path / "sources.json" + mp.write_text(json.dumps(manifest)) + monkeypatch.setenv("RAINCLOUD_SNAPSHOT", str(sp)) + monkeypatch.setenv("RAINCLOUD_MANIFEST", str(mp)) + monkeypatch.setenv("RAINCLOUD_CACHE", str(tmp_path / "cache")) + monkeypatch.setenv("RAINCLOUD_MIRROR", f"file://{mirror}") + monkeypatch.delenv("RAINCLOUD_OFFLINE", raising=False) + from raincloud import _catalog + + _catalog.load_catalog.cache_clear() + yield {"payload": payload, "mirror": mirror, "tmp": tmp_path} + _catalog.load_catalog.cache_clear() + + +def test_artifact_key(): + from raincloud import _resolve + + assert _resolve.artifact_key("tiny", "parquet") == "v1/tiny/parquet/tiny.parquet" + + +def test_resolve_from_mirror_then_cache(env): + from raincloud import _cache, _resolve + + p = _resolve.resolve("tiny", "parquet") + assert p == _cache.cache_path("tiny", "parquet") + assert p.read_bytes() == env["payload"] + # second call is a pure cache hit (corrupt the mirror to prove no refetch) + (env["mirror"] / "v1" / "tiny" / "parquet" / "tiny.parquet").write_bytes(b"X") + assert _resolve.resolve("tiny", "parquet").read_bytes() == env["payload"] + + +def test_resolve_checksum_mismatch(env): + from raincloud import _resolve + from raincloud.exceptions import ChecksumMismatch + + (env["mirror"] / "v1" / "tiny" / "parquet" / "tiny.parquet").write_bytes(b"corrupt") + with pytest.raises(ChecksumMismatch): + _resolve.resolve("tiny", "parquet") + + +def test_resolve_offline_miss(env, monkeypatch): + from raincloud import _resolve + from raincloud.exceptions import OfflineMiss + + monkeypatch.setenv("RAINCLOUD_OFFLINE", "1") + with pytest.raises(OfflineMiss): + _resolve.resolve("tiny", "parquet") + + +def test_resolve_mirror_miss_no_build(env, monkeypatch): + from raincloud import _resolve + from raincloud.exceptions import BuildToolingMissing + + # point mirror somewhere empty; disable build + monkeypatch.setenv("RAINCLOUD_MIRROR", f"file://{env['tmp']}/empty") + monkeypatch.setattr(_resolve, "_build_available", lambda: False) + with pytest.raises(BuildToolingMissing): + _resolve.resolve("tiny", "parquet", allow_build=True) + + +def test_resolve_propagates_non_notfound_transport_error(env, monkeypatch): + from raincloud import _resolve, _transport + + def boom(url, dest): + raise PermissionError("403 denied") + + monkeypatch.setattr(_transport, "fetch", boom) + # A transport error that is NOT a clean miss must propagate, not silently + # fall through to a (potentially multi-hour) local build. + with pytest.raises(PermissionError): + _resolve.resolve("tiny", "parquet") From f9a0189af936520fa22b5b2882f0d19255f18df3 Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 14:03:44 -0400 Subject: [PATCH 06/33] feat(loader): lazy Dataset handle and load()/load_dataset Co-Authored-By: Claude Signed-off-by: mprammer --- raincloud/__init__.py | 117 ++++++++++++++++++++++++++++++++++++++ tests/test_loader_load.py | 94 ++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 tests/test_loader_load.py diff --git a/raincloud/__init__.py b/raincloud/__init__.py index 617607b..5ba0685 100644 --- a/raincloud/__init__.py +++ b/raincloud/__init__.py @@ -1,6 +1,10 @@ """Raincloud loader: datasets-style access to prepared Vortex/Parquet files.""" from __future__ import annotations +from pathlib import Path + +from . import _resolve +from ._catalog import load_catalog from .exceptions import ( # noqa: F401 ArtifactNotFound, BuildToolingMissing, @@ -13,3 +17,116 @@ ) __version__ = "0.2.0" + +_DEFAULT_FORMAT = "vortex" + + +class Dataset: + """Lazy handle to a prepared artifact. Nothing is fetched until you ask.""" + + def __init__(self, slug: str, fmt: str, *, mirror: str | None, + offline: bool | None): + self.slug = slug + self.format = fmt + self._mirror = mirror + self._offline = offline + self._entry = load_catalog().entry(slug) + + def __repr__(self) -> str: + return f"Dataset(slug={self.slug!r}, format={self.format!r})" + + # --- cheap metadata (no I/O beyond the in-memory catalog) --- + @property + def num_rows(self) -> int | None: + return self._entry.rows + + @property + def column_names(self) -> list[str]: + return self._entry.column_names + + @property + def info(self) -> dict: + return self._entry.info + + # --- resolution --- + def path(self) -> Path: + return self.path_for(self.format) + + def path_for(self, fmt: str) -> Path: + return _resolve.resolve(self.slug, fmt, mirror=self._mirror, + offline=self._offline) + + # --- materialization --- + def to_arrow(self): + import pyarrow.parquet as pq + if self.format == "parquet": + return pq.read_table(self.path()) + import vortex + return vortex.open(str(self.path())).to_arrow().read_all() + + def to_vortex(self): + import vortex + return vortex.open(str(self.path_for("vortex"))) + + @property + def schema(self): + import pyarrow.parquet as pq + if self.format == "parquet": + return pq.read_schema(self.path()) # footer-only, cheap + import vortex + # vortex path must open the file (heavier than the parquet footer read) + return vortex.open(str(self.path())).to_arrow().schema + + def scan(self): + """Return a DuckDB relation over the dataset. + + Always reads the parquet artifact (DuckDB has no native Vortex + reader), resolving the parquet sibling even when this handle's + format is vortex. + """ + try: + import duckdb + except ImportError as e: + raise MissingDependency( + "scan() needs DuckDB — install `raincloud[duckdb]`" + ) from e + pq_path = self.path_for("parquet") + return duckdb.connect().read_parquet(str(pq_path)) + + def to_pandas(self): + try: + import pandas # noqa: F401 + except ImportError as e: + raise MissingDependency( + "to_pandas() needs pandas — install `raincloud[pandas]`" + ) from e + return self.to_arrow().to_pandas() + + +def load(slug: str, *, format: str = _DEFAULT_FORMAT, + offline: bool | None = None, mirror: str | None = None) -> Dataset: + """Return a lazy Dataset handle for `slug`. + + `format` defaults to "vortex" and falls back to "parquet" when the slug + has no vortex artifact. Raises UnknownSlug for an unknown slug and + FormatUnavailable when neither the requested nor a fallback format exists. + """ + entry = load_catalog().entry(slug) # raises UnknownSlug + fmt = format + if fmt not in entry.formats: + if fmt == "vortex" and "parquet" in entry.formats: + fmt = "parquet" + else: + raise FormatUnavailable( + f"{slug}: format {format!r} unavailable; have {sorted(entry.formats)}" + ) + return Dataset(slug, fmt, mirror=mirror, offline=offline) + + +load_dataset = load # datasets-muscle-memory alias + +__all__ = [ + "load", "load_dataset", "Dataset", "__version__", + "RaincloudError", "UnknownSlug", "FormatUnavailable", "ArtifactNotFound", + "ChecksumMismatch", "BuildToolingMissing", "OfflineMiss", "MissingDependency", +] diff --git a/tests/test_loader_load.py b/tests/test_loader_load.py new file mode 100644 index 0000000..03fda40 --- /dev/null +++ b/tests/test_loader_load.py @@ -0,0 +1,94 @@ +import hashlib +import json + +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + + +def _sha(p) -> str: + return hashlib.sha256(p.read_bytes()).hexdigest() + + +@pytest.fixture +def loaded(tmp_path, monkeypatch): + table = pa.table({"x": [1, 2, 3]}) + mirror = tmp_path / "mirror" + pqkey = mirror / "v1" / "tiny" / "parquet" / "tiny.parquet" + pqkey.parent.mkdir(parents=True); pq.write_table(table, pqkey) + snapshot = {"schema_version": 1, "slugs": {"tiny": { + "expected_rows": 3, "last_built_rows": 3, + "parquet_bytes": pqkey.stat().st_size, "vortex_bytes": None, + "parquet_sha256": _sha(pqkey), "vortex_sha256": None, + "columns": [{"name": "x", "type": "int64"}]}}} + manifest = {"schema_version": 1, "datasets": [{"slug": "tiny", + "short_name": "Tiny", "full_name": "Tiny", "description": "d", + "license": {"spdx": "CC0-1.0"}, "fetch": {"urls": ["http://s"]}}]} + (tmp_path / "snapshot.json").write_text(json.dumps(snapshot)) + (tmp_path / "sources.json").write_text(json.dumps(manifest)) + monkeypatch.setenv("RAINCLOUD_SNAPSHOT", str(tmp_path / "snapshot.json")) + monkeypatch.setenv("RAINCLOUD_MANIFEST", str(tmp_path / "sources.json")) + monkeypatch.setenv("RAINCLOUD_CACHE", str(tmp_path / "cache")) + monkeypatch.setenv("RAINCLOUD_MIRROR", f"file://{mirror}") + from raincloud import _catalog + _catalog.load_catalog.cache_clear() + yield + _catalog.load_catalog.cache_clear() + + +def test_load_metadata_is_cheap(loaded): + import raincloud + ds = raincloud.load("tiny", format="parquet") + assert ds.num_rows == 3 + assert ds.column_names == ["x"] + assert ds.info["license"]["spdx"] == "CC0-1.0" + + +def test_load_to_arrow(loaded): + import raincloud + ds = raincloud.load("tiny", format="parquet") + tbl = ds.to_arrow() + assert tbl.num_rows == 3 and tbl.column_names == ["x"] + assert ds.path().exists() + + +def test_default_format_falls_back_to_parquet(loaded): + import raincloud + ds = raincloud.load("tiny") # default vortex, but only parquet exists + assert ds.format == "parquet" + + +def test_unknown_slug_raises(loaded): + import raincloud + from raincloud.exceptions import UnknownSlug + with pytest.raises(UnknownSlug): + raincloud.load("nope") + + +def test_load_dataset_alias(loaded): + import raincloud + assert raincloud.load_dataset is raincloud.load + + +def test_format_unavailable_raises(tmp_path, monkeypatch): + snapshot = {"schema_version": 1, "slugs": {"vx": { + "expected_rows": 1, "last_built_rows": 1, + "parquet_bytes": None, "vortex_bytes": 10, + "parquet_sha256": None, "vortex_sha256": "aa", + "columns": [{"name": "x", "type": "int64"}]}}} + manifest = {"schema_version": 1, "datasets": [{"slug": "vx", + "short_name": "VX", "license": {}, "fetch": {"urls": []}}]} + (tmp_path / "s.json").write_text(json.dumps(snapshot)) + (tmp_path / "m.json").write_text(json.dumps(manifest)) + monkeypatch.setenv("RAINCLOUD_SNAPSHOT", str(tmp_path / "s.json")) + monkeypatch.setenv("RAINCLOUD_MANIFEST", str(tmp_path / "m.json")) + import raincloud + from raincloud import _catalog + from raincloud.exceptions import FormatUnavailable + _catalog.load_catalog.cache_clear() + try: + # only vortex exists; requesting parquet has no fallback + with pytest.raises(FormatUnavailable): + raincloud.load("vx", format="parquet") + finally: + _catalog.load_catalog.cache_clear() From 10ea912895e27a201ccc0c1caae030c59c931bdb Mon Sep 17 00:00:00 2001 From: mprammer Date: Wed, 27 May 2026 14:15:52 -0400 Subject: [PATCH 07/33] feat(docs): carry parquet/vortex sha256 in snapshot for the loader Co-Authored-By: Claude Signed-off-by: mprammer --- docs/v1/snapshot.json | 152 ++++++++++++++++++++++++++++--- scripts/pipeline/docs.py | 16 ++++ tests/test_snapshot_checksums.py | 8 ++ 3 files changed, 163 insertions(+), 13 deletions(-) create mode 100644 tests/test_snapshot_checksums.py diff --git a/docs/v1/snapshot.json b/docs/v1/snapshot.json index e2f636c..d3e7eba 100644 --- a/docs/v1/snapshot.json +++ b/docs/v1/snapshot.json @@ -1,6 +1,6 @@ { "schema_version": 1, - "generated_at": "2026-05-17T01:07:13Z", + "generated_at": "2026-05-27T18:14:16Z", "note": "Auto-generated by scripts/pipeline/docs.py. Read by the TUI as a fallback when a local parquet isn't built. Regenerate after any build / schema change with `python -m scripts.pipeline.docs snapshot`.", "slugs": { "clickbench-hits": { @@ -1201,6 +1201,8 @@ "last_built_row_groups": 1, "parquet_bytes": 73701591, "vortex_bytes": 118973792, + "parquet_sha256": "ab7ab80dd49faba78e8321b1454fc2ff6cdbb5b5e872848e362853105e78c394", + "vortex_sha256": "d149b58bd8d59b157c68520d5de756cdd5755c08eeb9e25d9221ac7fcab7ef56", "columns": [ { "name": "Country Name", @@ -1795,6 +1797,8 @@ "last_built_row_groups": 21, "parquet_bytes": 1558628820, "vortex_bytes": 2112544400, + "parquet_sha256": "fc7c1413ea8eeed4a8993888ea2cd3511513037b294478aa74d07891aaa91cab", + "vortex_sha256": "aeffbfe87959f63ca38b03ef832d121a4f362377d0040822380f873d1221dc84", "columns": [ { "name": "Unique Key", @@ -2437,6 +2441,8 @@ "last_built_row_groups": 701, "parquet_bytes": 9976102966, "vortex_bytes": 15508324928, + "parquet_sha256": "18749a23f34f9f8ff4d9341d695ad6c22154d604ccd31ca3cbc739efaaf53d21", + "vortex_sha256": "37c93f284ea84be60ca53242454cd02eaef3542634560e9003f20fa54da95b11", "columns": [ { "name": "id", @@ -2499,22 +2505,25 @@ }, "osm-germany-relations": { "expected_rows": null, - "last_built_rows": 889712, - "parquet_bytes": 95642737, - "vortex_bytes": 155687156, + "last_built_rows": 890059, + "last_built_row_groups": 9, + "parquet_bytes": 95699411, + "vortex_bytes": 155627324, + "parquet_sha256": "0203b9678f731279507b2bf06047331081617779c4a496cb98ffe0f166d8842d", + "vortex_sha256": "4f7646e656293d31a83278a76ae17e36ae0a3d4ae242b8477945faf319e1057b", "columns": [ { "name": "id", "type": "int64", - "length": 2775802, + "length": 2776912, "null_count": 0, "min": 882, - "max": 20620429 + "max": 20631777 }, { "name": "version", "type": "int32", - "length": 494032, + "length": 488930, "null_count": 0, "min": 1, "max": 3154 @@ -2522,15 +2531,15 @@ { "name": "timestamp", "type": "string", - "length": 4659107, + "length": 4663690, "null_count": 0, "min": "2007-10-17T19:07:38+00:00", - "max": "2026-05-03T20:14:56+00:00" + "max": "2026-05-05T20:18:32+00:00" }, { "name": "members", "type": "list>", - "length": 65420639, + "length": 65466557, "null_count": null, "min": null, "max": null @@ -2538,12 +2547,21 @@ { "name": "tags", "type": "list>", - "length": 22281959, + "length": 22292153, "null_count": null, "min": null, "max": null } - ] + ], + "size_bucket": "s", + "shape_traits": { + "has_nested": true, + "has_timestamp": false, + "has_variant": false, + "string_heavy": false, + "wide_row": false, + "high_cardinality_present": null + } }, "hacker-news": { "expected_rows": null, @@ -3585,6 +3603,8 @@ "last_built_row_groups": 8, "parquet_bytes": 150924485, "vortex_bytes": 134835552, + "parquet_sha256": "2c39fae00b1dee42909f942299c0ddcfd35b5c3aca428a953e3fb6c15641a8d6", + "vortex_sha256": "873bae0250a5c1a86c4cd6bd6503d8580ca6ad21fee7bbc6515407a656a2fa26", "columns": [ { "name": "word", @@ -30061,6 +30081,8 @@ "last_built_row_groups": 1, "parquet_bytes": 2732, "vortex_bytes": 10056, + "parquet_sha256": "72ed0dd70848acef0c2c94ad58385144a50f21484fa012c18b15dfa53ed25538", + "vortex_sha256": "4d3e683f6d911a2d3876e28cda03e22d029b051427ef3cfd65e3e367de4d5300", "columns": [ { "name": "sepal_length", @@ -30239,6 +30261,8 @@ "last_built_row_groups": 1, "parquet_bytes": 82836, "vortex_bytes": 112248, + "parquet_sha256": "74b2cce0f54f063391e4d3812bc11afa86bf4f8bdd1885dce327f49ab8d27ded", + "vortex_sha256": "289df4c7c09e02ac190a4d23187f4bbbbfdcdde2cc562de0b444fd9cd6c35f7f", "columns": [ { "name": "fixed_acidity", @@ -31169,6 +31193,8 @@ "last_built_row_groups": 1, "parquet_bytes": 3085946, "vortex_bytes": 3470324, + "parquet_sha256": "c0d99b2907203d9d4e495967ff2555b9346118013df67a8c44484269237e9dcc", + "vortex_sha256": "1dc992e33ecba1a8b7c0b3b7282a8baa37bc1c6e8069760c9f1b3c9c0488ab4a", "columns": [ { "name": "invoiceno", @@ -32075,6 +32101,8 @@ "last_built_row_groups": 1, "parquet_bytes": 90533, "vortex_bytes": 131396, + "parquet_sha256": "0cc25809dfa9aee37a9e0703f1951568287f6d321994965417c25ccf6a7aaab9", + "vortex_sha256": "ead9e91500dfd44271d9f2119b76ce6004a4ebb7ce2bd281b496798eea94c0a4", "columns": [ { "name": "patient_id", @@ -37469,6 +37497,8 @@ "last_built_row_groups": 1, "parquet_bytes": 195535, "vortex_bytes": 158968, + "parquet_sha256": "151fd25027af4d233c47b90e60923f9ad824f50976a28a835f190ead0ffc015e", + "vortex_sha256": "5ef0007940e02b7ddce829a32094bc3198219c151242e1466e94f8bf40a4f4d5", "columns": [ { "name": "instant", @@ -38228,8 +38258,11 @@ "uci-spambase": { "expected_rows": 4601, "last_built_rows": 4601, + "last_built_row_groups": 1, "parquet_bytes": 169587, "vortex_bytes": 377572, + "parquet_sha256": "0e2e06c5444e8db618fd6fd26048733c592982c0557224497c5fe7f6e0819005", + "vortex_sha256": "f56e3bf6c7b8e87969400bd7d2056a830df1a492d5a74e2c8deb2cd7dfd17290", "columns": [ { "name": "word_freq_make", @@ -38695,7 +38728,16 @@ "min": 0, "max": 1 } - ] + ], + "size_bucket": "xs", + "shape_traits": { + "has_nested": false, + "has_timestamp": false, + "has_variant": false, + "string_heavy": false, + "wide_row": true, + "high_cardinality_present": null + } }, "uci-magic-gamma-telescope": { "expected_rows": 19020, @@ -41231,6 +41273,8 @@ "last_built_row_groups": 1, "parquet_bytes": 10725, "vortex_bytes": 20992, + "parquet_sha256": "59639068c918f2f3f46532b3b2ee47605d1d3a59580cbb3872259f1c5becd886", + "vortex_sha256": "ca0daddc46e7db5934e7655b42fa5be57eed3b22b9a7629ca53d25b9bf1c9ad0", "columns": [ { "name": "col_0", @@ -48369,6 +48413,8 @@ "last_built_row_groups": 2, "parquet_bytes": 62325617, "vortex_bytes": 53679308, + "parquet_sha256": "7db14cf55edb24cc4fd4befaa07b0f5e767e3bc9d0724d435e83dd928583fd7d", + "vortex_sha256": "10a8f8953cc87f36b0156ec06fe03b657c6340f6ecab2a617e7a313e4a45bd20", "columns": [ { "name": "Organization Group Code", @@ -52699,6 +52745,8 @@ "last_built_row_groups": 1, "parquet_bytes": 4637420, "vortex_bytes": 5825840, + "parquet_sha256": "c39dfd34997d7ebeea5ec7389d2f22e28eadb143c43bc06ab81a2410433d3bb1", + "vortex_sha256": "aa8953e69d38d2765f4d2c62af08aaa7336e9c6bb5eefc65bd949f223656b080", "columns": [ { "name": "ID", @@ -102741,6 +102789,8 @@ "last_built_row_groups": 1, "parquet_bytes": 930532, "vortex_bytes": 1577848, + "parquet_sha256": "402ea0920378fa61637c8e4047cbb2461d418ac26e833002ac58c3eef8eb0ac9", + "vortex_sha256": "01e1aa77bac5296c7cf72693024f3323d56756bdcaf7415c0083ebd0a0bc7efa", "columns": [ { "name": "hotel", @@ -103335,6 +103385,8 @@ "last_built_row_groups": 1, "parquet_bytes": 3322173, "vortex_bytes": 3315556, + "parquet_sha256": "a9bf0bddc568e719ab789f341cc6805d79eaddb2ea779bdef593ab9fbc1f4d5d", + "vortex_sha256": "9a57a490f584e08dd8f943fc8e31f0907c6a81d14807770bc9c96800ed19e8d9", "columns": [ { "name": "kepid", @@ -112465,6 +112517,8 @@ "last_built_row_groups": 1, "parquet_bytes": 29295473, "vortex_bytes": 90073420, + "parquet_sha256": "a079040cebf707cc6ae94cb36880ecf5562350e2c4fda5e9b94e6b61613a1708", + "vortex_sha256": "c4fe5dd81cd9a10f171942c8c2cd0c4f0582b6108d525ae23b223fd318c3f849", "columns": [ { "name": "answer", @@ -112723,6 +112777,8 @@ "last_built_row_groups": 1, "parquet_bytes": 1106214411, "vortex_bytes": 1652406076, + "parquet_sha256": "2f917f7469b6f43e28d5680b72ed84f43a7b3e06927e7553b95eac7099767a3c", + "vortex_sha256": "f0007332c751b9a9bf502bed47fb75acd02c1354cc2d65d14b140ef6d5f303b0", "columns": [ { "name": "prompt", @@ -113533,6 +113589,8 @@ "last_built_row_groups": 7, "parquet_bytes": 8130454405, "vortex_bytes": 11784725216, + "parquet_sha256": "4f46f3efd2aae547a4ad19a35be0c0801bf74cbc67ae1cd19965a12681b44664", + "vortex_sha256": "a8e3b94b68a33fe0c0b67335ea47289592cf780dea66b319dd76c8153a0c124b", "columns": [ { "name": "id", @@ -113591,6 +113649,8 @@ "last_built_row_groups": 3, "parquet_bytes": 664283863, "vortex_bytes": 962524448, + "parquet_sha256": "4edd26c9198483aab39e58c14f4f7f86285151c417423bbf6de5dc80c4a6e033", + "vortex_sha256": "6fa4dc3b21e27e8f4c0229607aa0f07e8cde62c0e6dfbbba044b87202734d142", "columns": [ { "name": "text", @@ -113625,6 +113685,8 @@ "last_built_row_groups": 4, "parquet_bytes": 4806167375, "vortex_bytes": 6089862740, + "parquet_sha256": "a0c3ab9b0b050b3e882711fcea7f705e567ffc3f9a9c16aad8dc8845ea76d6c7", + "vortex_sha256": "636cdea4268af7f5b6e1c2c4ef423326151dd1903546b182322afea164584bd9", "columns": [ { "name": "text", @@ -113739,6 +113801,8 @@ "last_built_row_groups": 1, "parquet_bytes": 7294437, "vortex_bytes": 13304260, + "parquet_sha256": "b54e8f4ab980f80ebe6be48356a6dd904dadfc10e4d35dde8a14e3f800dc2792", + "vortex_sha256": "96043c7527071d73a7d88621fa00fbd4d3cd92d6e014a78e8a8d3d0c2254af09", "columns": [ { "name": "text", @@ -113949,6 +114013,8 @@ "last_built_row_groups": 1, "parquet_bytes": 2121169743, "vortex_bytes": 3268285484, + "parquet_sha256": "52428549cbc520a0a12cb475d2582f6999ac271dc3b5a9d0b3d1773c0f2be9e9", + "vortex_sha256": "f9493ed6a760c4cb304b7229c3ffc2b842cdd0dc1712d6de72601eecbd34a1f7", "columns": [ { "name": "text_token_length", @@ -114023,6 +114089,8 @@ "last_built_row_groups": 1, "parquet_bytes": 22934195, "vortex_bytes": 38332972, + "parquet_sha256": "3b9023a42c3aed1d8e5e4f3de8c8bff433dbb667be8a687a6e4522d21928a804", + "vortex_sha256": "b7b3b0156c0c87baf1776951057d4c0d6d541acb60af088d2df4b01cedbea486", "columns": [ { "name": "id", @@ -114137,6 +114205,8 @@ "last_built_row_groups": 1, "parquet_bytes": 1520738958, "vortex_bytes": 1943240616, + "parquet_sha256": "e0fa14e94a3a8b13b189e18fd2309fa2cc3a3f1bada5f5a9cc37496f46571a29", + "vortex_sha256": "4f24e6f06b90b8c4cbbb7ebe891273c253d28a554aa7e4c201b5dd144d19e010", "columns": [ { "name": "id", @@ -114487,6 +114557,8 @@ "last_built_row_groups": 1, "parquet_bytes": 633824764, "vortex_bytes": 645264768, + "parquet_sha256": "e8f14b8ce7e80b6c2f2de9c7c06676d90901b89b06b8afb7af1e40a3aa8831e7", + "vortex_sha256": "9e007f67c6043d028640cdf2e3c6091f16218dd9760163fb93f72fc79adad849", "columns": [ { "name": "images", @@ -114529,6 +114601,8 @@ "last_built_row_groups": 1, "parquet_bytes": 60551, "vortex_bytes": 103864, + "parquet_sha256": "bd65bb1e42c0f3eea4f5e41629271abc4f42b37d68b8cff96a6d032b5329bd11", + "vortex_sha256": "b6adfd55352022ec7c462182e96d8ac9c8a997b46705b75e0ac0a5a7fcaa86c7", "columns": [ { "name": "task_id", @@ -114595,6 +114669,8 @@ "last_built_row_groups": 1, "parquet_bytes": 187862, "vortex_bytes": 331456, + "parquet_sha256": "cc02b069245fbade0263629f7a23b16f199643a756127eb94ef9ae2cc52537ff", + "vortex_sha256": "9b41c6bd7c9c846392c18745d0ead6bd60cbb17c3b551455f14dd7466a4126fd", "columns": [ { "name": "question", @@ -114645,6 +114721,8 @@ "last_built_row_groups": 1, "parquet_bytes": 3633313707, "vortex_bytes": 3675267604, + "parquet_sha256": "a72bafdbec981a21cf5b850f561db97b3902c5a28cf9119e6b8b3411f7c3359f", + "vortex_sha256": "b7ea53a0d1e5566d946f26bf7d2af198d5a5965a7440f4464e1cbdbbccdcc42a", "columns": [ { "name": "id", @@ -114799,6 +114877,8 @@ "last_built_row_groups": 1, "parquet_bytes": 793523, "vortex_bytes": 1103536, + "parquet_sha256": "85acd4636fd9468718e66f795bb33f32d16709dee50ab9105c2f286efd1e04e6", + "vortex_sha256": "e56c0c71829bb4e2b42edab83c1868e632f3f57979c6841f4cfab2ff9a61b4b3", "columns": [ { "name": "id", @@ -114857,6 +114937,8 @@ "last_built_row_groups": 1, "parquet_bytes": 751960, "vortex_bytes": 1055580, + "parquet_sha256": "55ae2ffe9b74516d7668efb73dd5e84e8c046bcf68233b9f320d49ed2d5e6a85", + "vortex_sha256": "6135bc9f6e6f4424b21c479138fb82391800598afdab284909c2ea01f5cb208b", "columns": [ { "name": "pubid", @@ -114923,6 +115005,8 @@ "last_built_row_groups": 1, "parquet_bytes": 56312765, "vortex_bytes": 74216104, + "parquet_sha256": "2f99c4dab111c4c8b7d0a9715ea68df56fcc3ac7c0fe930d65426791d6f8dbb8", + "vortex_sha256": "375462e51d41963e0e7d9536987ff3e3ac6b166579af276fe47485249b2a7590", "columns": [ { "name": "id", @@ -115037,6 +115121,8 @@ "last_built_row_groups": 1, "parquet_bytes": 275730042, "vortex_bytes": 359519012, + "parquet_sha256": "b6756a27e19367d4f37adbba86cf9ee4572730445eaec637674687b2c98cf1d1", + "vortex_sha256": "8ee29f0c19bf44e33f866961444cf5337efeff3860930cb5a930cceafd611f19", "columns": [ { "name": "id", @@ -115119,6 +115205,8 @@ "last_built_row_groups": 1, "parquet_bytes": 569684836, "vortex_bytes": 748493424, + "parquet_sha256": "492009418c2afbd12fd18596f3ac32ede9f4455dcb6912cbacb01e2fce5654d1", + "vortex_sha256": "99b800dbd9448a3df1a7a2efa063725ba742c0c824f0460ff703154a6010b6d6", "columns": [ { "name": "article", @@ -115169,6 +115257,8 @@ "last_built_row_groups": 1, "parquet_bytes": 15873453, "vortex_bytes": 20423644, + "parquet_sha256": "48533689b58e91740da03c67114473d38c61d1c31a8f47d12514b24f5857cec4", + "vortex_sha256": "00955c30842af9d7890fe52364e9b16b05fb3e21fd6d7c8347f08d06ac066fa6", "columns": [ { "name": "image", @@ -115211,6 +115301,8 @@ "last_built_row_groups": 1, "parquet_bytes": 152938, "vortex_bytes": 242304, + "parquet_sha256": "e8f476fc08222703d6f77da35178c1888b06e0a3f814c161293688fb8b16d9b2", + "vortex_sha256": "d8d664d93a91691aa02262069a3109c17f44f5abe103acb695ebda8796747528", "columns": [ { "name": "task_id", @@ -115285,6 +115377,8 @@ "last_built_row_groups": 1, "parquet_bytes": 4307899453, "vortex_bytes": null, + "parquet_sha256": "029e8c97e41a971d5d6bc91dd1dde7c934a7f9abfdbc05842b25b0a84885526d", + "vortex_sha256": null, "columns": [ { "name": "name", @@ -115471,6 +115565,8 @@ "last_built_row_groups": 1, "parquet_bytes": 346051024, "vortex_bytes": 367513008, + "parquet_sha256": "9902c1b94252b424dc361bf5fd7354e8ebff610ea53d22150f92b8cbbbf66386", + "vortex_sha256": "ed724fd452bca8579d2bb70fb7f11697881f2a6d9e9533d67bfa07cd4db9da77", "columns": [ { "name": "file", @@ -115545,6 +115641,8 @@ "last_built_row_groups": 1, "parquet_bytes": 2306373926, "vortex_bytes": 2458934716, + "parquet_sha256": "19fd17c4ab293450bd635e0a7b14237edf19fb5d4fdf99f68d73b16db3c78429", + "vortex_sha256": "dfe64038a4f814d13fd3aa94a4f4f2bceae7d420d147390b8e5652a150de942d", "columns": [ { "name": "id", @@ -115603,6 +115701,8 @@ "last_built_row_groups": 1, "parquet_bytes": 562911303, "vortex_bytes": 571225152, + "parquet_sha256": "eae8a9cb5a57711580c0e374bd14fe901aaa5c903cd5ce29dbdf078cef32b20f", + "vortex_sha256": "76a96582a6ddcf66c84f9a36f14b838276eaf3cc532fcd7e3d1f8d1481f35eb6", "columns": [ { "name": "id", @@ -115677,6 +115777,8 @@ "last_built_row_groups": 1, "parquet_bytes": 165968, "vortex_bytes": 256980, + "parquet_sha256": "941578c6cc5e69739cfb8db43ce4055902b949927aa4b28dc91c34c2b5028e1c", + "vortex_sha256": "1e1c3d4f4a345eb859d1ed978d8be34696e8780f85fdac1fb5cb06e290615281", "columns": [ { "name": "Unnamed: 0", @@ -115831,6 +115933,8 @@ "last_built_row_groups": 1, "parquet_bytes": 265872790, "vortex_bytes": 429778304, + "parquet_sha256": "63bd6e330c8a356a026a08abf8a1b6c74c7683114b0b5ac21f0699fe8f716955", + "vortex_sha256": "4604c8573de84acb62eb61d056de9d57841a5fa7eb3235c526b1f4921a2b83e5", "columns": [ { "name": "video_id", @@ -115961,6 +116065,8 @@ "last_built_row_groups": 1, "parquet_bytes": 1641372431, "vortex_bytes": 2257731372, + "parquet_sha256": "c8250d75e6f5fab02cb576111aae982b46eb85d8691bb4735648881170b0b05c", + "vortex_sha256": "271bba9a5c2c8c06cc60e87f4408dd33063faa220aa69899ac59a7e3e989afc4", "columns": [ { "name": "synth_id", @@ -116099,6 +116205,8 @@ "last_built_row_groups": 1, "parquet_bytes": 13207283, "vortex_bytes": 21735488, + "parquet_sha256": "eef8dd021b7ec65fe5ae5dfaa7359746a25085b32db507d3c6f6b935d1fc533b", + "vortex_sha256": "0ed4dbc889df05c9f0e28256ffbc339188c6a1679c98c56db67cc716a92ebd45", "columns": [ { "name": "prompt", @@ -116181,6 +116289,8 @@ "last_built_row_groups": 1, "parquet_bytes": 3190047, "vortex_bytes": 5669032, + "parquet_sha256": "63ce777948587a5b82a98b80cccc448bfaf4a44d4d285a0b5f38abbbb91c7c54", + "vortex_sha256": "b5c2d7bf860455a435c69bc551f4bf079e3d2289baa3c031baf7df259aeeb20e", "columns": [ { "name": "transcript_id", @@ -116223,6 +116333,8 @@ "last_built_row_groups": 1, "parquet_bytes": 104135565, "vortex_bytes": 170067388, + "parquet_sha256": "39ce8864e3b1d19de6976da4a19133ddd27261071e8522ada64e8e0db2ff1a88", + "vortex_sha256": "b534a6f2db918cb994ad8362808a25ad9c2afa6b5d4304fe449a8df4f29ba8fb", "columns": [ { "name": "inputs", @@ -116297,6 +116409,8 @@ "last_built_row_groups": 1, "parquet_bytes": 103944240, "vortex_bytes": 168069836, + "parquet_sha256": "9b42772dc63b9508f5e63334d9d5253455bf9dc50c89240b50e31cd4fabaa079", + "vortex_sha256": "c513d9f234b94f69b1b9a11a6b4aa331cc732e9db7612fb4054701f0553232f0", "columns": [ { "name": "id", @@ -116403,6 +116517,8 @@ "last_built_row_groups": 1, "parquet_bytes": 58178780, "vortex_bytes": 106617292, + "parquet_sha256": "35516084f5f51199e1459eb1ffba0e8d9dcdbb64b2dd9e2248a85dd276adf941", + "vortex_sha256": "75cd4bcc086c0643b9edd0420c0f3bdd64ef4635d5183bbd6ce35fabb5af8dd3", "columns": [ { "name": "instruction", @@ -116445,6 +116561,8 @@ "last_built_row_groups": 2, "parquet_bytes": 19485316, "vortex_bytes": 21301348, + "parquet_sha256": "63402d6ee1adba5117d54549d776ba566528a247b8fa335aa2598ed1de195659", + "vortex_sha256": "5e4c2c6cb932eaae0bd40b4af2d21459d638f5623b5682cf0ce3cbe363fb7698", "columns": [ { "name": "unix_ts", @@ -116559,6 +116677,8 @@ "last_built_row_groups": 1, "parquet_bytes": 263543, "vortex_bytes": 426464, + "parquet_sha256": "fd894bd09380724d679cb64a8e713891d233d50cca2522216323ae91058d9d32", + "vortex_sha256": "33da5a1918a80a5e896702796f14e48f6d8449ada6978527cb4fe633b2549a18", "columns": [ { "name": "f0", @@ -116625,6 +116745,8 @@ "last_built_row_groups": 1, "parquet_bytes": 5624277, "vortex_bytes": 7621292, + "parquet_sha256": "12da59ce3b1895a968f8d2e708246fa1df210ced3e6986395e91a0ab107d1dde", + "vortex_sha256": "7e19e46cf134805c3e376d2ebffb8cac5a85d2d392a0f3a440a00d2b2daa96be", "columns": [ { "name": "No", @@ -116787,6 +116909,8 @@ "last_built_row_groups": 1, "parquet_bytes": 3747845, "vortex_bytes": 6080116, + "parquet_sha256": "e54895929c1f8e54f31a59d061f476aa2f2cfd28da1928af9b3e34c6501271c1", + "vortex_sha256": "d39b7c358464a299cc43d83650d4361fe532fae635f56274729403ab687709ec", "columns": [ { "name": "geo_id", @@ -116885,6 +117009,8 @@ "last_built_row_groups": 1, "parquet_bytes": 28713325, "vortex_bytes": 41394396, + "parquet_sha256": "4a0b6e364f0271940bb6c4182602890491b41516c87cc0e4e043905b196bee90", + "vortex_sha256": "a0abe7c51aefd60ef7f93db053ddc07eb86f79b2b6e0c8fd421b14113174cc89", "columns": [ { "name": "answer_choices", diff --git a/scripts/pipeline/docs.py b/scripts/pipeline/docs.py index b677e5b..5fef931 100644 --- a/scripts/pipeline/docs.py +++ b/scripts/pipeline/docs.py @@ -57,6 +57,19 @@ ) +def _sha256_for_path(path) -> str | None: + """Stream a file's sha256, or None if it doesn't exist.""" + import hashlib + p = Path(path) + if not p.exists(): + return None + h = hashlib.sha256() + with open(p, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + h.update(chunk) + return h.hexdigest() + + def _generation_header(kind: str) -> str: ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") return (f"