From 8ef7f1d146b052bfd9232a2630adb39963343faa Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 19 Jun 2026 07:42:59 -0400 Subject: [PATCH 1/3] feat: add durable git snapshot history --- CHANGELOG.md | 2 + README.md | 4 +- docs/commands/publish.md | 2 + docs/commands/update.md | 3 + docs/guides/git-snapshots.md | 3 +- go.mod | 12 +- go.sum | 36 +++--- internal/cli/cli_test.go | 15 ++- internal/cli/share_commands.go | 42 +++++-- internal/share/share.go | 198 ++++++++++++++++++++++++--------- internal/share/share_test.go | 79 +++++++++++-- internal/store/query.go | 3 +- 12 files changed, 299 insertions(+), 100 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b9ed72..50433f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,13 @@ ### Changes +- Add immutable Git snapshot tags and non-mutating historical restores with `publish --tag` and `update --ref`. - Restore the missing v0.10.0 release history for the first Cloudflare remote archive release. Thanks @joshka. - Expose the release changelog directly in the documentation site navigation. Thanks @joshka. ### Fixes +- Preserve private share-repository permissions and unpublished local branches while moving Git history, ref, and FTS query mechanics onto CrawlKit; refresh Go dependencies. - Refresh Discord member roles daily for published archives, and make `sync --with-members` bypass cached freshness when a refresh is required. Thanks @hannesrudolph. - Keep incremental share imports compatible with crawlkit's safe changed-tail replacement plan instead of falling back to a full archive rebuild. - Accept absolute Windows SQLite paths through the shared crawlkit store opener. 
diff --git a/README.md b/README.md index 327bd73..2f5c400 100644 --- a/README.md +++ b/README.md @@ -519,6 +519,7 @@ Publisher: ```bash discrawl publish --remote https://github.com/example/discord-archive.git --push +discrawl publish --tag backup-2026-06-19 --push discrawl publish --readme path/to/discord-backup/README.md --push discrawl publish --public-only --include-channels 1458141495701012561 --push discrawl publish --no-media --push @@ -530,6 +531,7 @@ Subscriber: discrawl subscribe https://github.com/example/discord-archive.git discrawl search "launch checklist" discrawl messages --channel general --hours 24 +discrawl update --ref backup-2026-06-19 ``` `subscribe` is the Git-only setup path. It writes a config with `discord.token_source = "none"`, imports the snapshot, and does not require a Discord bot token. `sync` and `tail` remain disabled in this mode because they need live Discord access. @@ -576,7 +578,7 @@ discrawl subscribe --stale-after 15m https://github.com/example/discord-archive. discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git ``` -Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). Imports are planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests, so routine updates normally read only changed tail shards and preserve local FTS rows instead of rebuilding the whole archive. `discrawl update` forces the same pull/import step manually. `discrawl sync` does not auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast. +Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). 
Imports are planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests, so routine updates normally read only changed tail shards and preserve local FTS rows instead of rebuilding the whole archive. `discrawl update` forces the same pull/import step manually; `update --ref <ref>` restores that historical snapshot without moving the share checkout. `discrawl sync` does not auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast. Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync --update=auto` and `discrawl messages --sync` import the Git snapshot first, usually as a changed-shard delta, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass. diff --git a/docs/commands/publish.md b/docs/commands/publish.md index 9598acd..292b822 100644 --- a/docs/commands/publish.md +++ b/docs/commands/publish.md @@ -8,6 +8,7 @@ Publishes the local SQLite archive as sharded, compressed NDJSON snapshots in a discrawl publish --remote https://github.com/example/discord-archive.git --push discrawl publish --readme path/to/discord-backup/README.md --push discrawl publish --message "sync: discord archive" --push +discrawl publish --tag backup-2026-06-19 --push discrawl publish --with-embeddings --push discrawl publish --no-media --push discrawl publish --public-only --include-channels 1458141495701012561 --push @@ -19,6 +20,7 @@ discrawl publish --public-only --include-channels 1458141495701012561 --push - `--remote ` - target Git remote (defaults to `[share].remote`) - `--branch ` - snapshot branch (defaults to `[share].branch`) - `--message ` - commit message (default: `sync: discord archive`) +- `--tag ` - create an immutable snapshot tag; requires a commit - `--no-commit` - write/export files without creating a Git commit - `--push` - push the 
snapshot commit after writing it - `--readme ` - update the activity block in this README file too diff --git a/docs/commands/update.md b/docs/commands/update.md index 9a41e0b..948f64d 100644 --- a/docs/commands/update.md +++ b/docs/commands/update.md @@ -13,6 +13,7 @@ discrawl update \ --remote https://github.com/example/discord-archive.git discrawl update --with-embeddings discrawl update --no-media +discrawl update --ref backup-2026-06-19 ``` ## Flags @@ -20,6 +21,7 @@ discrawl update --no-media - `--repo ` - local snapshot repo path (defaults to `[share].repo_path`) - `--remote ` - target Git remote (defaults to `[share].remote`) - `--branch ` - snapshot branch (defaults to `[share].branch`) +- `--ref ` - import a historical snapshot without changing the share checkout - `--with-embeddings` - also import vectors that match your local `[search.embeddings]` identity - `--no-media` - skip restoring cached attachment media files into `cache_dir/media` @@ -28,6 +30,7 @@ discrawl update --no-media - you have `share.remote` configured and want a fresh shard-delta import before running a command that does not auto-update (`sync` does not auto-import unless `--update=auto` is passed) - you set `--no-auto-update` when subscribing and want to refresh on demand - a CI job already imported the latest snapshot but read commands still consider it stale +- you need to restore a named tag or commit while leaving the checked-out share branch untouched ## How `sync` interacts diff --git a/docs/guides/git-snapshots.md b/docs/guides/git-snapshots.md index 1967c0b..691224a 100644 --- a/docs/guides/git-snapshots.md +++ b/docs/guides/git-snapshots.md @@ -12,6 +12,7 @@ published snapshots and are preserved locally on import. ```bash discrawl publish --remote https://github.com/example/discord-archive.git --push discrawl publish --readme path/to/discord-backup/README.md --push +discrawl publish --tag backup-2026-06-19 --push ``` The publisher uses your existing bot-synced archive. 
It exports non-DM tables only. @@ -62,7 +63,7 @@ discrawl subscribe --stale-after 15m https://github.com/example/discord-archive. discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git ``` -`discrawl update` forces the same pull/import step manually. Snapshot imports are delta-planned from crawlkit shard fingerprints. Older manifests without those fields fall back to Git blob identity, so the common publish shape only imports the changed message tail shard plus small cursor tables. Unsafe table-shape changes still fall back to a full import. +`discrawl update` forces the same pull/import step manually. `discrawl update --ref <ref>` reads the historical Git objects directly and leaves the share checkout unchanged. Snapshot imports are delta-planned from crawlkit shard fingerprints. Older manifests without those fields fall back to Git blob identity, so the common publish shape only imports the changed message tail shard plus small cursor tables. Unsafe table-shape changes still fall back to a full import. `discrawl sync` does **not** auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast. 
diff --git a/go.mod b/go.mod index 1f07835..ca3c153 100644 --- a/go.mod +++ b/go.mod @@ -16,14 +16,14 @@ require ( github.com/charmbracelet/bubbles v1.0.0 // indirect github.com/clipperhouse/displaywidth v0.11.0 // indirect github.com/clipperhouse/uax29/v2 v2.7.0 // indirect - github.com/pelletier/go-toml/v2 v2.3.1 // indirect + github.com/pelletier/go-toml/v2 v2.4.0 // indirect modernc.org/sqlite v1.52.0 // indirect ) require ( github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/charmbracelet/bubbletea v1.3.10 // indirect - github.com/charmbracelet/colorprofile v0.4.1 // indirect + github.com/charmbracelet/colorprofile v0.4.3 // indirect github.com/charmbracelet/lipgloss v1.1.0 // indirect github.com/charmbracelet/x/ansi v0.11.7 // indirect github.com/charmbracelet/x/cellbuf v0.0.15 // indirect @@ -39,20 +39,20 @@ require ( github.com/lucasb-eyer/go-colorful v1.4.0 // indirect github.com/mattn/go-isatty v0.0.22 // indirect github.com/mattn/go-localereader v0.0.1 // indirect - github.com/mattn/go-runewidth v0.0.23 // indirect + github.com/mattn/go-runewidth v0.0.24 // indirect github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect - github.com/openclaw/crawlkit v0.12.2 + github.com/openclaw/crawlkit v0.12.3-0.20260619113933-ca6cd668be8b github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect - golang.org/x/crypto v0.50.0 // indirect + golang.org/x/crypto v0.53.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - modernc.org/libc v1.72.3 // indirect + modernc.org/libc v1.73.4 // indirect modernc.org/mathutil v1.7.1 // indirect modernc.org/memory 
v1.11.0 // indirect ) diff --git a/go.sum b/go.sum index 7f80c80..1d9ed7f 100644 --- a/go.sum +++ b/go.sum @@ -12,8 +12,8 @@ github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5f github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E= github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw= github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4= -github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk= -github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk= +github.com/charmbracelet/colorprofile v0.4.3 h1:QPa1IWkYI+AOB+fE+mg/5/4HRMZcaXex9t5KX76i20Q= +github.com/charmbracelet/colorprofile v0.4.3/go.mod h1:/zT4BhpD5aGFpqQQqw7a+VtHCzu+zrQtt1zhMt9mR4Q= github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY= github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30= github.com/charmbracelet/x/ansi v0.11.7 h1:kzv1kJvjg2S3r9KHo8hDdHFQLEqn4RBCb39dAYC84jI= @@ -61,8 +61,8 @@ github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw github.com/mattn/go-isatty v0.0.22/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4= github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88= -github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw= -github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= +github.com/mattn/go-runewidth v0.0.24 h1:cpokDiIn0MGnhdHwuWnJBITySJ20QyNGnY2kR/ay2DU= +github.com/mattn/go-runewidth v0.0.24/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= 
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= @@ -71,10 +71,10 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= -github.com/openclaw/crawlkit v0.12.2 h1:KivYMOHfemLG9LrfKKI8A/FTDJpdFJyeOreCGbKCsXA= -github.com/openclaw/crawlkit v0.12.2/go.mod h1:+Z9vrCgH8BJ/+3MMoMfnDyhXC9ON7bEDduGvp5TmmuM= -github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc= -github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/openclaw/crawlkit v0.12.3-0.20260619113933-ca6cd668be8b h1:f6NCwJYOsApd5mOfNiI2nGmd/Ohpd0lLAqDKz3KmqLA= +github.com/openclaw/crawlkit v0.12.3-0.20260619113933-ca6cd668be8b/go.mod h1:zOJv5WPWO1AuuXO7zW8NRTxb/ZTkIQXYPrx3StmnMUI= +github.com/pelletier/go-toml/v2 v2.4.0 h1:Mwu0mAkUKbittDs3/ADDWXqMmq3EOK2VHiuCkV00Row= +github.com/pelletier/go-toml/v2 v2.4.0/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -93,8 +93,8 @@ github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJu github.com/zalando/go-keyring v0.2.8 h1:6sD/Ucpl7jNq10rM2pgqTs0sZ9V3qMrqfIIy5YPccHs= github.com/zalando/go-keyring v0.2.8/go.mod h1:tsMo+VpRq5NGyKfxoBVjCuMrG47yj8cmakZDO5QGii0= golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod 
h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= -golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= -golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= @@ -118,20 +118,20 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -modernc.org/cc/v4 v4.28.2 h1:3tQ0lf2ADtoby2EtSP+J7IE2SHwEJdP8ioR59wx7XpY= -modernc.org/cc/v4 v4.28.2/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI= -modernc.org/ccgo/v4 v4.34.0 h1:yRLPFZieg532OT4rp4JFNIVcquwalMX26G95WQDqwCQ= -modernc.org/ccgo/v4 v4.34.0/go.mod h1:AS5WYMyBakQ+fhsHhtP8mWB82KTGPkNNJDGfGQCe0/A= +modernc.org/cc/v4 v4.28.4 h1:Hd/4Es+MBj+/7hSdZaisNyu6bv3V0Dp2MdllyfqaH+c= +modernc.org/cc/v4 v4.28.4/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI= +modernc.org/ccgo/v4 v4.34.4 h1:OVnSOWQjVKOYkFxoHYB+qQmSHK5gqMqARM+K9DpR/Ws= +modernc.org/ccgo/v4 v4.34.4/go.mod h1:qdKqE8FNIYyysougB1RX9MxCzp5oJOcQXSobANJ4TuE= modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM= modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU= modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= -modernc.org/gc/v3 v3.1.2 
h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo= -modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/gc/v3 v3.1.3 h1:6QAplYyVO+KdPW3pGnqmJDUxtkec8ooEWvks/hhU3lc= +modernc.org/gc/v3 v3.1.3/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= -modernc.org/libc v1.72.3 h1:ZnDF4tXn4NBXFutMMQC4vtbTFSXhhKzR73fv0beZEAU= -modernc.org/libc v1.72.3/go.mod h1:dn0dZNnnn1clLyvRxLxYExxiKRZIRENOfqQ8XEeg4Qs= +modernc.org/libc v1.73.4 h1:+ra4Ui8ngyt8HDcO1FTDPWlkAh6yOdaO2yAoh8MddQA= +modernc.org/libc v1.73.4/go.mod h1:DXZ3eO8qMCNn2SnmTNCiC71nJ9Rcq3PsnpU6Vc4rWK8= modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index 37284f4..1e423ea 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -1650,13 +1650,13 @@ func TestShareCommandsPublishSubscribeAndUpdate(t *testing.T) { }, &bytes.Buffer{}, &bytes.Buffer{}) require.Equal(t, 2, ExitCode(err)) require.ErrorContains(t, err, "publish --readme is not supported with share filters") + err = Run(ctx, []string{"--config", cfgPath, "publish", "--tag", "invalid", "--no-commit"}, &bytes.Buffer{}, &bytes.Buffer{}) + require.Equal(t, 2, ExitCode(err)) + require.ErrorContains(t, err, "publish --tag requires a commit") - runGit(t, cfg.Share.RepoPath, "config", "user.name", "discrawl test") - runGit(t, cfg.Share.RepoPath, "config", "user.email", "discrawl@example.com") - committed, err := share.Commit(ctx, share.Options{RepoPath: cfg.Share.RepoPath, Remote: remoteRepo, Branch: "main"}, "test: snapshot") - require.NoError(t, err) - require.True(t, committed) - require.NoError(t, share.Push(ctx, 
share.Options{RepoPath: cfg.Share.RepoPath, Remote: remoteRepo, Branch: "main"})) + out.Reset() + require.NoError(t, Run(ctx, []string{"--config", cfgPath, "publish", "--tag", "test-snapshot", "--push"}, &out, &bytes.Buffer{})) + require.Contains(t, out.String(), "test-snapshot") readerCfgPath := filepath.Join(dir, "reader.toml") require.NoError(t, Run(ctx, []string{ @@ -1677,6 +1677,9 @@ func TestShareCommandsPublishSubscribeAndUpdate(t *testing.T) { require.NoError(t, Run(ctx, []string{"--config", readerCfgPath, "update"}, &out, &bytes.Buffer{})) require.Contains(t, out.String(), "generated_at") out.Reset() + require.NoError(t, Run(ctx, []string{"--config", readerCfgPath, "update", "--ref", "test-snapshot"}, &out, &bytes.Buffer{})) + require.Contains(t, out.String(), "test-snapshot") + out.Reset() require.NoError(t, Run(ctx, []string{"--config", readerCfgPath, "search", "automatic"}, &out, &bytes.Buffer{})) require.Contains(t, out.String(), "automatic updates work") } diff --git a/internal/cli/share_commands.go b/internal/cli/share_commands.go index 250f4f2..f1ab936 100644 --- a/internal/cli/share_commands.go +++ b/internal/cli/share_commands.go @@ -21,6 +21,7 @@ func (r *runtime) runPublish(args []string) error { remote := fs.String("remote", r.cfg.Share.Remote, "") branch := fs.String("branch", r.cfg.Share.Branch, "") message := fs.String("message", "", "") + tag := fs.String("tag", "", "") readmePath := fs.String("readme", "", "") noCommit := fs.Bool("no-commit", false, "") push := fs.Bool("push", false, "") @@ -35,10 +36,17 @@ func (r *runtime) runPublish(args []string) error { if fs.NArg() != 0 { return usageErr(errors.New("publish takes no positional arguments")) } + if *noCommit && strings.TrimSpace(*tag) != "" { + return usageErr(errors.New("publish --tag requires a commit")) + } opts, err := shareOptionsFromFlags(*repoPath, *remote, *branch) if err != nil { return err } + opts.Tag = strings.TrimSpace(*tag) + if err := share.ValidateTag(r.ctx, opts); err 
!= nil { + return err + } opts.Filter = share.FilterOptions{ PublicOnly: *publicOnly, IncludeChannelIDs: csvList(*includeChannels), @@ -86,6 +94,10 @@ func (r *runtime) runPublish(args []string) error { return err } } + createdTag, err := share.CreateImmutableTag(r.ctx, opts) + if err != nil { + return err + } if *push { if err := share.Push(r.ctx, opts); err != nil { return err @@ -103,6 +115,7 @@ func (r *runtime) runPublish(args []string) error { "embeddings": manifest.Embeddings, "readme": *readmePath, "committed": committed, + "tag": createdTag, "pushed": *push, }) } @@ -214,6 +227,7 @@ func (r *runtime) runUpdate(args []string) error { repoPath := fs.String("repo", r.cfg.Share.RepoPath, "") remote := fs.String("remote", r.cfg.Share.Remote, "") branch := fs.String("branch", r.cfg.Share.Branch, "") + ref := fs.String("ref", "", "") withEmbeddings := fs.Bool("with-embeddings", false, "") noMedia := fs.Bool("no-media", !r.cfg.ShareMediaEnabled(), "") if err := fs.Parse(args); err != nil { @@ -233,14 +247,25 @@ func (r *runtime) runUpdate(args []string) error { if err := applyMediaShareOptions(&opts, r.cfg, !*noMedia); err != nil { return err } - r.setSyncLockPhase("share pull") - if err := share.Pull(r.ctx, opts); err != nil { - return err - } - r.setSyncLockPhase("share import") - manifest, imported, err := share.ImportIfChanged(r.ctx, r.store, opts) - if err != nil { - return err + var manifest share.Manifest + var imported bool + if strings.TrimSpace(*ref) == "" { + r.setSyncLockPhase("share pull") + if err := share.Pull(r.ctx, opts); err != nil { + return err + } + r.setSyncLockPhase("share import") + manifest, imported, err = share.ImportIfChanged(r.ctx, r.store, opts) + if err != nil { + return err + } + } else { + r.setSyncLockPhase("share historical import") + manifest, err = share.ImportAt(r.ctx, r.store, opts, *ref) + if err != nil { + return err + } + imported = true } return r.print(map[string]any{ "repo_path": opts.RepoPath, @@ -250,6 +275,7 @@ func 
(r *runtime) runUpdate(args []string) error { "media": manifest.Media, "embeddings": manifest.Embeddings, "imported": imported, + "ref": strings.TrimSpace(*ref), }) } diff --git a/internal/share/share.go b/internal/share/share.go index 895f5f5..0b2c343 100644 --- a/internal/share/share.go +++ b/internal/share/share.go @@ -13,7 +13,7 @@ import ( "hash/fnv" "io" "os" - "os/exec" + "path" "path/filepath" "slices" "sort" @@ -68,6 +68,7 @@ type Options struct { CacheDir string Remote string Branch string + Tag string Filter FilterOptions IncludeMedia bool IncludeEmbeddings bool @@ -134,7 +135,15 @@ func Pull(ctx context.Context, opts Options) error { if strings.TrimSpace(opts.Remote) == "" && strings.TrimSpace(opts.RepoPath) == "" { return nil } - return mirror.Pull(ctx, mirrorOptions(opts)) + if strings.TrimSpace(opts.Remote) == "" { + return EnsureRepo(ctx, opts) + } + if err := mirror.EnsureRemote(ctx, mirrorOptions(opts)); err != nil { + return err + } + pullOpts := mirrorOptions(opts) + pullOpts.Remote = "" + return mirror.PullCurrent(ctx, pullOpts) } func Commit(ctx context.Context, opts Options, message string) (bool, error) { @@ -142,7 +151,13 @@ func Commit(ctx context.Context, opts Options, message string) (bool, error) { } func Push(ctx context.Context, opts Options) error { - if err := mirror.Push(ctx, mirrorOptions(opts)); err != nil { + var err error + if strings.TrimSpace(opts.Tag) == "" { + err = mirror.Push(ctx, mirrorOptions(opts)) + } else { + err = mirror.PushSnapshot(ctx, mirrorOptions(opts), opts.Tag) + } + if err != nil { branch := opts.Branch if strings.TrimSpace(branch) == "" { branch = "main" @@ -152,11 +167,37 @@ func Push(ctx context.Context, opts Options) error { return nil } +func ValidateTag(ctx context.Context, opts Options) error { + if strings.TrimSpace(opts.Tag) == "" { + return nil + } + if strings.TrimSpace(opts.Remote) != "" { + if err := mirror.EnsureRemote(ctx, mirrorOptions(opts)); err != nil { + return err + } + } else if err := 
mirror.EnsureRepo(ctx, mirrorOptions(opts)); err != nil { + return err + } + if err := mirror.SyncForWrite(ctx, mirrorOptions(opts)); err != nil { + return err + } + return mirror.ValidateTag(ctx, mirrorOptions(opts), opts.Tag) +} + +func CreateImmutableTag(ctx context.Context, opts Options) (string, error) { + return mirror.CreateImmutableTag(ctx, mirrorOptions(opts), opts.Tag) +} + func Export(ctx context.Context, s *store.Store, opts Options) (Manifest, error) { if err := validateMediaRoots(opts); err != nil { return Manifest{}, err } - if err := EnsureRepo(ctx, opts); err != nil { + if strings.TrimSpace(opts.Remote) != "" { + if err := mirror.EnsureRemote(ctx, mirrorOptions(opts)); err != nil { + return Manifest{}, err + } + } + if err := mirror.SyncForWrite(ctx, mirrorOptions(opts)); err != nil { return Manifest{}, err } filter, err := newSnapshotFilter(ctx, s.DB(), opts.Filter) @@ -617,17 +658,18 @@ func PreviousImportedManifest(ctx context.Context, s *store.Store, opts Options) } func manifestFromGitHistory(ctx context.Context, repoPath string, generatedAt time.Time) (Manifest, error) { - out, err := output(ctx, repoPath, "git", "log", "--format=%H", "--max-count=500", "--", ManifestName) + opts := mirror.Options{RepoPath: repoPath} + commits, err := mirror.CommitsChanging(ctx, opts, ManifestName, 500) if err != nil { return Manifest{}, err } - for hash := range strings.FieldsSeq(out) { - body, err := output(ctx, repoPath, "git", "show", hash+":"+ManifestName) + for _, hash := range commits { + body, _, err := mirror.ReadFileAt(ctx, opts, hash, ManifestName) if err != nil { continue } var manifest Manifest - if err := json.Unmarshal([]byte(body), &manifest); err != nil { + if err := json.Unmarshal(body, &manifest); err != nil { continue } if manifest.GeneratedAt.Equal(generatedAt) { @@ -668,8 +710,8 @@ func enrichManifestFromGit(ctx context.Context, repoPath, rev string, manifest M table.FileManifests = append(table.FileManifests, snapshot.FileManifest{ 
Path: path, Rows: rows, - Size: info.size, - SHA256: "git:" + info.object, + Size: info.Size, + SHA256: "git:" + info.Object, }) } } @@ -685,27 +727,17 @@ func manifestHasFileManifests(manifest Manifest) bool { return true } -type gitTreeFile struct { - object string - size int64 -} - -func gitTreeFiles(ctx context.Context, repoPath, rev string) (map[string]gitTreeFile, error) { +func gitTreeFiles(ctx context.Context, repoPath, rev string) (map[string]mirror.TreeFile, error) { if strings.TrimSpace(rev) == "" { rev = "HEAD" } - out, err := output(ctx, repoPath, "git", "ls-tree", "-r", "-l", rev, "--", "tables") + entries, err := mirror.ListTreeFiles(ctx, mirror.Options{RepoPath: repoPath}, rev, "tables") if err != nil { return nil, err } - files := map[string]gitTreeFile{} - for line := range strings.SplitSeq(out, "\n") { - fields := strings.Fields(line) - if len(fields) < 5 { - continue - } - size, _ := strconv.ParseInt(fields[3], 10, 64) - files[fields[4]] = gitTreeFile{object: fields[2], size: size} + files := make(map[string]mirror.TreeFile, len(entries)) + for _, entry := range entries { + files[entry.Path] = entry } return files, nil } @@ -762,6 +794,10 @@ func ReadManifest(repoPath string) (Manifest, error) { } return Manifest{}, fmt.Errorf("read share manifest: %w", err) } + return parseManifest(data) +} + +func parseManifest(data []byte) (Manifest, error) { var manifest Manifest if err := json.Unmarshal(data, &manifest); err != nil { return Manifest{}, fmt.Errorf("parse share manifest: %w", err) @@ -773,7 +809,92 @@ func ReadManifest(repoPath string) (Manifest, error) { } func mirrorOptions(opts Options) mirror.Options { - return mirror.Options{RepoPath: opts.RepoPath, Remote: opts.Remote, Branch: opts.Branch} + return mirror.Options{RepoPath: opts.RepoPath, Remote: opts.Remote, Branch: opts.Branch, DirMode: 0o750} +} + +// ImportAt restores a snapshot from a Git ref without changing the share checkout. 
+func ImportAt(ctx context.Context, s *store.Store, opts Options, ref string) (Manifest, error) { + ref = strings.TrimSpace(ref) + if ref == "" { + return Import(ctx, s, opts) + } + if err := mirror.Fetch(ctx, mirrorOptions(opts)); err != nil { + return Manifest{}, err + } + manifestBody, commit, err := mirror.ReadFileAt(ctx, mirrorOptions(opts), ref, ManifestName) + if err != nil { + return Manifest{}, err + } + manifest, err := parseManifest(manifestBody) + if err != nil { + return Manifest{}, err + } + tempDir, err := os.MkdirTemp("", "discrawl-share-ref-*") + if err != nil { + return Manifest{}, fmt.Errorf("create historical share directory: %w", err) + } + defer func() { _ = os.RemoveAll(tempDir) }() + if err := os.WriteFile(filepath.Join(tempDir, ManifestName), manifestBody, 0o600); err != nil { + return Manifest{}, fmt.Errorf("write historical manifest: %w", err) + } + for _, table := range manifest.Tables { + for _, file := range tableSnapshotFiles(table) { + if err := materializeRefFile(ctx, mirrorOptions(opts), commit, file, tempDir); err != nil { + return Manifest{}, err + } + } + } + for _, embeddings := range manifest.Embeddings { + if !opts.IncludeEmbeddings { + break + } + for _, file := range embeddings.Files { + if err := materializeRefFile(ctx, mirrorOptions(opts), commit, file, tempDir); err != nil { + return Manifest{}, err + } + } + } + if opts.IncludeMedia && manifest.Media != nil { + for _, file := range manifest.Media.Files { + if err := materializeRefFile(ctx, mirrorOptions(opts), commit, file.Path, tempDir); err != nil { + return Manifest{}, err + } + } + } + historicalOpts := opts + historicalOpts.RepoPath = tempDir + historicalOpts.Remote = "" + historicalOpts.Tag = "" + return Import(ctx, s, historicalOpts) +} + +func tableSnapshotFiles(table TableManifest) []string { + if len(table.Files) > 0 { + return table.Files + } + if strings.TrimSpace(table.File) != "" { + return []string{table.File} + } + return nil +} + +func 
materializeRefFile(ctx context.Context, opts mirror.Options, ref, filePath, targetRoot string) error { + clean := path.Clean(filepath.ToSlash(strings.TrimSpace(filePath))) + if clean == "." || clean == ".." || path.IsAbs(clean) || strings.HasPrefix(clean, "../") || strings.ContainsRune(clean, '\x00') { + return fmt.Errorf("invalid historical share path %q", filePath) + } + body, _, err := mirror.ReadFileAt(ctx, opts, ref, clean) + if err != nil { + return err + } + target := filepath.Join(targetRoot, filepath.FromSlash(clean)) + if err := os.MkdirAll(filepath.Dir(target), 0o750); err != nil { + return fmt.Errorf("create historical share directory: %w", err) + } + if err := os.WriteFile(target, body, 0o600); err != nil { + return fmt.Errorf("write historical share file %s: %w", clean, err) + } + return nil } func NeedsImport(ctx context.Context, s *store.Store, staleAfter time.Duration) bool { @@ -2384,28 +2505,3 @@ func safePathSegment(s string) string { } return clean } - -func run(ctx context.Context, dir, name string, args ...string) error { - out, err := output(ctx, dir, name, args...) - if err != nil { - return fmt.Errorf("%s %s: %w\n%s", name, strings.Join(args, " "), err, strings.TrimSpace(out)) - } - return nil -} - -func output(ctx context.Context, dir, name string, args ...string) (string, error) { - // #nosec G204 -- discrawl invokes the Git executable with argv, never through a shell. - cmd := exec.CommandContext(ctx, name, args...) 
- if dir != "" { - cmd.Dir = dir - } - body, err := cmd.CombinedOutput() - return string(body), err -} - -func isNonFastForwardPush(out string) bool { - lower := strings.ToLower(out) - return strings.Contains(lower, "non-fast-forward") || - strings.Contains(lower, "fetch first") || - strings.Contains(lower, "failed to push some refs") -} diff --git a/internal/share/share_test.go b/internal/share/share_test.go index 6f6b068..9743818 100644 --- a/internal/share/share_test.go +++ b/internal/share/share_test.go @@ -1629,6 +1629,59 @@ func TestPullAndPushWithBareRemote(t *testing.T) { require.FileExists(t, filepath.Join(subscriber, ManifestName)) } +func TestImportAtRestoresTaggedSnapshotWithoutMovingCheckout(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + src := seedStore(t, filepath.Join(dir, "src.db")) + defer func() { _ = src.Close() }() + opts := Options{RepoPath: filepath.Join(dir, "share"), Branch: "main", Tag: "snapshot-old"} + _, err := Export(ctx, src, opts) + require.NoError(t, err) + committed, err := Commit(ctx, opts, "old snapshot") + require.NoError(t, err) + require.True(t, committed) + tag, err := CreateImmutableTag(ctx, opts) + require.NoError(t, err) + require.Equal(t, "snapshot-old", tag) + + now := time.Now().UTC().Format(time.RFC3339Nano) + require.NoError(t, src.UpsertMessages(ctx, []store.MessageMutation{{ + Record: store.MessageRecord{ + ID: "m1", + GuildID: "g1", + ChannelID: "c1", + ChannelName: "general", + AuthorID: "u1", + AuthorName: "Peter", + CreatedAt: now, + Content: "new snapshot", + NormalizedContent: "new snapshot", + RawJSON: `{}`, + }, + EventType: "upsert", + PayloadJSON: `{"id":"m1"}`, + }})) + opts.Tag = "" + _, err = Export(ctx, src, opts) + require.NoError(t, err) + committed, err = Commit(ctx, opts, "new snapshot") + require.NoError(t, err) + require.True(t, committed) + headBefore := strings.TrimSpace(testGitOutput(t, ctx, opts.RepoPath, "rev-parse", "HEAD")) + + dst, err := store.Open(ctx, 
filepath.Join(dir, "dst.db")) + require.NoError(t, err) + defer func() { _ = dst.Close() }() + manifest, err := ImportAt(ctx, dst, opts, "snapshot-old") + require.NoError(t, err) + require.False(t, manifest.GeneratedAt.IsZero()) + results, err := dst.SearchMessages(ctx, store.SearchOptions{Query: "launch", Limit: 10}) + require.NoError(t, err) + require.Len(t, results, 1) + require.Equal(t, "launch checklist ready", results[0].Content) + require.Equal(t, headBefore, strings.TrimSpace(testGitOutput(t, ctx, opts.RepoPath, "rev-parse", "HEAD"))) +} + func TestPushRebasesRemoteReadmeUpdates(t *testing.T) { ctx := context.Background() src := seedStore(t, filepath.Join(t.TempDir(), "src.db")) @@ -1651,11 +1704,11 @@ func TestPushRebasesRemoteReadmeUpdates(t *testing.T) { require.NoError(t, Push(ctx, opts)) reporter := filepath.Join(dir, "reporter") - require.NoError(t, run(ctx, dir, "git", "clone", "--branch", "main", remote, reporter)) + testGitRun(t, ctx, dir, "clone", "--branch", "main", remote, reporter) configureGitUser(t, reporter) require.NoError(t, os.WriteFile(filepath.Join(reporter, "README.md"), []byte("report: first\n\nfield notes: fresh\n"), 0o600)) - require.NoError(t, run(ctx, reporter, "git", "commit", "-am", "docs: update field notes")) - require.NoError(t, run(ctx, reporter, "git", "push", "-u", "origin", "main")) + testGitRun(t, ctx, reporter, "commit", "-am", "docs: update field notes") + testGitRun(t, ctx, reporter, "push", "-u", "origin", "main") require.NoError(t, os.WriteFile(filepath.Join(publisher, "README.md"), []byte("report: second\n\nfield notes: old\n"), 0o600)) committed, err = Commit(ctx, opts, "test: update report") @@ -1732,7 +1785,6 @@ func TestRepoCommandEdges(t *testing.T) { err := Push(ctx, Options{RepoPath: repo, Branch: "main"}) require.ErrorContains(t, err, "git push -u origin main") - require.ErrorContains(t, run(ctx, repo, "git", "definitely-not-a-command"), "git definitely-not-a-command") } func 
TestShareSmallHelpersAndValidation(t *testing.T) { @@ -1753,9 +1805,6 @@ func TestShareSmallHelpersAndValidation(t *testing.T) { require.Equal(t, "plain", stringValue("plain")) require.Equal(t, "42", stringValue(json.Number("42"))) require.Empty(t, stringValue(42)) - require.True(t, isNonFastForwardPush("failed to push some refs; fetch first")) - require.True(t, isNonFastForwardPush("non-fast-forward")) - require.False(t, isNonFastForwardPush("everything up-to-date")) query, args := snapshotExportQuery("messages") require.Equal(t, "select * from messages where guild_id != ?", query) @@ -1804,7 +1853,7 @@ func TestShareSmallHelpersAndValidation(t *testing.T) { Options{Progress: func(progress ImportProgress) { seen = append(seen, progress) }}.reportProgress(ImportProgress{Phase: "phase"}) require.Equal(t, []ImportProgress{{Phase: "phase"}}, seen) Options{}.reportProgress(ImportProgress{Phase: "ignored"}) - require.Equal(t, mirror.Options{RepoPath: "repo", Remote: "origin", Branch: "main"}, mirrorOptions(Options{RepoPath: "repo", Remote: "origin", Branch: "main"})) + require.Equal(t, mirror.Options{RepoPath: "repo", Remote: "origin", Branch: "main", DirMode: 0o750}, mirrorOptions(Options{RepoPath: "repo", Remote: "origin", Branch: "main"})) var buf bytes.Buffer cw := &countingWriter{w: &buf} @@ -2255,6 +2304,20 @@ func seedDirectMessageData(t *testing.T, ctx context.Context, s *store.Store) { require.NoError(t, s.SetSyncState(ctx, "wiretap:last_import", now.Format(time.RFC3339))) } +func testGitRun(t *testing.T, ctx context.Context, dir string, args ...string) { + t.Helper() + _ = testGitOutput(t, ctx, dir, args...) +} + +func testGitOutput(t *testing.T, ctx context.Context, dir string, args ...string) string { + t.Helper() + cmd := exec.CommandContext(ctx, "git", args...) 
+ cmd.Dir = dir + body, err := cmd.CombinedOutput() + require.NoError(t, err, "%s", body) + return string(body) +} + func configureGitUser(t *testing.T, repo string) { t.Helper() // #nosec G204 -- fixed git argv in test setup. diff --git a/internal/store/query.go b/internal/store/query.go index 8752b12..1756825 100644 --- a/internal/store/query.go +++ b/internal/store/query.go @@ -10,6 +10,7 @@ import ( "strings" "time" + crawlstore "github.com/openclaw/crawlkit/store" "github.com/openclaw/crawlkit/vector" "github.com/openclaw/discrawl/internal/store/storedb" ) @@ -1038,7 +1039,7 @@ func normalizeFTSQuery(raw string) string { } fields := strings.Fields(raw) for i, field := range fields { - fields[i] = `"` + strings.ReplaceAll(field, `"`, " ") + `"` + fields[i] = crawlstore.FTS5Phrase(strings.ReplaceAll(field, `"`, " ")) } return strings.Join(fields, " ") } From 16414d0d8d197bc67148c0187edf4d6b794f7674 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 19 Jun 2026 07:54:32 -0400 Subject: [PATCH 2/3] build(deps): update CrawlKit --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index ca3c153..aae3abb 100644 --- a/go.mod +++ b/go.mod @@ -44,7 +44,7 @@ require ( github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect - github.com/openclaw/crawlkit v0.12.3-0.20260619113933-ca6cd668be8b + github.com/openclaw/crawlkit v0.12.3-0.20260619115105-eb1aa35e0e78 github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rivo/uniseg v0.4.7 // indirect diff --git a/go.sum b/go.sum index 1d9ed7f..0b4402a 100644 --- a/go.sum +++ b/go.sum @@ -71,8 +71,8 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= 
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= -github.com/openclaw/crawlkit v0.12.3-0.20260619113933-ca6cd668be8b h1:f6NCwJYOsApd5mOfNiI2nGmd/Ohpd0lLAqDKz3KmqLA= -github.com/openclaw/crawlkit v0.12.3-0.20260619113933-ca6cd668be8b/go.mod h1:zOJv5WPWO1AuuXO7zW8NRTxb/ZTkIQXYPrx3StmnMUI= +github.com/openclaw/crawlkit v0.12.3-0.20260619115105-eb1aa35e0e78 h1:7ebiHhILVHAJT+b4h2CVu/xeUKsLN3nNSfq994uo/3I= +github.com/openclaw/crawlkit v0.12.3-0.20260619115105-eb1aa35e0e78/go.mod h1:zOJv5WPWO1AuuXO7zW8NRTxb/ZTkIQXYPrx3StmnMUI= github.com/pelletier/go-toml/v2 v2.4.0 h1:Mwu0mAkUKbittDs3/ADDWXqMmq3EOK2VHiuCkV00Row= github.com/pelletier/go-toml/v2 v2.4.0/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= From 26a9eb17cdeafa2fab6d3c5aaf922ed747b10a0a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 19 Jun 2026 08:07:43 -0400 Subject: [PATCH 3/3] test: cover historical snapshot paths --- internal/share/share.go | 5 +- internal/share/share_test.go | 137 ++++++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 4 deletions(-) diff --git a/internal/share/share.go b/internal/share/share.go index 0b2c343..71ae39d 100644 --- a/internal/share/share.go +++ b/internal/share/share.go @@ -178,10 +178,13 @@ func ValidateTag(ctx context.Context, opts Options) error { } else if err := mirror.EnsureRepo(ctx, mirrorOptions(opts)); err != nil { return err } + if err := mirror.ValidateTag(ctx, mirrorOptions(opts), opts.Tag); err != nil { + return err + } if err := mirror.SyncForWrite(ctx, mirrorOptions(opts)); err != nil { return err } - return mirror.ValidateTag(ctx, mirrorOptions(opts), opts.Tag) + return nil } func CreateImmutableTag(ctx context.Context, opts Options) (string, error) { diff --git a/internal/share/share_test.go 
b/internal/share/share_test.go index 9743818..9508c6a 100644 --- a/internal/share/share_test.go +++ b/internal/share/share_test.go @@ -76,6 +76,8 @@ func TestExportImportRoundTrip(t *testing.T) { require.NoError(t, err) require.False(t, changed) require.Equal(t, manifest.GeneratedAt, imported.GeneratedAt) + _, err = ImportAt(ctx, dst, Options{RepoPath: repo, Branch: "main"}, "") + require.NoError(t, err) } func TestExportImportRestoresMediaFiles(t *testing.T) { @@ -1621,6 +1623,10 @@ func TestPullAndPushWithBareRemote(t *testing.T) { committed, err := Commit(ctx, opts, "test: snapshot") require.NoError(t, err) require.True(t, committed) + opts.Tag = "snapshot/test" + tag, err := CreateImmutableTag(ctx, opts) + require.NoError(t, err) + require.Equal(t, "snapshot/test", tag) require.NoError(t, Push(ctx, opts)) subscriber := filepath.Join(dir, "subscriber") @@ -1629,13 +1635,126 @@ func TestPullAndPushWithBareRemote(t *testing.T) { require.FileExists(t, filepath.Join(subscriber, ManifestName)) } +func TestValidateTagBeforeRemoteSync(t *testing.T) { + ctx := context.Background() + repo := filepath.Join(t.TempDir(), "share") + require.NoError(t, EnsureRepo(ctx, Options{RepoPath: repo, Branch: "main"})) + err := ValidateTag(ctx, Options{ + RepoPath: repo, + Remote: "https://example.invalid/archive.git", + Branch: "main", + Tag: "bad tag", + }) + require.ErrorContains(t, err, "invalid snapshot tag") + require.NoError(t, ValidateTag(ctx, Options{})) + require.Error(t, ValidateTag(ctx, Options{Remote: "remote", Tag: "snapshot/valid"})) + require.Error(t, ValidateTag(ctx, Options{Tag: "snapshot/valid"})) + localRepo := filepath.Join(t.TempDir(), "local-share") + require.NoError(t, ValidateTag(ctx, Options{RepoPath: localRepo, Branch: "main", Tag: "snapshot/valid"})) + require.Equal(t, []string{"tables/messages/000001.jsonl.gz"}, tableSnapshotFiles(TableManifest{Files: []string{"tables/messages/000001.jsonl.gz"}})) + require.Equal(t, []string{"tables/messages.jsonl.gz"}, 
tableSnapshotFiles(TableManifest{File: "tables/messages.jsonl.gz"})) + require.Nil(t, tableSnapshotFiles(TableManifest{})) + err = materializeRefFile(ctx, mirror.Options{}, "HEAD", "../escape", t.TempDir()) + require.ErrorContains(t, err, "invalid historical share path") + require.NoError(t, os.MkdirAll(filepath.Join(localRepo, "tables"), 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(localRepo, "tables", "sample.txt"), []byte("sample\n"), 0o600)) + committed, err := mirror.Commit(ctx, mirror.Options{RepoPath: localRepo}, "sample") + require.NoError(t, err) + require.True(t, committed) + materialized := t.TempDir() + require.NoError(t, materializeRefFile(ctx, mirror.Options{RepoPath: localRepo}, "HEAD", "tables/sample.txt", materialized)) + require.Equal(t, []byte("sample\n"), mustReadFile(t, filepath.Join(materialized, "tables", "sample.txt"))) + require.NoError(t, os.WriteFile(filepath.Join(localRepo, ManifestName), []byte(`{`), 0o600)) + committed, err = mirror.Commit(ctx, mirror.Options{RepoPath: localRepo}, "malformed manifest") + require.NoError(t, err) + require.True(t, committed) + _, err = ImportAt(ctx, nil, Options{RepoPath: localRepo}, "HEAD") + require.ErrorContains(t, err, "parse share manifest") +} + +func TestHistoricalRefErrorPaths(t *testing.T) { + ctx := context.Background() + repo := filepath.Join(t.TempDir(), "share") + require.NoError(t, EnsureRepo(ctx, Options{RepoPath: repo, Branch: "main"})) + require.NoError(t, os.WriteFile(filepath.Join(repo, "sample.txt"), []byte("sample\n"), 0o600)) + committed, err := mirror.Commit(ctx, mirror.Options{RepoPath: repo}, "sample") + require.NoError(t, err) + require.True(t, committed) + _, err = ImportAt(ctx, nil, Options{RepoPath: repo}, "HEAD") + require.Error(t, err) + _, err = ImportAt(ctx, nil, Options{}, "HEAD") + require.Error(t, err) + + writeManifest := func(manifest Manifest, message string) { + t.Helper() + body, marshalErr := json.Marshal(manifest) + require.NoError(t, marshalErr) + 
require.NoError(t, os.WriteFile(filepath.Join(repo, ManifestName), body, 0o600)) + changed, commitErr := mirror.Commit(ctx, mirror.Options{RepoPath: repo}, message) + require.NoError(t, commitErr) + require.True(t, changed) + } + writeManifest(Manifest{Version: 1, GeneratedAt: time.Now().UTC(), Tables: []TableManifest{{Name: "messages", Files: []string{"tables/missing.jsonl.gz"}}}}, "missing table") + _, err = ImportAt(ctx, nil, Options{RepoPath: repo}, "HEAD") + require.Error(t, err) + + writeManifest(Manifest{Version: 1, GeneratedAt: time.Now().UTC(), Embeddings: []EmbeddingManifest{{Files: []string{"embeddings/missing.jsonl.gz"}}}}, "skipped embeddings") + dst, err := store.Open(ctx, filepath.Join(t.TempDir(), "dst.db")) + require.NoError(t, err) + defer func() { _ = dst.Close() }() + _, _ = ImportAt(ctx, dst, Options{RepoPath: repo}, "HEAD") + _, err = ImportAt(ctx, dst, Options{RepoPath: repo, IncludeEmbeddings: true}, "HEAD") + require.Error(t, err) + + writeManifest(Manifest{Version: 1, GeneratedAt: time.Now().UTC(), Media: &MediaManifest{Files: []snapshot.FileManifest{{Path: "media/missing.gz"}}}}, "missing media") + _, err = ImportAt(ctx, dst, Options{RepoPath: repo, IncludeMedia: true}, "HEAD") + require.Error(t, err) + + blockedParent := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(blockedParent, "tables"), []byte("blocked"), 0o600)) + err = materializeRefFile(ctx, mirror.Options{RepoPath: repo}, "HEAD~3", "sample.txt", filepath.Join(blockedParent, "tables", "child")) + require.Error(t, err) + writeTarget := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(writeTarget, "sample.txt"), 0o755)) + err = materializeRefFile(ctx, mirror.Options{RepoPath: repo}, "HEAD~3", "sample.txt", writeTarget) + require.Error(t, err) +} + func TestImportAtRestoresTaggedSnapshotWithoutMovingCheckout(t *testing.T) { ctx := context.Background() dir := t.TempDir() src := seedStore(t, filepath.Join(dir, "src.db")) defer func() { _ = src.Close() }() - 
opts := Options{RepoPath: filepath.Join(dir, "share"), Branch: "main", Tag: "snapshot-old"} - _, err := Export(ctx, src, opts) + mediaBody := []byte("historical media") + mediaSum := sha256.Sum256(mediaBody) + mediaHash := hex.EncodeToString(mediaSum[:]) + mediaPath := filepath.ToSlash(filepath.Join("attachments", mediaHash[:2], mediaHash+"-history.txt")) + require.NoError(t, addCachedAttachment(ctx, src, mediaPath, mediaHash, int64(len(mediaBody)))) + srcCache := filepath.Join(dir, "src-cache") + srcMedia, err := media.LocalPath(srcCache, mediaPath) + require.NoError(t, err) + require.NoError(t, os.MkdirAll(filepath.Dir(srcMedia), 0o755)) + require.NoError(t, os.WriteFile(srcMedia, mediaBody, 0o600)) + embeddingBlob, err := store.EncodeEmbeddingVector([]float32{1, 0.5}) + require.NoError(t, err) + _, err = src.DB().ExecContext(ctx, ` + insert into message_embeddings( + message_id, provider, model, input_version, dimensions, embedding_blob, embedded_at + ) values ('m1', 'openai', 'text-embedding-3-small', ?, 2, ?, ?) 
+ `, store.EmbeddingInputVersion, embeddingBlob, time.Now().UTC().Format(time.RFC3339Nano)) + require.NoError(t, err) + opts := Options{ + RepoPath: filepath.Join(dir, "share"), + CacheDir: srcCache, + Branch: "main", + Tag: "snapshot-old", + IncludeMedia: true, + IncludeEmbeddings: true, + EmbeddingProvider: "openai", + EmbeddingModel: "text-embedding-3-small", + EmbeddingInputVersion: store.EmbeddingInputVersion, + } + _, err = Export(ctx, src, opts) require.NoError(t, err) committed, err := Commit(ctx, opts, "old snapshot") require.NoError(t, err) @@ -1672,13 +1791,21 @@ func TestImportAtRestoresTaggedSnapshotWithoutMovingCheckout(t *testing.T) { dst, err := store.Open(ctx, filepath.Join(dir, "dst.db")) require.NoError(t, err) defer func() { _ = dst.Close() }() - manifest, err := ImportAt(ctx, dst, opts, "snapshot-old") + restoreOpts := opts + restoreOpts.CacheDir = filepath.Join(dir, "dst-cache") + manifest, err := ImportAt(ctx, dst, restoreOpts, "snapshot-old") require.NoError(t, err) require.False(t, manifest.GeneratedAt.IsZero()) results, err := dst.SearchMessages(ctx, store.SearchOptions{Query: "launch", Limit: 10}) require.NoError(t, err) require.Len(t, results, 1) require.Equal(t, "launch checklist ready", results[0].Content) + restoredMedia, err := media.LocalPath(restoreOpts.CacheDir, mediaPath) + require.NoError(t, err) + require.Equal(t, mediaBody, mustReadFile(t, restoredMedia)) + var embeddingCount int + require.NoError(t, dst.DB().QueryRowContext(ctx, `select count(*) from message_embeddings where message_id = 'm1'`).Scan(&embeddingCount)) + require.Equal(t, 1, embeddingCount) require.Equal(t, headBefore, strings.TrimSpace(testGitOutput(t, ctx, opts.RepoPath, "rev-parse", "HEAD"))) } @@ -1778,6 +1905,8 @@ func TestRepoCommandEdges(t *testing.T) { ctx := context.Background() require.ErrorContains(t, EnsureRepo(ctx, Options{}), "repo path is empty") require.NoError(t, Pull(ctx, Options{})) + require.Error(t, Pull(ctx, Options{Remote: "remote"})) + 
require.NoError(t, Pull(ctx, Options{RepoPath: filepath.Join(t.TempDir(), "local-share")})) repo := filepath.Join(t.TempDir(), "repo") require.NoError(t, os.MkdirAll(filepath.Join(repo, ".git"), 0o755)) @@ -1785,6 +1914,8 @@ func TestRepoCommandEdges(t *testing.T) { err := Push(ctx, Options{RepoPath: repo, Branch: "main"}) require.ErrorContains(t, err, "git push -u origin main") + err = Push(ctx, Options{RepoPath: repo}) + require.ErrorContains(t, err, "git push -u origin main") } func TestShareSmallHelpersAndValidation(t *testing.T) {