From fcb86fdf461f93bd4b9ff49ec5e1fdb6a39c78c5 Mon Sep 17 00:00:00 2001 From: 3vilM33pl3 Date: Fri, 2 Jan 2026 18:23:37 +0000 Subject: [PATCH 1/3] feat: Add PGVector support to setup and doctor commands --- src/commands/doctor.rs | 79 +++++++++++++++++++++++++++++++++++++++++- src/commands/setup.rs | 60 ++++++++++++++++++++++++++++++-- 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/src/commands/doctor.rs b/src/commands/doctor.rs index e970bd9..4da5d29 100644 --- a/src/commands/doctor.rs +++ b/src/commands/doctor.rs @@ -210,6 +210,9 @@ impl Doctor { // Display schema validation results self.display_schema_results(&schema_result)?; + // Display extension validation results + self.display_extension_results(&schema_result)?; + Ok(()) } @@ -264,6 +267,11 @@ impl Doctor { result["status"] = json!("failed"); } + // Update overall status if extension validation failed + if schema_result["extensions"]["vector"]["status"].as_str() == Some("failed") { + result["status"] = json!("failed"); + } + Ok(result) } @@ -302,7 +310,46 @@ impl Doctor { })?; // Validate database schema - self.validate_database_schema(&client).await + let mut full_result = self.validate_database_schema(&client).await?; + + // Validate extensions + let extension_result = self.validate_extensions(&client).await?; + full_result["extensions"] = extension_result; + + Ok(full_result) + } + + fn display_extension_results( + &self, + schema_result: &serde_json::Value, + ) -> Result<(), BucketError> { + let empty_map = serde_json::Map::new(); + let extensions = schema_result["extensions"] + .as_object() + .unwrap_or(&empty_map); + + if extensions.is_empty() { + return Ok(()); + } + + println!(); + println!("Database Extensions"); + println!("-------------------"); + + let vector_installed = extensions["vector"]["installed"].as_bool().unwrap_or(false); + let vector_version = extensions["vector"]["version"].as_str().unwrap_or("N/A"); + + if vector_installed { + println!( + "✅ vector extension installed (version: {})", + vector_version + ); + } else { + println!("❌ vector extension not installed"); + println!(" Run 'buckets setup' to install it"); + } + + Ok(()) } fn display_schema_results(&self, schema_result: &serde_json::Value) -> Result<(), BucketError> { @@ -675,4 +722,34 @@ impl Doctor { _ => vec![], } } + + async fn validate_extensions( + &self, + client: &tokio_postgres::Client, + ) -> Result { + let query = " + SELECT extname, extversion + FROM pg_extension + WHERE extname = 'vector' + "; + + let rows = client.query(query, &[]).await.map_err(|e| { + BucketError::from(format!("Failed to query extensions: {}", e).as_str()) + })?; + + let (installed, version) = if let Some(row) = rows.first() { + let version: String = row.get(1); + (true, version) + } else { + (false, "N/A".to_string()) + }; + + Ok(json!({ + "vector": { + "status": if installed { "passed" } else { "failed" }, + "installed": installed, + "version": version + } + })) + } } diff --git a/src/commands/setup.rs b/src/commands/setup.rs index 682e42b..c52a990 100644 --- a/src/commands/setup.rs +++ b/src/commands/setup.rs @@ -43,6 +43,7 @@ impl BucketCommand for Setup { let items = vec![ "Configure PostgreSQL", "Configure NTP Server", + "Install PostgreSQL Extensions (pgvector)", "Test Connection", "Save & Exit", "Cancel", @@ -68,6 +69,17 @@ impl BucketCommand for Setup { config = self.configure_ntp_server(config)?; } Some(2) => { + if let Err(e) = self.install_extensions(&config) { + eprintln!("\n❌ Extension installation failed: {}", e); + } + // Pause to let user read output + if let Ok(_) = Input::::new() + .with_prompt("Press Enter to continue") + .allow_empty(true) + .interact_text() + {} + } + Some(3) => { if let Err(e) = self.test_database_connection(&config) { eprintln!("\n❌ Connection failed: {}", e); } else { @@ -81,7 +93,7 @@ impl BucketCommand for Setup { .interact_text() {} } - Some(3) => { + Some(4) => { config.save()?; println!("\n✅ Global configuration saved successfully!"); println!( @@ -90,7 +102,7 @@ impl BucketCommand for Setup { ); break; } - Some(4) => { + Some(5) => { println!("\n❌ Configuration cancelled. Changes were not saved."); break; } @@ -132,6 +144,50 @@ impl Setup { Ok(config) } + fn install_extensions(&self, config: &GlobalConfig) -> Result<(), BucketError> { + println!("\nInstalling PostgreSQL Extensions..."); + + if let Some(ref conn_str) = config.postgresql_connection { + RuntimeManager::block_on(async { self.install_pgvector(conn_str).await })?; + println!("✅ Extension installation completed!"); + } else { + println!("⚠️ No PostgreSQL connection string configured."); + } + + Ok(()) + } + + async fn install_pgvector(&self, connection_string: &str) -> Result<(), BucketError> { + let db_config = DatabaseConfig::from_url(connection_string)?; + + let mut cfg = Config::new(); + cfg.host = Some(db_config.host); + cfg.port = Some(db_config.port); + cfg.user = Some(db_config.username); + cfg.password = db_config.password; + cfg.dbname = Some(db_config.database); + cfg.connect_timeout = Some(Duration::from_secs(10)); + + let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).map_err(|e| { + BucketError::from(format!("Failed to create connection pool: {}", e).as_str()) + })?; + + let client = pool.get().await.map_err(|e| { + BucketError::from(format!("Failed to connect to PostgreSQL database: {}", e).as_str()) + })?; + + println!(" Enabling 'vector' extension..."); + client + .execute("CREATE EXTENSION IF NOT EXISTS vector", &[]) + .await + .map_err(|e| { + BucketError::from(format!("Failed to enable vector extension: {}", e).as_str()) + })?; + println!(" ✅ 'vector' extension enabled"); + + Ok(()) + } + fn test_database_connection(&self, config: &GlobalConfig) -> Result<(), BucketError> { println!("\nTesting Database Connection..."); From 7243f88a11939dc3e7db418c39fe1f2d75360780 Mon Sep 17 00:00:00 2001 From: 3vilM33pl3 Date: Fri, 2 Jan 2026 18:24:00 +0000 Subject: [PATCH 2/3] chore: Bump version to 0.4.1 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dcdc595..469e0c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -439,7 +439,7 @@ dependencies = [ [[package]] name = "buckets" -version = "0.4.0" +version = "0.4.1" dependencies = [ "arrow", "assert_cmd", diff --git a/Cargo.toml b/Cargo.toml index a322b7d..d86fdee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "buckets" -version = "0.4.0" +version = "0.4.1" edition = "2021" [dependencies] From 8efb23407c4fa54e886624c228fa03304393aedf Mon Sep 17 00:00:00 2001 From: 3vilM33pl3 Date: Fri, 6 Feb 2026 17:48:04 +0000 Subject: [PATCH 3/3] feat: Add semantic embedding and duplicate detection to expectations Integrate pgvector for semantic search on expectations: - V3 migration: pgvector extension, embedding vector(384) column, HNSW index - EmbeddingGenerator using all-MiniLM-L6-v2 via Candle (lazy-loaded, cached) - Duplicate detection at 85% cosine similarity threshold in expect command - Doctor command validates pgvector extension and expectations/pebbles schema - EmbeddingError variant for proper error semantics - Fix all clippy warnings and run cargo fmt across codebase Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + CLAUDE.md | 57 +- Cargo.lock | 1125 ++++++++++++++++++++++- Cargo.toml | 6 + README.md | 2 + docs/expectation_system.md | 9 + src/bootstrap.rs | 6 +- src/commands/check.rs | 2 +- src/commands/commit.rs | 14 +- src/commands/completions.rs | 8 +- src/commands/config.rs | 18 +- src/commands/create.rs | 5 +- src/commands/doctor.rs | 21 +- src/commands/expect.rs | 38 +- src/commands/list.rs | 22 +- src/commands/mod.rs | 2 +- src/commands/revert.rs | 6 +- src/commands/rollback.rs | 28 +- src/commands/setup.rs | 6 +- src/commands/status.rs | 15 +- src/data/bucket.rs | 23 +- src/data/commit.rs | 45 +- src/data/expectation.rs | 71 +- src/errors.rs | 14 + src/postgres_db.rs | 2 +- src/sql/migrations/V3__add_pgvector.sql | 10 + src/test_support.rs | 6 +- src/utils/checks.rs | 4 +- src/utils/compression.rs | 17 +- src/utils/embeddings.rs | 177 ++++ src/utils/mod.rs | 1 + src/utils/utils.rs | 17 +- tests/common.rs | 6 +- tests/test_cli_completions.rs | 1 - tests/test_cli_expect.rs | 37 +- tests/test_cli_finalize.rs | 4 +- tests/tests.rs | 5 +- 37 files changed, 1630 insertions(+), 201 deletions(-) create mode 100644 src/sql/migrations/V3__add_pgvector.sql create mode 100644 src/utils/embeddings.rs diff --git a/.gitignore b/.gitignore index 61a4f8f..ad3aa28 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ test_repo_no_db_init/ # AI Context .claude/ .linear_mcp.sh +.brv/ diff --git a/CLAUDE.md b/CLAUDE.md index 5c50109..3d3b5ff 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -36,6 +36,12 @@ cargo test -- --ignored - Tests use `#[serial]` from the `serial_test` crate for tests that need sequential execution - Some tests use `#[ignore]` and must be explicitly run with `-- --ignored` +### Environment Variables for Tests +Skip Docker-dependent tests when Docker is unavailable: +- `BUCKETS_SKIP_DOCKER_TESTS=1` - Skip tests requiring Docker +- `BUCKETS_SKIP_DB_TESTS=1` - Skip database tests +- `NO_NETWORK=1` - Skip tests requiring network + ## Architecture Overview ### Command Structure and Pattern @@ -91,13 +97,20 @@ Buckets has a two-tier configuration system: - Can override global settings ### Database & Storage -- **PostgreSQL** for data persistence (previously DuckDB) -- Schema management in `src/postgres_db/` -- File storage: Content-addressable in `.buckets/storage/` +- **PostgreSQL** with **pgvector** extension for data persistence and semantic search +- Schema in `src/postgres_db.rs`, migrations in `src/sql/migrations/` +- File storage: Content-addressable in `.b/storage/` within each bucket - File hashing: BLAKE3 for content integrity - Compression: zstd for efficient storage - UUID-based object identification +### Semantic Search (pgvector) +- Expectations use vector embeddings for duplicate detection +- Embedding model: `all-MiniLM-L6-v2` (384 dimensions) via Candle +- Model is lazily loaded and cached globally in `src/utils/embeddings.rs` +- First run downloads ~90MB model from HuggingFace Hub +- HNSW index for fast cosine similarity search (>85% threshold warns for duplicates) + ### Thread-Local State Defined in [main.rs](src/main.rs): - `CURRENT_DIR`: Current working directory (used throughout the codebase) @@ -123,9 +136,14 @@ All errors use the centralized `BucketError` enum with: - Uses `tempfile` crate for isolated test environments - Uses `serial_test` crate with `#[serial]` for tests requiring sequential execution - Uses `assert_cmd` for CLI testing +- Uses `testcontainers` with `pgvector/pgvector:pg16` Docker image for database tests Test naming convention: `test_cli_` +### Test Fixtures +- `TestDatabase` - Spins up a PostgreSQL container with pgvector, sets `DATABASE_URL` env var, auto-cleans on drop +- `RepoFixture` - Creates a complete test repository with initialized database and bucket + ## Project Structure ``` buckets/ @@ -134,23 +152,15 @@ buckets/ │ ├── main.rs # Entry point, command dispatch │ ├── errors.rs # Error types │ ├── world.rs # Global state +│ ├── postgres_db.rs # Database connection and operations │ ├── commands/ # Command implementations │ │ ├── mod.rs # BucketCommand trait -│ │ ├── init.rs -│ │ ├── create.rs -│ │ └── ... -│ ├── data/ # Core data structures -│ │ ├── bucket.rs # Bucket type -│ │ └── commit.rs # Commit type -│ ├── utils/ # Utility functions -│ │ ├── checks.rs # Validation functions -│ │ ├── security.rs # Path security -│ │ ├── compression.rs # File compression -│ │ └── ... -│ └── postgres_db/ # Database layer -├── tests/ # Integration tests -│ ├── common.rs # Test utilities -│ └── test_cli_*.rs # Per-command tests +│ │ ├── init.rs, create.rs, commit.rs, expect.rs, ... +│ ├── data/ # Core data structures (Bucket, Commit) +│ ├── utils/ # Utility functions (compression, security, embeddings) +│ └── sql/migrations/ # PostgreSQL schema migrations (V1, V2, V3) +├── tests/ # Integration tests (test_cli_*.rs) +│ └── common.rs # Test fixtures (TestDatabase, RepoFixture) └── debian/ # Debian packaging ``` @@ -162,10 +172,11 @@ A buckets repository has this structure: repo_name/ ├── .buckets/ │ ├── config # Repository configuration (TOML) -│ ├── buckets.db # Database file -│ └── storage/ # Compressed file storage +│ └── database_type # "PostgreSQL" marker file └── bucket_name/ # Individual buckets - └── .bucket_meta # Bucket metadata + └── .b/ + ├── info # Bucket metadata (TOML: id, name, relative_bucket_path) + └── storage/ # Compressed file storage (content-addressable) ``` ### Static Arguments @@ -175,3 +186,7 @@ The `ARGS` static in [main.rs](src/main.rs:23) is initialized lazily using `once - Build scripts: `build-deb.sh` (clean build) and `build-deb-fast.sh` (incremental) - Package files in `debian/` directory - Makefile at `Makefile.deb` for package building + +### Diagnostics +- `buckets doctor` - System diagnostics command that tests database connectivity, NTP server, and pgvector availability +- Useful for troubleshooting setup issues diff --git a/Cargo.lock b/Cargo.lock index 469e0c1..119d1f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -105,6 +105,15 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "arrayref" version = "0.3.9" @@ -191,7 +200,7 @@ dependencies = [ "arrow-schema", "arrow-select", "atoi", - "base64", + "base64 0.22.1", "chrono", "half", "lexical-core", @@ -382,12 +391,39 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.9.1" @@ -445,6 +481,9 @@ dependencies = [ "assert_cmd", "async-trait", "blake3", + "candle-core", + "candle-nn", + "candle-transformers", "chrono", "clap", "clap_complete", @@ -452,9 +491,11 @@ dependencies = [ "dialoguer", "dirs", "env_logger", + "hf-hub", "log 0.4.27", "ntp", "once_cell", + "pgvector", "postgres", "predicates", "refinery", @@ -465,6 +506,7 @@ dependencies = [ "tempfile", "testcontainers", "thiserror 2.0.12", + "tokenizers", "tokio", "tokio-postgres", "toml", @@ -479,6 +521,26 @@ version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +[[package]] +name = "bytemuck" +version = "1.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.103", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -491,6 +553,61 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +[[package]] +name = "candle-core" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e1a39b963e261c58017edf2007e5b63425ad21538aaaf51fe23d1da41703701" +dependencies = [ + "byteorder", + "gemm", + "half", + "memmap2", + "num-traits", + "num_cpus", + "rand 0.8.5", + "rand_distr", + "rayon", + "safetensors", + "thiserror 1.0.69", + "yoke 0.7.5", + "zip", +] + +[[package]] +name = "candle-nn" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898f8d21b8bdf559a1c8635e2db8386b2134015cd3003c18c1a30a22a67daec6" +dependencies = [ + "candle-core", + "half", + "num-traits", + "rayon", + "safetensors", + "serde", + "thiserror 1.0.69", +] + +[[package]] +name = "candle-transformers" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06b8a130a8ac1d1e20696d89f7a52948902e037ad0eec085fceb77021007cfee" +dependencies = [ + "byteorder", + "candle-core", + "candle-nn", + "fancy-regex", + "num-traits", + "rand 0.8.5", + "rayon", + "serde", + "serde_json", + "serde_plain", + "tracing", +] + [[package]] name = "cc" version = "1.2.27" @@ -520,7 +637,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-link", + "windows-link 0.1.1", ] [[package]] @@ -626,6 +743,16 @@ dependencies = [ "custom_derive", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -641,6 +768,40 @@ dependencies = [ "libc", ] +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crunchy" version = "0.2.3" @@ -690,8 +851,18 @@ version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.13.4", + "darling_macro 0.13.4", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", ] [[package]] @@ -708,17 +879,42 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.11.1", + "syn 2.0.103", +] + [[package]] name = "darling_macro" version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" dependencies = [ - "darling_core", + "darling_core 0.13.4", "quote", "syn 1.0.109", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.103", +] + [[package]] name = "deadpool" version = "0.10.0" @@ -761,6 +957,48 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.103", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.103", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.103", +] + [[package]] name = "dialoguer" version = "0.11.0" @@ -829,12 +1067,40 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "dyn-stack" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e53799688f5632f364f8fb387488dd05db9fe45db7011be066fc20e7027f8b" +dependencies = [ + "bytemuck", + "reborrow", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encode_unicode" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.103", +] + [[package]] name = "env_filter" version = "0.1.3" @@ -883,12 +1149,29 @@ dependencies = [ "version_check", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" + [[package]] name = "fallible-iterator" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -901,10 +1184,20 @@ version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags", + "bitflags 2.9.1", "rustc_version", ] +[[package]] +name = "flate2" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "float-cmp" version = "0.10.0" @@ -920,6 +1213,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1018,6 +1326,124 @@ dependencies = [ "slab", ] +[[package]] +name = "gemm" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ab24cc62135b40090e31a76a9b2766a501979f3070fa27f689c27ec04377d32" +dependencies = [ + "dyn-stack", + "gemm-c32", + "gemm-c64", + "gemm-common", + "gemm-f16", + "gemm-f32", + "gemm-f64", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-c32" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9c030d0b983d1e34a546b86e08f600c11696fde16199f971cd46c12e67512c0" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-c64" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbb5f2e79fefb9693d18e1066a557b4546cd334b226beadc68b11a8f9431852a" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-common" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2e7ea062c987abcd8db95db917b4ffb4ecdfd0668471d8dc54734fdff2354e8" +dependencies = [ + "bytemuck", + "dyn-stack", + "half", + "num-complex", + "num-traits", + "once_cell", + "paste", + "pulp", + "raw-cpuid", + "rayon", + "seq-macro", + "sysctl", +] + +[[package]] +name = "gemm-f16" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ca4c06b9b11952071d317604acb332e924e817bd891bec8dfb494168c7cedd4" +dependencies = [ + "dyn-stack", + "gemm-common", + "gemm-f32", + "half", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "rayon", + "seq-macro", +] + +[[package]] +name = "gemm-f32" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9a69f51aaefbd9cf12d18faf273d3e982d9d711f60775645ed5c8047b4ae113" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-f64" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa397a48544fadf0b81ec8741e5c0fba0043008113f71f2034def1935645d2b0" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1063,9 +1489,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ + "bytemuck", "cfg-if", "crunchy", "num-traits", + "rand 0.8.5", + "rand_distr", ] [[package]] @@ -1092,6 +1521,23 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hf-hub" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +dependencies = [ + "dirs", + "indicatif", + "log 0.4.27", + "native-tls", + "rand 0.8.5", + "serde", + "serde_json", + "thiserror 1.0.69", + "ureq", +] + [[package]] name = "hmac" version = "0.12.1" @@ -1132,7 +1578,7 @@ checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" dependencies = [ "displaydoc", "potential_utf", - "yoke", + "yoke 0.8.0", "zerofrom", "zerovec", ] @@ -1204,7 +1650,7 @@ dependencies = [ "stable_deref_trait", "tinystr", "writeable", - "yoke", + "yoke 0.8.0", "zerofrom", "zerotrie", "zerovec", @@ -1247,12 +1693,43 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -1391,7 +1868,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags", + "bitflags 2.9.1", "libc", ] @@ -1432,6 +1909,22 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + [[package]] name = "md-5" version = "0.10.6" @@ -1448,6 +1941,22 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "memmap2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490" +dependencies = [ + "libc", + "stable_deref_trait", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1455,6 +1964,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -1468,6 +1978,54 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "monostate" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafe1be9d0c75642e3e50fedc7ecadf1ef1cbce6eb66462153fc44245343fbee" +dependencies = [ + "monostate-impl", + "serde", +] + +[[package]] +name = "monostate-impl" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.103", +] + +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log 0.4.27", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1518,6 +2076,7 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ + "bytemuck", "num-traits", ] @@ -1578,6 +2137,34 @@ dependencies = [ "libc", ] +[[package]] +name = "num_enum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +dependencies = [ + "num_enum_derive", + "rustversion", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.103", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "object" version = "0.36.7" @@ -1599,6 +2186,72 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "onig" +version = "6.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0" +dependencies = [ + "bitflags 2.9.1", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.103", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -1628,12 +2281,28 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "percent-encoding" version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pgvector" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc58e2d255979a31caa7cabfa7aac654af0354220719ab7a68520ae7a91e8c0b" +dependencies = [ + "bytes", + "postgres-types", +] + [[package]] name = "phf" version = "0.11.3" @@ -1705,7 +2374,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76ff0abab4a9b844b93ef7b81f1efc0a366062aaef2cd702c76256b5dc075c54" dependencies = [ - "base64", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -1778,10 +2447,19 @@ checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa" name = "predicates-tree" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" +checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" +dependencies = [ + "predicates-core", + "termtree", +] + +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" dependencies = [ - "predicates-core", - "termtree", + "toml_edit 0.23.4", ] [[package]] @@ -1793,6 +2471,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulp" +version = "0.18.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0a01a0dc67cf4558d279f0c25b0962bd08fc6dec0137699eae304103e882fe6" +dependencies = [ + "bytemuck", + "libm", + "num-complex", + "reborrow", +] + [[package]] name = "quote" version = "1.0.40" @@ -1867,13 +2557,69 @@ dependencies = [ "getrandom 0.3.3", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "raw-cpuid" +version = "10.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools 0.11.0", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "reborrow" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" + [[package]] name = "redox_syscall" version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82b568323e98e49e2a0899dcee453dd679fae22d69adf9b11dd508d1549b7e2f" dependencies = [ - "bitflags", + "bitflags 2.9.1", ] [[package]] @@ -1961,6 +2707,20 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.25" @@ -1982,13 +2742,48 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags", + "bitflags 2.9.1", "errno", "libc", "linux-raw-sys", "windows-sys 0.59.0", ] +[[package]] +name = "rustls" +version = "0.23.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" +dependencies = [ + "log 0.4.27", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.19" @@ -2001,6 +2796,16 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "safetensors" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "same-file" version = "1.0.6" @@ -2019,6 +2824,15 @@ dependencies = [ "sdd", ] +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2031,12 +2845,41 @@ version = "3.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "584e070911c7017da6cb2eb0788d09f43d789029b5877d3e5ecc8acf86ceee21" +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.9.1", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.219" @@ -2069,6 +2912,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_plain" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1fc6db65a611022b23a0dec6975d63fb80a302cb3388835ff02c097258d50" +dependencies = [ + "serde", +] + [[package]] name = "serde_spanned" version = "0.6.9" @@ -2094,7 +2946,7 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" dependencies = [ - "darling", + "darling 0.13.4", "proc-macro2", "quote", "syn 1.0.109", @@ -2148,6 +3000,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + [[package]] name = "simdutf8" version = "0.1.5" @@ -2185,6 +3043,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -2259,6 +3129,20 @@ dependencies = [ "syn 2.0.103", ] +[[package]] +name = "sysctl" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea" +dependencies = [ + "bitflags 2.9.1", + "byteorder", + "enum-as-inner", + "libc", + "thiserror 1.0.69", + "walkdir", +] + [[package]] name = "tempfile" version = "3.20.0" @@ -2411,6 +3295,37 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" +dependencies = [ + "aho-corasick", + "derive_builder", + "esaxx-rs", + "getrandom 0.2.16", + "itertools 0.12.1", + "lazy_static", + "log 0.4.27", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand 0.8.5", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror 1.0.69", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.45.1" @@ -2485,8 +3400,8 @@ checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", - "toml_datetime", - "toml_edit", + "toml_datetime 0.6.11", + "toml_edit 0.22.27", ] [[package]] @@ -2498,6 +3413,15 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bade1c3e902f58d73d3f294cd7f20391c1cb2fbcb643b73566bc773971df91e3" +dependencies = [ + "serde", +] + [[package]] name = "toml_edit" version = "0.22.27" @@ -2507,11 +3431,32 @@ dependencies = [ "indexmap", "serde", "serde_spanned", - "toml_datetime", + "toml_datetime 0.6.11", "toml_write", "winnow", ] +[[package]] +name = "toml_edit" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7211ff1b8f0d3adae1663b7da9ffe396eabe1ca25f0b0bee42b0da29a9ddce93" +dependencies = [ + "indexmap", + "toml_datetime 0.7.0", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b551886f449aa90d4fe2bdaa9f4a2577ad2dde302c61ecf262d80b116db95c10" +dependencies = [ + "winnow", +] + [[package]] name = "toml_write" version = "0.1.2" @@ -2576,18 +3521,64 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + [[package]] name = "unicode-properties" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64 0.22.1", + "flate2", + "log 0.4.27", + "native-tls", + "once_cell", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "url", + "webpki-roots 0.26.11", +] + [[package]] name = "url" version = "2.5.4" @@ -2623,6 +3614,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -2743,6 +3740,34 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.5", +] + +[[package]] +name = "webpki-roots" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12bed680863276c63889429bfd6cab3b99943659923822de1c8a39c49e4d722c" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" version = "1.6.0" @@ -2800,6 +3825,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-sys" version = "0.48.0" @@ -2827,6 +3858,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -2963,7 +4003,7 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags", + "bitflags 2.9.1", ] [[package]] @@ -2972,6 +4012,18 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive 0.7.5", + "zerofrom", +] + [[package]] name = "yoke" version = "0.8.0" @@ -2980,10 +4032,22 @@ checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" dependencies = [ "serde", "stable_deref_trait", - "yoke-derive", + "yoke-derive 0.8.0", "zerofrom", ] +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.103", + "synstructure", +] + [[package]] name = "yoke-derive" version = "0.8.0" @@ -3070,7 +4134,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" dependencies = [ "displaydoc", - "yoke", + "yoke 0.8.0", "zerofrom", ] @@ -3080,7 +4144,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" dependencies = [ - "yoke", + "yoke 0.8.0", "zerofrom", "zerovec-derive", ] @@ -3096,6 +4160,21 @@ dependencies = [ "syn 2.0.103", ] +[[package]] +name = "zip" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "indexmap", + "num_enum", + "thiserror 1.0.69", +] + [[package]] name = "zstd" version = "0.13.3" diff --git a/Cargo.toml b/Cargo.toml index d86fdee..766acb5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,12 @@ env_logger = "0.11.8" postgres = "0.19.9" tokio = { version = "1.0", features = ["rt", "rt-multi-thread", "macros"] } tokio-postgres = { version = "0.7.10", features = ["with-uuid-1", "with-chrono-0_4"] } +pgvector = { version = "0.4", features = ["postgres"] } +candle-core = "0.7.1" +candle-nn = "0.7.1" +candle-transformers = "0.7.1" +tokenizers = { version = "0.19.1", default-features = false, features = ["onig"] } +hf-hub = "0.3.2" deadpool-postgres = "0.11.0" refinery = { version = "0.8.14", features = ["tokio-postgres"] } tempfile = "3.20.0" diff --git a/README.md b/README.md index bed88f9..e0d39b2 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Buckets is a version control and workflow management tool designed specifically - **Expectation Management**: Define and check requirements between workflow stages - **File Integrity**: BLAKE3 hashing ensures file integrity - **Compression**: Built-in zstd compression for efficient storage +- **Semantic Search**: AI-powered duplicate detection for expectations using `pgvector` - **Database Backend**: PostgreSQL for reliable data persistence ### How It Works @@ -235,6 +236,7 @@ buckets schema - Rust 1.70 or later - Cargo +- PostgreSQL with `pgvector` extension installed ### Building from Source diff --git a/docs/expectation_system.md b/docs/expectation_system.md index 0bd7bb9..affbd66 100644 --- a/docs/expectation_system.md +++ b/docs/expectation_system.md @@ -53,6 +53,15 @@ ``` + buckets finalize [bucket name] + ``` + +4. **Semantic Deduplication** + - When you create an expectation using `buckets expect`, the system automatically generates a vector embedding of your description using a specialized AI model (`all-MiniLM-L6-v2`). + - It then compares this embedding against all existing expectations to find semantically similar ones. + - If a high similarity (>85%) is found, you will be warned to prevent duplicate work. + - **Note**: The first time you run this, it will download the model (~90MB), which may take a few moments. Subsequent runs will be fast. + --- ## Example Workflow (Expanded) diff --git a/src/bootstrap.rs b/src/bootstrap.rs index ae0b2d7..aa38872 100644 --- a/src/bootstrap.rs +++ b/src/bootstrap.rs @@ -50,7 +50,7 @@ fn perform_bootstrap() -> Result<(), BucketError> { // 2. Try repository configuration (db_config.toml) if let Ok(config) = - crate::database::get_database_config(&buckets_dir.parent().unwrap_or(&buckets_dir)) + crate::database::get_database_config(buckets_dir.parent().unwrap_or(&buckets_dir)) { return initialize_runtime_and_db( config, @@ -79,9 +79,7 @@ fn perform_bootstrap() -> Result<(), BucketError> { Err(BucketError::IoError(Error::new( ErrorKind::NotFound, - format!( - "No PostgreSQL connection found. Please set DATABASE_URL, run 'buckets init' with database options, or configure globally with 'buckets setup'.", - ), + "No PostgreSQL connection found. Please set DATABASE_URL, run 'buckets init' with database options, or configure globally with 'buckets setup'.".to_string(), ))) } diff --git a/src/commands/check.rs b/src/commands/check.rs index a2ffb7a..dcbcd0f 100644 --- a/src/commands/check.rs +++ b/src/commands/check.rs @@ -115,7 +115,7 @@ impl BucketCommand for Check { ); } } - println!(""); + println!(); Ok(()) }) diff --git a/src/commands/commit.rs b/src/commands/commit.rs index a447894..c448106 100644 --- a/src/commands/commit.rs +++ b/src/commands/commit.rs @@ -12,7 +12,7 @@ use blake3::Hash; use log::{debug, error}; use std::io; use std::io::{Error, ErrorKind}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::str::FromStr; use tokio_postgres::types::ToSql; use uuid::Uuid; @@ -193,9 +193,9 @@ impl Commit { pub async fn process_files_async( &self, bucket_id: Uuid, - bucket_path: &PathBuf, + bucket_path: &Path, files: &[CommittedFile], - message: &String, + message: &str, ) -> Result<(), BucketError> { let db = get_database().await?; @@ -213,7 +213,7 @@ impl Commit { .await?; // Compress and store the file (no database operation) - file.compress_and_store(&bucket_path).map_err(|e| { + file.compress_and_store(bucket_path).map_err(|e| { error!("Error compressing and storing file: {}", e); e })?; @@ -225,7 +225,7 @@ impl Commit { &self, db: &crate::postgres_db::DatabaseManager, bucket_id: Uuid, - bucket_path: &PathBuf, + bucket_path: &Path, ) -> Result<(), BucketError> { let check_params: Vec<&(dyn ToSql + Sync)> = vec![&bucket_id]; let rows = db @@ -286,11 +286,11 @@ impl Commit { &self, db: &crate::postgres_db::DatabaseManager, bucket_id: Uuid, - message: &String, + message: &str, ) -> Result { debug!("CommitCommand: inserting commit into PostgreSQL database"); - let params: Vec<&(dyn ToSql + Sync)> = vec![&bucket_id, message]; + let params: Vec<&(dyn ToSql + Sync)> = vec![&bucket_id, &message]; let rows = db.query( "INSERT INTO commits (id, bucket_id, message) VALUES (uuid_generate_v4(), $1::uuid, $2) RETURNING id", diff --git a/src/commands/completions.rs b/src/commands/completions.rs index ad9c623..03e9ce0 100644 --- a/src/commands/completions.rs +++ b/src/commands/completions.rs @@ -3,10 +3,9 @@ use crate::commands::BucketCommand; use crate::errors::BucketError; use clap::CommandFactory; use clap_complete::{generate, Shell}; +use log::info; use std::io; use std::path::PathBuf; -use log::info; -use dirs; pub struct Completions { args: CompletionsCommand, @@ -33,13 +32,13 @@ impl BucketCommand for Completions { if self.args.install { let path = self.get_install_path(shell)?; - if let Some(parent) = path.parent() { + if let Some(parent) = path.parent() { std::fs::create_dir_all(parent)?; } let mut file = std::fs::File::create(&path)?; generate(shell, &mut cmd, bin_name, &mut file); info!("Completion script installed to {:?}", path); - eprintln!("Completion script installed to {:?}", path); + eprintln!("Completion script installed to {:?}", path); } else if let Some(path) = &self.args.output { let mut file = std::fs::File::create(path)?; generate(shell, &mut cmd, bin_name, &mut file); @@ -79,4 +78,3 @@ impl Completions { } } } - diff --git a/src/commands/config.rs b/src/commands/config.rs index 6d40958..fde3731 100644 --- a/src/commands/config.rs +++ b/src/commands/config.rs @@ -23,9 +23,13 @@ impl BucketCommand for Config { fn execute(&self) -> Result<(), BucketError> { match &self.args.command { - ConfigSubcommand::Get(command) => self.get_value(&command.key, command.global, command.local), + ConfigSubcommand::Get(command) => { + self.get_value(&command.key, command.global, command.local) + } ConfigSubcommand::Set(command) => self.set_value(command), - ConfigSubcommand::Unset(command) => self.unset_value(&command.key, command.global, command.local), + ConfigSubcommand::Unset(command) => { + self.unset_value(&command.key, command.global, command.local) + } ConfigSubcommand::List(command) => self.list_values(command), } } @@ -159,9 +163,8 @@ fn load_config_value_from_path(path: &PathBuf) -> Result { } let content = fs::read_to_string(path)?; - toml::from_str(&content).map_err(|e| { - BucketError::InvalidData(format!("Failed to parse config: {}", e)) - }) + toml::from_str(&content) + .map_err(|e| BucketError::InvalidData(format!("Failed to parse config: {}", e))) } fn save_config_value(path: PathBuf, value: &Value) -> Result<(), BucketError> { @@ -169,9 +172,8 @@ fn save_config_value(path: PathBuf, value: &Value) -> Result<(), BucketError> { fs::create_dir_all(parent)?; } - let content = toml::to_string_pretty(value).map_err(|e| { - BucketError::InvalidData(format!("Failed to serialize config: {}", e)) - })?; + let content = toml::to_string_pretty(value) + .map_err(|e| BucketError::InvalidData(format!("Failed to serialize config: {}", e)))?; fs::write(path, content)?; Ok(()) } diff --git a/src/commands/create.rs b/src/commands/create.rs index 63b0942..5429fa1 100644 --- a/src/commands/create.rs +++ b/src/commands/create.rs @@ -149,10 +149,7 @@ impl Create { "Directory already exists", ))); } else { - return Err(BucketError::IoError(std::io::Error::new( - std::io::ErrorKind::Other, - "Unknown error", - ))); + return Err(BucketError::IoError(std::io::Error::other("Unknown error"))); } } diff --git a/src/commands/doctor.rs b/src/commands/doctor.rs index 4da5d29..4366fbf 100644 --- a/src/commands/doctor.rs +++ b/src/commands/doctor.rs @@ -535,7 +535,7 @@ impl Doctor { "overall_status": "passed" }); - let required_tables = vec!["buckets", "commits", "files"]; + let required_tables = vec!["buckets", "commits", "files", "expectations", "pebbles"]; let mut all_valid = true; for table_name in required_tables { @@ -719,6 +719,25 @@ impl Doctor { json!({"name": "file_path", "type": "text"}), json!({"name": "hash", "type": "text"}), ], + "expectations" => vec![ + json!({"name": "id", "type": "uuid"}), + json!({"name": "bucket_id", "type": "uuid"}), + json!({"name": "target_bucket_id", "type": "uuid"}), + json!({"name": "description", "type": "text"}), + json!({"name": "status", "type": "text"}), + json!({"name": "created_at", "type": "timestamp"}), + json!({"name": "embedding", "type": "user-defined"}), + ], + "pebbles" => vec![ + json!({"name": "id", "type": "uuid"}), + json!({"name": "description", "type": "text"}), + json!({"name": "origin_bucket_id", "type": "uuid"}), + json!({"name": "current_bucket_id", "type": "uuid"}), + json!({"name": "original_pebble_id", "type": "uuid"}), + json!({"name": "created_via_expectation_id", "type": "uuid"}), + json!({"name": "status", "type": "text"}), + json!({"name": "created_at", "type": "timestamp"}), + ], _ => vec![], } } diff --git a/src/commands/expect.rs b/src/commands/expect.rs index ecfdbb4..e553d26 100644 --- a/src/commands/expect.rs +++ b/src/commands/expect.rs @@ -5,9 +5,11 @@ use crate::data::expectation::Expectation; use crate::data::pebble::Pebble; use crate::errors::BucketError; use crate::postgres_db::get_database; +use crate::utils::embeddings::EmbeddingGenerator; use crate::utils::runtime::RuntimeManager; use crate::CURRENT_DIR; -use log::info; +use dialoguer::console::style; +use log::{info, warn}; use uuid::Uuid; pub struct Expect { @@ -66,12 +68,46 @@ impl BucketCommand for Expect { None }; + // 2.5. Generate Embedding & Check for Duplicates + let embedding = match EmbeddingGenerator::generate(description) { + Ok(vec) => Some(vec), + Err(e) => { + warn!("Failed to generate embedding: {}", e); + None + } + }; + + if let Some(vec) = &embedding { + match Expectation::find_similar(&db, vec, 3, 0.85).await { + Ok(founds) => { + if !founds.is_empty() { + println!( + "{}", + style("Warning: Similar expectations found:") + .yellow() + .bold() + ); + for (exp, score) in founds { + println!( + " - \"{}\" (Similarity: {:.1}%)", + style(exp.description).cyan(), + score * 100.0 + ); + } + println!("{}", style("creating anyway...").dim()); + } + } + Err(e) => warn!("Failed to check for duplicates: {}", e), + } + } + // 3. Create Expectation Record let expectation = Expectation::create( &db, consumer_bucket_id, producer_bucket_id, description.clone(), + embedding, ) .await?; diff --git a/src/commands/list.rs b/src/commands/list.rs index 1b0756d..e729fd3 100644 --- a/src/commands/list.rs +++ b/src/commands/list.rs @@ -67,19 +67,17 @@ impl BucketCommand for List { Ok(json) => println!("{}", json), Err(e) => eprintln!("Error serializing to JSON: {}", e), } + } else if buckets.is_empty() { + println!("No buckets found"); } else { - if buckets.is_empty() { - println!("No buckets found"); - } else { - println!("Buckets:"); - for bucket in &buckets { - println!( - " {} - {} ({})", - bucket.name, - bucket.id, - bucket.relative_bucket_path.display() - ); - } + println!("Buckets:"); + for bucket in &buckets { + println!( + " {} - {} ({})", + bucket.name, + bucket.id, + bucket.relative_bucket_path.display() + ); } } diff --git a/src/commands/mod.rs b/src/commands/mod.rs index 02e3557..eb8bbe7 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -123,8 +123,8 @@ impl CommandDispatcher { } pub(crate) mod check; -pub(crate) mod completions; pub(crate) mod commit; +pub(crate) mod completions; pub(crate) mod config; pub(crate) mod create; pub(crate) mod doctor; diff --git a/src/commands/revert.rs b/src/commands/revert.rs index 6455fa2..9557282 100644 --- a/src/commands/revert.rs +++ b/src/commands/revert.rs @@ -11,7 +11,7 @@ use log::{debug, error}; use std::collections::HashSet; use std::fs::File; use std::io::{BufReader, BufWriter}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use tokio_postgres::types::ToSql; use uuid::Uuid; @@ -60,7 +60,7 @@ impl Revert { let db = get_database().await?; let relative_path = PathBuf::from(&file_path) - .strip_prefix(&bucket_path) + .strip_prefix(bucket_path) .unwrap_or(&PathBuf::from(&file_path)) .to_string_lossy() .to_string(); @@ -150,7 +150,7 @@ impl Revert { Ok(()) } - fn revert_all(&self, bucket: Bucket, bucket_path: &PathBuf) -> Result<(), BucketError> { + fn revert_all(&self, bucket: Bucket, bucket_path: &Path) -> Result<(), BucketError> { let (files_to_restore, commit_id_str) = RuntimeManager::block_on(async { let db = get_database().await?; let bucket_id = bucket.id; diff --git a/src/commands/rollback.rs b/src/commands/rollback.rs index 03fd005..cb93acc 100644 --- a/src/commands/rollback.rs +++ b/src/commands/rollback.rs @@ -1,6 +1,6 @@ use std::io::Error; use std::io::ErrorKind; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use crate::args::RollbackCommand; use crate::commands::commit::Commit; @@ -40,12 +40,12 @@ impl BucketCommand for Rollback { match &self.args.path { None => rollback_all(¤t_dir), - Some(path) => rollback_single_file(¤t_dir, &path), + Some(path) => rollback_single_file(¤t_dir, path), } } } -fn rollback_single_file(bucket_path: &PathBuf, file: &PathBuf) -> Result<(), BucketError> { +fn rollback_single_file(bucket_path: &Path, file: &PathBuf) -> Result<(), BucketError> { if !file.exists() { return Err(BucketError::from(Error::new( ErrorKind::NotFound, @@ -58,10 +58,7 @@ fn rollback_single_file(bucket_path: &PathBuf, file: &PathBuf) -> Result<(), Buc let previous_commit = RuntimeManager::block_on(Commit::load_last_commit_async(bucket.id)) .map_err(|err| { error!("Failed to load previous commit: {}", err); - BucketError::from(Error::new( - ErrorKind::Other, - "Failed to load previous commit.", - )) + BucketError::from(Error::other("Failed to load previous commit.")) })?; match previous_commit { @@ -94,9 +91,9 @@ fn rollback_single_file(bucket_path: &PathBuf, file: &PathBuf) -> Result<(), Buc } } -fn rollback_all(bucket_path: &PathBuf) -> Result<(), BucketError> { +fn rollback_all(bucket_path: &Path) -> Result<(), BucketError> { // Read the bucket's metadata - let bucket = Bucket::from_meta_data(&bucket_path)?; + let bucket = Bucket::from_meta_data(bucket_path)?; let bucket_files = bucket.list_files_with_metadata_in_bucket()?; if bucket_files.files.is_empty() { println!("No files in bucket"); @@ -106,10 +103,7 @@ fn rollback_all(bucket_path: &PathBuf) -> Result<(), BucketError> { let previous_commit = RuntimeManager::block_on(Commit::load_last_commit_async(bucket.id)) .map_err(|err| { error!("Failed to load previous commit: {}", err); - BucketError::from(Error::new( - ErrorKind::Other, - "Failed to load previous commit.", - )) + BucketError::from(Error::other("Failed to load previous commit.")) })?; match previous_commit { @@ -120,9 +114,9 @@ fn rollback_all(bucket_path: &PathBuf) -> Result<(), BucketError> { ))); } Some(previous_commit) => { - let changes = bucket_files.compare(&previous_commit).ok_or_else(|| { - BucketError::from(Error::new(ErrorKind::Other, "Failed to compare files.")) - })?; + let changes = bucket_files + .compare(&previous_commit) + .ok_or_else(|| BucketError::from(Error::other("Failed to compare files.")))?; if changes .iter() @@ -138,7 +132,7 @@ fn rollback_all(bucket_path: &PathBuf) -> Result<(), BucketError> { .iter() .filter(|change| change.status == CommitStatus::Modified) .for_each(|change| { - if let Err(e) = change.restore(&bucket_path) { + if let Err(e) = change.restore(bucket_path) { error!("Failed to restore file: {}", e); } }); diff --git a/src/commands/setup.rs b/src/commands/setup.rs index c52a990..e331c37 100644 --- a/src/commands/setup.rs +++ b/src/commands/setup.rs @@ -73,10 +73,11 @@ impl BucketCommand for Setup { eprintln!("\n❌ Extension installation failed: {}", e); } // Pause to let user read output - if let Ok(_) = Input::::new() + if Input::::new() .with_prompt("Press Enter to continue") .allow_empty(true) .interact_text() + .is_ok() {} } Some(3) => { @@ -87,10 +88,11 @@ impl BucketCommand for Setup { // test_database_connection prints success message } // Pause to let user read output - if let Ok(_) = Input::::new() + if Input::::new() .with_prompt("Press Enter to continue") .allow_empty(true) .interact_text() + .is_ok() {} } Some(4) => { diff --git a/src/commands/status.rs b/src/commands/status.rs index d84b6a9..c0b4acd 100644 --- a/src/commands/status.rs +++ b/src/commands/status.rs @@ -12,7 +12,7 @@ use crate::CURRENT_DIR; use log::{debug, error, info}; use serde::{Deserialize, Serialize}; use std::env; -use std::io::{self, Error, ErrorKind}; +use std::io::{self, Error}; #[derive(Serialize, Deserialize, Debug)] pub struct BucketStatusOutput { @@ -106,10 +106,7 @@ impl Status { let latest_commit = RuntimeManager::block_on(Commit::load_last_commit_async(bucket.id)) .map_err(|err| { error!("Failed to load previous commit: {}", err); - BucketError::from(Error::new( - ErrorKind::Other, - "Failed to load previous commit.", - )) + BucketError::from(Error::other("Failed to load previous commit.")) })?; let file_statuses = match latest_commit { @@ -159,10 +156,10 @@ impl Status { fn repository_status(&self) -> Result<(), BucketError> { let current_dir = env::current_dir().map_err(|e| { - BucketError::from(io::Error::new( - io::ErrorKind::Other, - format!("Failed to get current directory: {}", e), - )) + BucketError::from(io::Error::other(format!( + "Failed to get current directory: {}", + e + ))) })?; let repo_config = RepositoryConfig::from_file(current_dir)?; diff --git a/src/data/bucket.rs b/src/data/bucket.rs index 08a3507..0085e5a 100644 --- a/src/data/bucket.rs +++ b/src/data/bucket.rs @@ -21,8 +21,8 @@ pub struct Bucket { } pub trait BucketTrait { - fn default(uuid: Uuid, name: &String, path: &PathBuf) -> Self; - fn from_meta_data(current_path: &PathBuf) -> Result; + fn default(uuid: Uuid, name: &str, path: &Path) -> Self; + fn from_meta_data(current_path: &Path) -> Result; fn write_bucket_info(&self) -> Result<(), io::Error>; #[cfg(test)] fn write_bucket_info_with_repo_path(&self, repo_path: &Path) -> Result<(), io::Error>; @@ -35,7 +35,7 @@ pub trait BucketTrait { } impl BucketTrait for Bucket { - fn default(uuid: Uuid, name: &String, path: &PathBuf) -> Bucket { + fn default(uuid: Uuid, name: &str, path: &Path) -> Bucket { // Ensure the path is always relative by stripping any leading slash let relative_path = if path.is_absolute() { // If given an absolute path, try to make it relative to the repo root @@ -59,10 +59,10 @@ impl BucketTrait for Bucket { } } - fn from_meta_data(current_path: &PathBuf) -> Result { - debug!("Current path {}", current_path.as_path().display()); + fn from_meta_data(current_path: &Path) -> Result { + debug!("Current path {}", current_path.display()); // find the top level of the bucket directory - let bucket_path: PathBuf = match Bucket::find_bucket(current_path.as_path()) { + let bucket_path: PathBuf = match Bucket::find_bucket(current_path) { Some(mut path) => { path.pop(); path @@ -86,7 +86,7 @@ impl BucketTrait for Bucket { // Use full_path for filesystem operations let full_path = self .full_path() - .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + .map_err(|e| io::Error::other(e.to_string()))?; let mut file = File::create(full_path.join(".b").join("info"))?; let serialized = to_string(self) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))?; @@ -162,10 +162,7 @@ impl BucketTrait for Bucket { } fn find_bucket(dir_path: &Path) -> Option { - match find_directory_in_parents(dir_path, ".b") { - Some(path) => Some(path), - None => None, - } + find_directory_in_parents(dir_path, ".b") } fn get_full_bucket_path(&self) -> Result { @@ -175,7 +172,7 @@ impl BucketTrait for Bucket { fn full_path(&self) -> Result { let current_dir = env::current_dir().map_err(BucketError::from)?; - let repo_path = find_bucket_repo(¤t_dir.as_path()).ok_or(BucketError::NotInRepo)?; + let repo_path = find_bucket_repo(current_dir.as_path()).ok_or(BucketError::NotInRepo)?; let repo_root = repo_path.parent().ok_or(BucketError::NotInRepo)?; // Always treat relative_bucket_path as relative to repo root @@ -228,7 +225,7 @@ impl BucketTrait for Bucket { } } -pub fn read_bucket_info(path: &PathBuf) -> Result { +pub fn read_bucket_info(path: &Path) -> Result { let info_path = path.join(".b").join("info"); let mut file = File::open(&info_path).map_err(|e| { io::Error::new( diff --git a/src/data/commit.rs b/src/data/commit.rs index a97802b..d17b29d 100644 --- a/src/data/commit.rs +++ b/src/data/commit.rs @@ -4,15 +4,16 @@ use std::cmp::PartialEq; use std::collections::HashMap; use std::fmt::{Display, Formatter}; use std::io; -use std::path::PathBuf; +use std::path::Path; use uuid::Uuid; use crate::utils::compression::{compress_file, decompress_file, DEFAULT_COMPRESSION_LEVEL}; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Default)] pub enum CommitStatus { Unknown, New, + #[default] Committed, Modified, Deleted, @@ -30,12 +31,6 @@ impl Display for CommitStatus { } } -impl Default for CommitStatus { - fn default() -> Self { - CommitStatus::Committed - } -} - #[derive(Serialize, Deserialize, Debug)] pub struct CommittedFile { pub id: Uuid, @@ -75,13 +70,13 @@ where impl PartialEq for CommitStatus { fn eq(&self, other: &Self) -> bool { - match (self, other) { - (CommitStatus::New, CommitStatus::New) => true, - (CommitStatus::Committed, CommitStatus::Committed) => true, - (CommitStatus::Modified, CommitStatus::Modified) => true, - (CommitStatus::Deleted, CommitStatus::Deleted) => true, - _ => false, - } + matches!( + (self, other), + (CommitStatus::New, CommitStatus::New) + | (CommitStatus::Committed, CommitStatus::Committed) + | (CommitStatus::Modified, CommitStatus::Modified) + | (CommitStatus::Deleted, CommitStatus::Deleted) + ) } } @@ -104,8 +99,8 @@ impl Commit { differences.push(CommittedFile { id: file.id, name: file.name.clone(), - hash: file.hash.clone(), - previous_hash: previous.hash.clone(), + hash: file.hash, + previous_hash: previous.hash, status: CommitStatus::Modified, }); } @@ -114,8 +109,8 @@ impl Commit { differences.push(CommittedFile { id: file.id, name: file.name.clone(), - hash: file.hash.clone(), - previous_hash: zero_hash.clone(), + hash: file.hash, + previous_hash: zero_hash, status: CommitStatus::New, }); } @@ -127,8 +122,8 @@ impl Commit { differences.push(CommittedFile { id: other_file.id, name: other_file.name.clone(), - hash: other_file.hash.clone(), - previous_hash: zero_hash.clone(), + hash: other_file.hash, + previous_hash: zero_hash, status: CommitStatus::Deleted, }); } @@ -154,21 +149,21 @@ impl CommittedFile { } } - pub fn compress_and_store(&self, bucket_path: &PathBuf) -> io::Result<()> { + pub fn compress_and_store(&self, bucket_path: &Path) -> io::Result<()> { let input_path = bucket_path.join(&self.name); let output_path = bucket_path .join(".b") .join("storage") - .join(&self.hash.to_string()); + .join(self.hash.to_string()); compress_file(&input_path, &output_path, DEFAULT_COMPRESSION_LEVEL) } - pub fn restore(&self, bucket_path: &PathBuf) -> io::Result<()> { + pub fn restore(&self, bucket_path: &Path) -> io::Result<()> { let input_path = bucket_path .join(".b") .join("storage") - .join(&self.previous_hash.to_string()); + .join(self.previous_hash.to_string()); let output_path = bucket_path.join(&self.name); // Create parent directories if they don't exist diff --git a/src/data/expectation.rs b/src/data/expectation.rs index f6cb646..12cf7cb 100644 --- a/src/data/expectation.rs +++ b/src/data/expectation.rs @@ -4,6 +4,28 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; use uuid::Uuid; +use pgvector::Vector; +use serde::Deserializer; +use serde::Serializer; + +fn serialize_vector(vector: &Option, serializer: S) -> Result +where + S: Serializer, +{ + match vector { + Some(v) => serializer.serialize_some(&v.to_vec()), + None => serializer.serialize_none(), + } +} + +fn deserialize_vector<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let v: Option> = Option::deserialize(deserializer)?; + Ok(v.map(Vector::from)) +} + #[derive(Debug, Serialize, Deserialize)] pub struct Expectation { pub id: Uuid, @@ -12,6 +34,12 @@ pub struct Expectation { pub description: String, pub status: String, pub created_at: NaiveDateTime, + #[serde( + serialize_with = "serialize_vector", + deserialize_with = "deserialize_vector", + skip_serializing_if = "Option::is_none" + )] + pub embedding: Option, } impl Expectation { @@ -20,13 +48,15 @@ impl Expectation { bucket_id: Uuid, target_bucket_id: Option, description: String, + embedding: Option>, ) -> Result { let id = Uuid::new_v4(); let status = "pending"; + let vector = embedding.map(Vector::from); db.execute( - "INSERT INTO expectations (id, bucket_id, target_bucket_id, description, status) VALUES ($1, $2, $3, $4, $5)", - &[&id, &bucket_id, &target_bucket_id, &description, &status], + "INSERT INTO expectations (id, bucket_id, target_bucket_id, description, status, embedding) VALUES ($1, $2, $3, $4, $5, $6)", + &[&id, &bucket_id, &target_bucket_id, &description, &status, &vector], ) .await?; @@ -46,9 +76,45 @@ impl Expectation { description, status: status.to_string(), created_at: row.get(0), + embedding: vector, }) } + pub async fn find_similar( + db: &DatabaseManager, + embedding: &[f32], + limit: i64, + similarity_threshold: f64, + ) -> Result, BucketError> { + let vector = Vector::from(embedding.to_vec()); + let rows = db + .query( + "SELECT id, bucket_id, target_bucket_id, description, status, created_at, embedding, 1 - (embedding <=> $1) as similarity + FROM expectations + WHERE 1 - (embedding <=> $1) > $2 + ORDER BY embedding <=> $1 LIMIT $3", + &[&vector, &similarity_threshold, &limit], + ) + .await?; + + let mut results = Vec::new(); + for row in rows { + let similarity: f64 = row.get(7); + let expectation = Expectation { + id: row.get(0), + bucket_id: row.get(1), + target_bucket_id: row.get(2), + description: row.get(3), + status: row.get(4), + created_at: row.get(5), + embedding: row.get(6), + }; + results.push((expectation, similarity)); + } + Ok(results) + } + + #[allow(dead_code)] pub async fn get_by_id(db: &DatabaseManager, id: Uuid) -> Result, BucketError> { let rows = db .query( @@ -65,6 +131,7 @@ impl Expectation { description: row.get(3), status: row.get(4), created_at: row.get(5), + embedding: None, // Optimization: skip retrieving embedding unless needed })) } else { Ok(None) diff --git a/src/errors.rs b/src/errors.rs index 6a6bf36..ba4f78c 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -35,6 +35,8 @@ pub enum BucketError { SecurityError(String), #[error("Path validation error: {0}")] PathValidationError(String), + #[error("Embedding Error: {0}")] + EmbeddingError(String), #[error("Dialog Error: {0}")] DialogError(#[from] dialoguer::Error), } @@ -45,6 +47,12 @@ impl From<&str> for BucketError { } } +impl From for BucketError { + fn from(error: String) -> Self { + BucketError::IoError(io::Error::other(error)) + } +} + #[cfg(test)] mod tests { use super::*; @@ -144,6 +152,12 @@ mod tests { assert!(display_str.contains("access denied")); } + #[test] + fn test_embedding_error_display() { + let error = BucketError::EmbeddingError("model not found".to_string()); + assert_eq!(format!("{}", error), "Embedding Error: model not found"); + } + #[test] fn test_error_debug_format() { let error = BucketError::BucketAlreadyExists; diff --git a/src/postgres_db.rs b/src/postgres_db.rs index 1725020..348213e 100644 --- a/src/postgres_db.rs +++ b/src/postgres_db.rs @@ -244,7 +244,7 @@ where F: FnOnce(&DatabaseManager) -> T, { let db = get_database().await?; - Ok(f(&*db)) + Ok(f(&db)) } pub struct DatabaseHandle<'a> { diff --git a/src/sql/migrations/V3__add_pgvector.sql b/src/sql/migrations/V3__add_pgvector.sql new file mode 100644 index 0000000..43baa5b --- /dev/null +++ b/src/sql/migrations/V3__add_pgvector.sql @@ -0,0 +1,10 @@ +-- Enable the pgvector extension to work with embeddings +CREATE EXTENSION IF NOT EXISTS vector; + +-- Add embedding column to expectations table +-- Using 384 dimensions to match the all-MiniLM-L6-v2 model +ALTER TABLE expectations ADD COLUMN embedding vector(384); + +-- Create an HNSW index for faster similarity search +-- m=16 and ef_construction=64 are reasonable defaults +CREATE INDEX ON expectations USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); diff --git a/src/test_support.rs b/src/test_support.rs index 3e35609..2ba798d 100644 --- a/src/test_support.rs +++ b/src/test_support.rs @@ -1,5 +1,3 @@ -#![cfg(test)] - pub mod docker { use once_cell::sync::Lazy; use std::path::{Path, PathBuf}; @@ -12,8 +10,8 @@ pub mod docker { use uuid::Uuid; static DOCKER: Lazy = Lazy::new(|| Cli::default()); - const POSTGRES_IMAGE: &str = "postgres"; - const POSTGRES_TAG: &str = "16-alpine"; + const POSTGRES_IMAGE: &str = "pgvector/pgvector"; + const POSTGRES_TAG: &str = "pg16"; #[derive(Debug)] pub struct TestDatabase { diff --git a/src/utils/checks.rs b/src/utils/checks.rs index 09de359..7fd46c7 100644 --- a/src/utils/checks.rs +++ b/src/utils/checks.rs @@ -38,7 +38,7 @@ pub fn find_directory_in_parents(start_path: &Path, target_dir_name: &str) -> Op /// - The `.buckets` directory must contain either: /// - a `database_type` file (indicating a file-based database), or /// - a `postgres` directory (indicating a PostgreSQL-based database). -/// At least one of these must be present for the repository to be considered valid. +/// At least one of these must be present for the repository to be considered valid. pub fn is_valid_bucket_repo(dir_path: &Path) -> bool { debug!("{:?}", dir_path); // Find the .buckets directory @@ -80,7 +80,7 @@ pub fn is_valid_bucket(path: &Path) -> bool { } } -fn has_valid_bucket_info(bucket_path: &PathBuf) -> bool { +fn has_valid_bucket_info(bucket_path: &Path) -> bool { let info_path = bucket_path.join(".b").join("info"); if info_path.exists() && info_path.is_file() { return true; diff --git a/src/utils/compression.rs b/src/utils/compression.rs index b62ecd0..a13e151 100644 --- a/src/utils/compression.rs +++ b/src/utils/compression.rs @@ -30,7 +30,7 @@ pub fn compress_file( // Validate compression level let level = if compression_level == 0 { DEFAULT_COMPRESSION_LEVEL - } else if compression_level < 1 || compression_level > MAX_COMPRESSION_LEVEL { + } else if !(1..=MAX_COMPRESSION_LEVEL).contains(&compression_level) { return Err(io::Error::new( io::ErrorKind::InvalidInput, format!( @@ -79,15 +79,12 @@ pub fn compress_file( })?; copy_encode(&input_file, &mut output_file, level).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!( - "Failed to compress file from '{}' to '{}': {}", - input_path.display(), - output_path.display(), - e - ), - ) + io::Error::other(format!( + "Failed to compress file from '{}' to '{}': {}", + input_path.display(), + output_path.display(), + e + )) })?; Ok(()) diff --git a/src/utils/embeddings.rs b/src/utils/embeddings.rs new file mode 100644 index 0000000..145cf91 --- /dev/null +++ b/src/utils/embeddings.rs @@ -0,0 +1,177 @@ +use crate::errors::BucketError; +use candle_core::{DType, Device, Tensor}; +use candle_nn::VarBuilder; +use candle_transformers::models::bert::{BertModel, Config}; +use hf_hub::{api::sync::Api, Repo, RepoType}; +use once_cell::sync::Lazy; +use std::sync::Mutex; +use tokenizers::{PaddingParams, Tokenizer}; + +// Global lazy static for the model components +struct ModelComponents { + model: BertModel, + tokenizer: Tokenizer, +} + +static MODEL: Lazy>> = Lazy::new(|| Mutex::new(None)); + +pub struct EmbeddingGenerator; + +impl EmbeddingGenerator { + /// Generates a vector embedding using all-MiniLM-L6-v2 via Candle + pub fn generate(text: &str) -> Result, BucketError> { + let mut guard = MODEL.lock().map_err(|_| { + BucketError::EmbeddingError("Failed to acquire lock on embedding model".to_string()) + })?; + + if guard.is_none() { + // Load model components + let model_id = "sentence-transformers/all-MiniLM-L6-v2".to_string(); + let _revision = "refs/pr/21".to_string(); // Helper to pin version if needed, or use main + + let api = Api::new() + .map_err(|e| BucketError::EmbeddingError(format!("Failed to create API: {}", e)))?; + let repo = api.repo(Repo::new(model_id, RepoType::Model)); + + let config_filename = repo + .get("config.json") + .map_err(|e| BucketError::EmbeddingError(format!("Failed to get config: {}", e)))?; + let tokenizer_filename = repo.get("tokenizer.json").map_err(|e| { + BucketError::EmbeddingError(format!("Failed to get tokenizer: {}", e)) + })?; + let weights_filename = repo.get("model.safetensors").map_err(|e| { + BucketError::EmbeddingError(format!("Failed to get weights: {}", e)) + })?; + + let config_str = std::fs::read_to_string(config_filename).map_err(|e| { + BucketError::EmbeddingError(format!("Failed to read config file: {}", e)) + })?; + let config: Config = serde_json::from_str(&config_str).map_err(|e| { + BucketError::EmbeddingError(format!("Failed to parse config: {}", e)) + })?; + let mut tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(|e| { + BucketError::EmbeddingError(format!("Failed to load tokenizer: {}", e)) + })?; + + let device = Device::Cpu; + let vb = unsafe { + VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device) + } + .map_err(|e| BucketError::EmbeddingError(format!("Failed to load weights: {}", e)))?; + + let model = BertModel::load(vb, &config) + .map_err(|e| BucketError::EmbeddingError(format!("Failed to load model: {}", e)))?; + + if let Some(pp) = tokenizer.get_padding_mut() { + pp.strategy = tokenizers::PaddingStrategy::BatchLongest + } else { + let pp = PaddingParams { + strategy: tokenizers::PaddingStrategy::BatchLongest, + ..Default::default() + }; + tokenizer.with_padding(Some(pp)); + } + + *guard = Some(ModelComponents { model, tokenizer }); + } + + let components = guard.as_ref().ok_or_else(|| { + BucketError::EmbeddingError("Model components not initialized".to_string()) + })?; + let device = &Device::Cpu; + + let tokenizer = &components.tokenizer; + let model = &components.model; + + // Tokenize + let tokens = tokenizer + .encode(text, true) + .map_err(|e| BucketError::EmbeddingError(format!("Tokenization failed: {}", e)))?; + + let token_ids = Tensor::new(tokens.get_ids(), device) + .map_err(|e| BucketError::EmbeddingError(format!("Tensor error: {}", e)))? + .unsqueeze(0) + .map_err(|e| BucketError::EmbeddingError(format!("Tensor error: {}", e)))?; + + // Inference + let token_type_ids = token_ids + .zeros_like() + .map_err(|e| BucketError::EmbeddingError(format!("Tensor error: {}", e)))?; + + let embeddings = model + .forward(&token_ids, &token_type_ids, None) + .map_err(|e| BucketError::EmbeddingError(format!("Model forward failed: {}", e)))?; + + // Mean pooling (simple version) + let (_n_sentence, n_tokens, _hidden_size) = embeddings + .dims3() + .map_err(|e| BucketError::EmbeddingError(format!("Dims error: {}", e)))?; + + let embeddings = (embeddings + .sum(1) + .map_err(|e| BucketError::EmbeddingError(format!("Sum error: {}", e)))? + / (n_tokens as f64)) + .map_err(|e| BucketError::EmbeddingError(format!("Div error: {}", e)))?; + + let embeddings_vec = embeddings + .squeeze(0) + .map_err(|e| BucketError::EmbeddingError(format!("Squeeze error: {}", e)))? + .to_vec1::() + .map_err(|e| BucketError::EmbeddingError(format!("ToVec error: {}", e)))?; + + Ok(embeddings_vec) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Verify embedding generator produces 384-dimension vectors (all-MiniLM-L6-v2) + /// Note: Requires network on first run to download model (~90MB) + #[test] + #[ignore] // Run with: cargo test -- --ignored + fn test_embedding_dimension() { + let embedding = EmbeddingGenerator::generate("Test text for embedding") + .expect("Failed to generate embedding"); + assert_eq!( + embedding.len(), + 384, + "Expected 384 dimensions from all-MiniLM-L6-v2" + ); + } + + /// Verify similar texts produce similar embeddings + #[test] + #[ignore] // Run with: cargo test -- --ignored + fn test_embedding_similarity() { + let emb1 = + EmbeddingGenerator::generate("The API should respond quickly").expect("embedding 1"); + let emb2 = + EmbeddingGenerator::generate("API response time must be fast").expect("embedding 2"); + let emb3 = EmbeddingGenerator::generate("The weather is nice today").expect("embedding 3"); + + // Cosine similarity helper + fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let norm_a: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let norm_b: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + dot / (norm_a * norm_b) + } + + let sim_related = cosine_similarity(&emb1, &emb2); + let sim_unrelated = cosine_similarity(&emb1, &emb3); + + assert!( + sim_related > sim_unrelated, + "Related texts should be more similar: related={:.3} unrelated={:.3}", + sim_related, + sim_unrelated + ); + assert!( + sim_related > 0.7, + "Related texts should have high similarity: {:.3}", + sim_related + ); + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 78f1858..4d0a966 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,6 +1,7 @@ pub(crate) mod checks; pub mod compression; pub mod config; +pub mod embeddings; pub mod runtime; pub mod security; #[allow(clippy::module_inception)] diff --git a/src/utils/utils.rs b/src/utils/utils.rs index 830f2e5..ae0967e 100644 --- a/src/utils/utils.rs +++ b/src/utils/utils.rs @@ -9,7 +9,7 @@ use std::{fs, io}; use walkdir::{DirEntry, WalkDir}; #[allow(dead_code)] -pub fn delete_and_create_tmp_dir(bucket_path: &PathBuf) -> Result { +pub fn delete_and_create_tmp_dir(bucket_path: &Path) -> Result { let tmp_bucket_path = bucket_path.join(".b").join("tmp"); fs::remove_dir_all(&tmp_bucket_path).unwrap_or_default(); fs::create_dir_all(&tmp_bucket_path)?; @@ -20,8 +20,8 @@ pub(crate) fn find_files_excluding_top_level_b(dir: &Path) -> Vec { WalkDir::new(dir) .into_iter() .filter_map(Result::ok) - .filter(|entry| is_not_in_dir(entry, &dir, ".b")) - .filter_map(|entry| make_relative_path(entry.path(), &dir)) + .filter(|entry| is_not_in_dir(entry, dir, ".b")) + .filter_map(|entry| make_relative_path(entry.path(), dir)) .collect() } @@ -99,21 +99,14 @@ pub fn find_directory_in_parents(start_path: &Path, target_dir_name: &str) -> Op } pub fn find_bucket_path(dir_path: &Path) -> Option { - match find_directory_in_parents(dir_path, ".b") { - Some(path) => Some(path), - None => None, - } - .map(|mut path| { + find_directory_in_parents(dir_path, ".b").map(|mut path| { path.pop(); path }) } pub fn find_bucket_repo(dir_path: &Path) -> Option { - match find_directory_in_parents(dir_path, ".buckets") { - Some(path) => Some(path), - None => None, - } + find_directory_in_parents(dir_path, ".buckets") } #[cfg(test)] diff --git a/tests/common.rs b/tests/common.rs index 77420b7..51d78a2 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -12,10 +12,10 @@ pub mod tests { use tokio_postgres::NoTls; use uuid::Uuid; - static DOCKER: Lazy = Lazy::new(|| Cli::default()); + static DOCKER: Lazy = Lazy::new(Cli::default); - const POSTGRES_IMAGE: &str = "postgres"; - const POSTGRES_TAG: &str = "16-alpine"; + const POSTGRES_IMAGE: &str = "pgvector/pgvector"; + const POSTGRES_TAG: &str = "pg16"; #[allow(dead_code)] pub fn get_test_dir() -> PathBuf { diff --git a/tests/test_cli_completions.rs b/tests/test_cli_completions.rs index dee34ac..232c143 100644 --- a/tests/test_cli_completions.rs +++ b/tests/test_cli_completions.rs @@ -14,4 +14,3 @@ mod tests { .stdout(contains("#compdef buckets")); } } - diff --git a/tests/test_cli_expect.rs b/tests/test_cli_expect.rs index c4cb113..a7accbe 100644 --- a/tests/test_cli_expect.rs +++ b/tests/test_cli_expect.rs @@ -3,14 +3,13 @@ mod common; #[cfg(test)] mod tests { use crate::common::tests::RepoFixture; + use predicates::prelude::*; use serial_test::serial; /// Test the `expect` command. /// /// # Commands - /// `$ typst expect` - /// - /// # Expected output + /// `$ buckets expect` /// #[test] #[serial] @@ -18,14 +17,42 @@ mod tests { let Some(fixture) = repo_fixture_or_skip() else { return; }; - let temp_dir = fixture.repo_dir.clone(); let mut cmd = assert_cmd::Command::cargo_bin("buckets").expect("failed to run command"); - cmd.current_dir(temp_dir.as_path()) + cmd.current_dir(fixture.bucket_dir.as_path()) .arg("expect") + .arg("Test Expectation") .assert() .success(); } + /// Test semantic duplicate detection warns when similar expectations exist + /// Note: Requires model download on first run (~90MB) + #[test] + #[serial] + #[ignore] // Run with: cargo test -- --ignored + fn test_cli_expect_duplicate_warning() { + let Some(fixture) = repo_fixture_or_skip() else { + return; + }; + + // Create first expectation + let mut cmd1 = assert_cmd::Command::cargo_bin("buckets").expect("failed to run command"); + cmd1.current_dir(fixture.bucket_dir.as_path()) + .arg("expect") + .arg("API response time should be under 200ms") + .assert() + .success(); + + // Create semantically similar expectation - should warn + let mut cmd2 = assert_cmd::Command::cargo_bin("buckets").expect("failed to run command"); + cmd2.current_dir(fixture.bucket_dir.as_path()) + .arg("expect") + .arg("The API must respond quickly within 200 milliseconds") + .assert() + .success() + .stdout(predicate::str::contains("Similar expectations found")); + } + fn repo_fixture_or_skip() -> Option { match RepoFixture::new() { Ok(fixture) => Some(fixture), diff --git a/tests/test_cli_finalize.rs b/tests/test_cli_finalize.rs index 30040d6..e18c00d 100644 --- a/tests/test_cli_finalize.rs +++ b/tests/test_cli_finalize.rs @@ -18,9 +18,9 @@ mod tests { let Some(fixture) = repo_fixture_or_skip() else { return; }; - let temp_dir = fixture.repo_dir.clone(); + let _temp_dir = fixture.repo_dir.clone(); let mut cmd = assert_cmd::Command::cargo_bin("buckets").expect("failed to run command"); - cmd.current_dir(temp_dir.as_path()) + cmd.current_dir(fixture.bucket_dir.as_path()) .arg("finalize") .assert() .success(); diff --git a/tests/tests.rs b/tests/tests.rs index 2ce0883..bd73d13 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -25,9 +25,10 @@ mod tests { let Some(fixture) = repo_fixture_or_skip() else { return; }; - let temp_dir = fixture.repo_dir.clone(); + // Must run from inside a bucket directory when no bucket name is provided + let bucket_dir = fixture.bucket_dir.clone(); let mut cmd = assert_cmd::Command::cargo_bin("buckets").expect("failed to run command"); - cmd.current_dir(temp_dir.as_path()) + cmd.current_dir(bucket_dir.as_path()) .arg("check") .assert() .success();