From 0089d86a52fb7a679c9480ddd212ace6a5805eef Mon Sep 17 00:00:00 2001 From: Ivan Pleshkov Date: Mon, 2 Mar 2026 15:37:18 +0100 Subject: [PATCH] structured-vectors --- src/args.rs | 14 +++++ src/common.rs | 19 ++++-- src/main.rs | 30 ++++++++-- src/query.rs | 9 ++- src/search.rs | 7 ++- src/structured_vectors.rs | 121 ++++++++++++++++++++++++++++++++++++++ src/upload.rs | 8 ++- src/upsert.rs | 8 ++- 8 files changed, 202 insertions(+), 14 deletions(-) create mode 100644 src/structured_vectors.rs diff --git a/src/args.rs b/src/args.rs index db50ff0..bbf191e 100644 --- a/src/args.rs +++ b/src/args.rs @@ -396,6 +396,20 @@ pub struct Args { /// Set a custom full-scan threshold. #[clap(long)] pub full_scan_threshold: Option, + + /// Enable structured vector generation (Poincaré disk + low-rank projection). + /// Generates vectors with inherent cluster structure instead of uniform random. + #[clap(long, default_value_t = false)] + pub structured_vectors: bool, + + /// Intrinsic dimension for structured vector generation. + /// Lower values create stronger cluster structure. [default: clamp(dim/8, 4, 32)] + #[clap(long, value_parser = parse_number)] + pub intrinsic_dim: Option, + + /// Gaussian noise sigma for structured vector generation. 
[default: 0.1] + #[clap(long, default_value_t = 0.1)] + pub noise_sigma: f32, } #[derive(Copy, Clone, Debug)] diff --git a/src/common.rs b/src/common.rs index 2c46120..264ca35 100644 --- a/src/common.rs +++ b/src/common.rs @@ -289,7 +289,11 @@ pub fn random_filter( have_any.then_some(filter) } -pub fn random_vector(rng: &mut impl Rng, args: &Args) -> Vector { +pub fn random_vector( + rng: &mut impl Rng, + args: &Args, + generator: Option<&crate::structured_vectors::StructuredVectorGenerator>, +) -> Vector { let is_uint = args .datatype .as_ref() @@ -297,11 +301,11 @@ pub fn random_vector(rng: &mut impl Rng, args: &Args) -> Vector { .unwrap_or(false); if let Some(count) = args.multivector_size { let multivector: Vec<_> = (0..count) - .map(|_| random_dense_vector(rng, args.dim, is_uint)) + .map(|_| random_dense_vector(rng, args.dim, is_uint, generator)) .collect(); Vector::new_multi(multivector) } else { - random_dense_vector(rng, args.dim, is_uint).into() + random_dense_vector(rng, args.dim, is_uint, generator).into() } } @@ -324,9 +328,16 @@ pub fn random_sparse_vector(rng: &mut impl Rng, max_size: usize, sparsity: f64) pairs } -pub fn random_dense_vector(rng: &mut impl Rng, dim: usize, is_uint: bool) -> Vec { +pub fn random_dense_vector( + rng: &mut impl Rng, + dim: usize, + is_uint: bool, + generator: Option<&crate::structured_vectors::StructuredVectorGenerator>, +) -> Vec { if is_uint { (0..dim).map(|_| rng.random_range(0..255) as f32).collect() + } else if let Some(structured) = generator { + structured.generate(rng) } else { (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect() } diff --git a/src/main.rs b/src/main.rs index ddd1ff3..4028658 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,7 @@ use clap::{CommandFactory, FromArgMatches}; use tokio::runtime; use args::Args; +use structured_vectors::StructuredVectorGenerator; mod args; mod client; @@ -18,10 +19,15 @@ mod save_jsonl; mod scroll; mod search; mod stats; +mod structured_vectors; mod upload; 
mod upsert; -async fn run_benchmark(args: Args, stopped: Arc) -> Result<()> { +async fn run_benchmark( + args: Args, + generator: Option>, + stopped: Arc, +) -> Result<()> { if args.search_quality && args.search_exact { println!("Ignoring `exact` flag because `search_quality` is also enabled!"); } @@ -31,7 +37,7 @@ async fn run_benchmark(args: Args, stopped: Arc) -> Result<()> { } if !args.skip_upload && !args.skip_setup { - upload::upload_data(&args, stopped.clone()).await?; + upload::upload_data(&args, generator.clone(), stopped.clone()).await?; } if !args.skip_wait_index && !args.skip_setup { @@ -41,7 +47,7 @@ async fn run_benchmark(args: Args, stopped: Arc) -> Result<()> { } if args.search || args.search_quality { - query::search(&args, stopped.clone()).await?; + query::search(&args, generator.clone(), stopped.clone()).await?; } if args.scroll { @@ -82,6 +88,22 @@ fn parse_args() -> Args { fn main() { let args = parse_args(); + let generator = if args.structured_vectors { + let intrinsic_dim = args + .intrinsic_dim + .unwrap_or_else(|| (args.dim / 8).clamp(4, 32)); + let normalize = args.distance == "Cosine"; + Some(Arc::new(StructuredVectorGenerator::new( + intrinsic_dim, + args.dim, + args.noise_sigma, + normalize, + 42, + ))) + } else { + None + }; + let stopped = Arc::new(AtomicBool::new(false)); let r = stopped.clone(); @@ -97,6 +119,6 @@ fn main() { runtime .unwrap() - .block_on(run_benchmark(args, stopped)) + .block_on(run_benchmark(args, generator, stopped)) .unwrap(); } diff --git a/src/query.rs b/src/query.rs index 15fa962..3d5dd39 100644 --- a/src/query.rs +++ b/src/query.rs @@ -12,12 +12,17 @@ use crate::common::UUID_PAYLOAD_KEY; use crate::scroll::ScrollProcessor; use crate::search::SearchProcessor; use crate::stats::process; +use crate::structured_vectors::StructuredVectorGenerator; -pub async fn search(args: &Args, stopped: Arc) -> Result<()> { +pub async fn search( + args: &Args, + generator: Option>, + stopped: Arc, +) -> Result<()> { let 
clients = create_clients(args)?; let uuids = get_uuids(args, &clients[0]).await?; - let searcher = SearchProcessor::new(args.clone(), stopped.clone(), clients, uuids); + let searcher = SearchProcessor::new(args.clone(), generator, stopped.clone(), clients, uuids); process(args, stopped, searcher).await } diff --git a/src/search.rs b/src/search.rs index 34680bf..9bb30eb 100644 --- a/src/search.rs +++ b/src/search.rs @@ -4,6 +4,7 @@ use crate::common::{ retry_with_clients, }; use crate::processor::Processor; +use crate::structured_vectors::StructuredVectorGenerator; use indicatif::ProgressBar; use qdrant_client::Qdrant; use qdrant_client::qdrant::point_id::PointIdOptions; @@ -30,6 +31,7 @@ struct SearchStats { pub struct SearchProcessor { args: Args, + generator: Option>, stopped: Arc, clients: Vec, pub start_timestamp_millis: f64, @@ -41,12 +43,14 @@ pub struct SearchProcessor { impl SearchProcessor { pub fn new( args: Args, + generator: Option>, stopped: Arc, clients: Vec, uuids: Vec, ) -> Self { SearchProcessor { args, + generator, stopped, clients, start_timestamp_millis: std::time::SystemTime::now() @@ -98,10 +102,11 @@ impl SearchProcessor { None }; + let generator = self.generator.as_deref(); (0..self.args.search_batch_size) .map(|_| { ( - random_dense_vector(rng, self.args.dim, false), + random_dense_vector(rng, self.args.dim, false, generator), None, name.clone(), ) diff --git a/src/structured_vectors.rs b/src/structured_vectors.rs new file mode 100644 index 0000000..faa12da --- /dev/null +++ b/src/structured_vectors.rs @@ -0,0 +1,121 @@ +use rand::SeedableRng; +use rand::{Rng, RngExt}; +use rand_distr::{Distribution, Normal}; +use std::fmt; + +/// Generator for structured dense vectors using Poincaré disk sampling +/// combined with low-rank random projection and Gaussian noise. +/// +/// This produces vectors with inherent cluster structure that indexes +/// well with HNSW, unlike uniform random vectors which suffer from +/// the curse of dimensionality. 
+pub struct StructuredVectorGenerator {
+    /// Projection matrix of shape [intrinsic_dim × target_dim], stored row-major.
+    projection: Vec<f32>,
+    /// Intrinsic dimension (Poincaré disk dimension).
+    intrinsic_dim: usize,
+    /// Target dimension (output vector dimension).
+    target_dim: usize,
+    /// Gaussian noise standard deviation.
+    noise_sigma: f32,
+    /// Whether to L2-normalize the output vectors.
+    normalize: bool,
+}
+
+/// Hand-written `Debug` that leaves the `projection` matrix out of the output.
+impl fmt::Debug for StructuredVectorGenerator {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("StructuredVectorGenerator")
+            .field("intrinsic_dim", &self.intrinsic_dim)
+            .field("target_dim", &self.target_dim)
+            .field("noise_sigma", &self.noise_sigma)
+            .field("normalize", &self.normalize)
+            .finish()
+    }
+}
+
+impl StructuredVectorGenerator {
+    /// Builds a generator; the projection is sampled from `Normal(0, 1)` by an RNG seeded with `seed`.
+    pub fn new(
+        intrinsic_dim: usize,
+        target_dim: usize,
+        noise_sigma: f32,
+        normalize: bool,
+        seed: u64,
+    ) -> Self {
+        let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+        let normal = Normal::new(0.0f32, 1.0f32).expect("std-dev 1.0 is finite");
+        let projection: Vec<f32> = (0..intrinsic_dim * target_dim)
+            .map(|_| normal.sample(&mut rng))
+            .collect();
+
+        Self {
+            projection,
+            intrinsic_dim,
+            target_dim,
+            noise_sigma,
+            normalize,
+        }
+    }
+
+    /// Generate a structured dense vector of length `target_dim`.
+    ///
+    /// Noise is skipped unless `noise_sigma > 0.0`; panics if `Normal::new` rejects the sigma.
+    pub fn generate(&self, rng: &mut impl Rng) -> Vec<f32> {
+        // Project a sampled ball point from intrinsic_dim to target_dim via matrix multiply
+        let ball_point = self.sample_poincare_disk(rng);
+        let mut result = vec![0.0f32; self.target_dim];
+        for (i, &coord) in ball_point.iter().enumerate() {
+            let row = &self.projection[i * self.target_dim..(i + 1) * self.target_dim];
+            for (acc, &weight) in result.iter_mut().zip(row) {
+                *acc += coord * weight;
+            }
+        }
+
+        // Add Gaussian noise
+        if self.noise_sigma > 0.0 {
+            let noise_dist = Normal::new(0.0f32, self.noise_sigma)
+                .expect("noise_sigma must be finite");
+            for val in &mut result {
+                *val += noise_dist.sample(rng);
+            }
+        }
+
+        // Optionally L2-normalize (for cosine distance)
+        if self.normalize {
+            Self::l2_normalize(&mut result);
+        }
+
+        result
+    }
+
+    /// Sample a point from the Poincaré disk (unit ball) of dimension `intrinsic_dim`.
+    ///
+    /// Uses the Gaussian direction + radial scaling method:
+    /// 1. Sample direction from Normal(0,1), normalize to unit sphere
+    /// 2. Sample radius r = U^(1/k) for uniform distribution in the ball
+    fn sample_poincare_disk(&self, rng: &mut impl Rng) -> Vec<f32> {
+        let normal = Normal::new(0.0f32, 1.0f32).expect("std-dev 1.0 is finite");
+        let mut point: Vec<f32> = (0..self.intrinsic_dim)
+            .map(|_| normal.sample(rng))
+            .collect();
+        Self::l2_normalize(&mut point); // direction on the unit sphere
+
+        // Radial scaling: U^(1/k) gives uniform distribution in the k-dimensional ball
+        let u: f32 = rng.random_range(0.0..1.0);
+        let radius = u.powf(1.0 / self.intrinsic_dim as f32);
+        for val in &mut point {
+            *val *= radius;
+        }
+
+        point
+    }
+
+    /// Scales `values` in place to unit L2 norm; a slice whose norm is not `> 0.0` is untouched.
+    fn l2_normalize(values: &mut [f32]) {
+        let norm: f32 = values.iter().map(|x| x * x).sum::<f32>().sqrt();
+        if norm > 0.0 {
+            values.iter_mut().for_each(|val| *val /= norm);
+        }
+    }
+}
diff --git a/src/upload.rs b/src/upload.rs index 8a78086..a870647 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -12,9 +12,14 @@ use crate::args::Args; use crate::client::get_config; use crate::common::throttler; use crate::fbin_reader::FBinReader; +use crate::structured_vectors::StructuredVectorGenerator; use crate::upsert::UpsertProcessor; -pub async fn upload_data(args: &Args, stopped: Arc) -> Result<()> { +pub async fn upload_data( + args: &Args, + generator: Option>, + stopped: Arc, +) -> Result<()> { let mut clients = Vec::new(); for config in get_config(args) { clients.push(qdrant_client::Qdrant::new(config)?); @@ -47,6 +52,7 @@ pub async fn upload_data(args: &Args, stopped: Arc) -> Result<()> { }; let upserter = UpsertProcessor::new( args.clone(), + generator, stopped.clone(), clients, sent_bar_arc.clone(), diff --git a/src/upsert.rs b/src/upsert.rs index 934961b..249d2e5 100644 --- a/src/upsert.rs +++ b/src/upsert.rs @@ -23,6 +23,7 @@ use crate::common::{ }; use crate::fbin_reader::FBinReader; use crate::save_jsonl::save_timings_as_jsonl; +use crate::structured_vectors::StructuredVectorGenerator; fn log_points(points: &[PointStruct]) -> impl FnOnce(QdrantError) -> QdrantError + use<'_> { move |e| { @@ -46,6 +47,7 @@ fn log_points(points: &[PointStruct]) -> impl FnOnce(QdrantError) -> 
QdrantError pub struct UpsertProcessor { args: Args, + generator: Option>, stopped: Arc, clients: Vec, progress_bar: Arc, @@ -58,6 +60,7 @@ pub struct UpsertProcessor { impl UpsertProcessor { pub fn new( args: Args, + generator: Option>, stopped: Arc, clients: Vec, progress_bar: Arc, @@ -65,6 +68,7 @@ impl UpsertProcessor { ) -> Self { UpsertProcessor { args, + generator, stopped, clients, progress_bar, @@ -116,13 +120,13 @@ impl UpsertProcessor { let vectors_map: HashMap<_, _> = (0..self.args.vectors_per_point) .map(|i| { let vector_name = format!("{i}"); - let vector = random_vector(&mut rng, &self.args); + let vector = random_vector(&mut rng, &self.args, self.generator.as_deref()); (vector_name, vector) }) .collect(); vectors_map.into() } else { - random_vector(&mut rng, &self.args).into() + random_vector(&mut rng, &self.args, self.generator.as_deref()).into() }; let vectors: Vectors = if let Some(sparsity) = self.args.sparse_vectors {