Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,20 @@ pub struct Args {
/// Set a custom full-scan threshold.
#[clap(long)]
pub full_scan_threshold: Option<usize>,

/// Enable structured vector generation (Poincaré disk + low-rank projection).
/// Generates vectors with inherent cluster structure instead of uniform random.
#[clap(long, default_value_t = false)]
pub structured_vectors: bool,

/// Intrinsic dimension for structured vector generation.
/// Lower values create stronger cluster structure. [default: clamp(dim/8, 4, 32)]
#[clap(long, value_parser = parse_number)]
pub intrinsic_dim: Option<usize>,

/// Gaussian noise sigma (standard deviation) for structured vector generation.
#[clap(long, default_value_t = 0.1)]
pub noise_sigma: f32,
}

#[derive(Copy, Clone, Debug)]
Expand Down
19 changes: 15 additions & 4 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,19 +289,23 @@ pub fn random_filter(
have_any.then_some(filter)
}

pub fn random_vector(rng: &mut impl Rng, args: &Args) -> Vector {
pub fn random_vector(
rng: &mut impl Rng,
args: &Args,
generator: Option<&crate::structured_vectors::StructuredVectorGenerator>,
) -> Vector {
let is_uint = args
.datatype
.as_ref()
.map(|x| x == Datatype::Uint8.as_str_name())
.unwrap_or(false);
if let Some(count) = args.multivector_size {
let multivector: Vec<_> = (0..count)
.map(|_| random_dense_vector(rng, args.dim, is_uint))
.map(|_| random_dense_vector(rng, args.dim, is_uint, generator))
.collect();
Vector::new_multi(multivector)
} else {
random_dense_vector(rng, args.dim, is_uint).into()
random_dense_vector(rng, args.dim, is_uint, generator).into()
}
}

Expand All @@ -324,9 +328,16 @@ pub fn random_sparse_vector(rng: &mut impl Rng, max_size: usize, sparsity: f64)
pairs
}

pub fn random_dense_vector(rng: &mut impl Rng, dim: usize, is_uint: bool) -> Vec<f32> {
pub fn random_dense_vector(
rng: &mut impl Rng,
dim: usize,
is_uint: bool,
generator: Option<&crate::structured_vectors::StructuredVectorGenerator>,
) -> Vec<f32> {
if is_uint {
(0..dim).map(|_| rng.random_range(0..255) as f32).collect()
} else if let Some(structured) = generator {
structured.generate(rng)
} else {
(0..dim).map(|_| rng.random_range(-1.0..1.0)).collect()
}
Expand Down
30 changes: 26 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use clap::{CommandFactory, FromArgMatches};
use tokio::runtime;

use args::Args;
use structured_vectors::StructuredVectorGenerator;

mod args;
mod client;
Expand All @@ -18,10 +19,15 @@ mod save_jsonl;
mod scroll;
mod search;
mod stats;
mod structured_vectors;
mod upload;
mod upsert;

async fn run_benchmark(args: Args, stopped: Arc<AtomicBool>) -> Result<()> {
async fn run_benchmark(
args: Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
) -> Result<()> {
if args.search_quality && args.search_exact {
println!("Ignoring `exact` flag because `search_quality` is also enabled!");
}
Expand All @@ -31,7 +37,7 @@ async fn run_benchmark(args: Args, stopped: Arc<AtomicBool>) -> Result<()> {
}

if !args.skip_upload && !args.skip_setup {
upload::upload_data(&args, stopped.clone()).await?;
upload::upload_data(&args, generator.clone(), stopped.clone()).await?;
}

if !args.skip_wait_index && !args.skip_setup {
Expand All @@ -41,7 +47,7 @@ async fn run_benchmark(args: Args, stopped: Arc<AtomicBool>) -> Result<()> {
}

if args.search || args.search_quality {
query::search(&args, stopped.clone()).await?;
query::search(&args, generator.clone(), stopped.clone()).await?;
}

if args.scroll {
Expand Down Expand Up @@ -82,6 +88,22 @@ fn parse_args() -> Args {
fn main() {
let args = parse_args();

let generator = if args.structured_vectors {
let intrinsic_dim = args
.intrinsic_dim
.unwrap_or_else(|| (args.dim / 8).clamp(4, 32));
let normalize = args.distance == "Cosine";
Some(Arc::new(StructuredVectorGenerator::new(
intrinsic_dim,
args.dim,
args.noise_sigma,
normalize,
42,
)))
} else {
None
};

let stopped = Arc::new(AtomicBool::new(false));
let r = stopped.clone();

Expand All @@ -97,6 +119,6 @@ fn main() {

runtime
.unwrap()
.block_on(run_benchmark(args, stopped))
.block_on(run_benchmark(args, generator, stopped))
.unwrap();
}
9 changes: 7 additions & 2 deletions src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,17 @@ use crate::common::UUID_PAYLOAD_KEY;
use crate::scroll::ScrollProcessor;
use crate::search::SearchProcessor;
use crate::stats::process;
use crate::structured_vectors::StructuredVectorGenerator;

pub async fn search(args: &Args, stopped: Arc<AtomicBool>) -> Result<()> {
pub async fn search(
args: &Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
) -> Result<()> {
let clients = create_clients(args)?;
let uuids = get_uuids(args, &clients[0]).await?;

let searcher = SearchProcessor::new(args.clone(), stopped.clone(), clients, uuids);
let searcher = SearchProcessor::new(args.clone(), generator, stopped.clone(), clients, uuids);
process(args, stopped, searcher).await
}

Expand Down
7 changes: 6 additions & 1 deletion src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use crate::common::{
retry_with_clients,
};
use crate::processor::Processor;
use crate::structured_vectors::StructuredVectorGenerator;
use indicatif::ProgressBar;
use qdrant_client::Qdrant;
use qdrant_client::qdrant::point_id::PointIdOptions;
Expand All @@ -30,6 +31,7 @@ struct SearchStats {

pub struct SearchProcessor {
args: Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
clients: Vec<Qdrant>,
pub start_timestamp_millis: f64,
Expand All @@ -41,12 +43,14 @@ pub struct SearchProcessor {
impl SearchProcessor {
pub fn new(
args: Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
clients: Vec<Qdrant>,
uuids: Vec<String>,
) -> Self {
SearchProcessor {
args,
generator,
stopped,
clients,
start_timestamp_millis: std::time::SystemTime::now()
Expand Down Expand Up @@ -98,10 +102,11 @@ impl SearchProcessor {
None
};

let generator = self.generator.as_deref();
(0..self.args.search_batch_size)
.map(|_| {
(
random_dense_vector(rng, self.args.dim, false),
random_dense_vector(rng, self.args.dim, false, generator),
None,
name.clone(),
)
Expand Down
121 changes: 121 additions & 0 deletions src/structured_vectors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
use rand::SeedableRng;
use rand::{Rng, RngExt};
use rand_distr::{Distribution, Normal};
use std::fmt;

/// Generator for structured dense vectors.
///
/// Each vector is produced by drawing a point of dimension `intrinsic_dim`
/// (called a "Poincaré disk" sample in this file; the sampler draws it from
/// the unit ball), multiplying it by a fixed random projection matrix to reach
/// `target_dim` components, adding Gaussian noise, and optionally
/// L2-normalizing the result.
///
/// The intent is to give the data inherent low-dimensional structure that
/// indexes well with HNSW, in contrast to uniform random vectors, which suffer
/// from the curse of dimensionality.
pub struct StructuredVectorGenerator {
/// Projection matrix of shape [intrinsic_dim × target_dim], stored row-major:
/// row `i` occupies indices `i * target_dim .. (i + 1) * target_dim`.
projection: Vec<f32>,
/// Intrinsic dimension (Poincaré disk dimension), i.e. the number of rows of
/// `projection`.
intrinsic_dim: usize,
/// Target dimension (output vector dimension), i.e. the number of columns of
/// `projection`.
target_dim: usize,
/// Gaussian noise standard deviation. Noise is only added when this is
/// greater than zero.
noise_sigma: f32,
/// Whether to L2-normalize the output vectors.
normalize: bool,
}

impl fmt::Debug for StructuredVectorGenerator {
    /// Prints the scalar configuration of the generator.
    ///
    /// The projection matrix is deliberately left out of the output: it holds
    /// `intrinsic_dim * target_dim` floats and would drown the useful fields.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Destructure so that adding a field to the struct forces a decision
        // here about whether it should be printed.
        let Self {
            projection: _,
            intrinsic_dim,
            target_dim,
            noise_sigma,
            normalize,
        } = self;

        let mut out = f.debug_struct("StructuredVectorGenerator");
        out.field("intrinsic_dim", intrinsic_dim);
        out.field("target_dim", target_dim);
        out.field("noise_sigma", noise_sigma);
        out.field("normalize", normalize);
        out.finish()
    }
}

impl StructuredVectorGenerator {
    /// Creates a generator whose projection matrix is drawn once, up front.
    ///
    /// * `intrinsic_dim` – dimension of the low-dimensional point that gets
    ///   projected (number of rows of the projection matrix).
    /// * `target_dim` – length of every generated vector (number of columns).
    /// * `noise_sigma` – standard deviation of the Gaussian noise added to each
    ///   component; noise is only added when this is greater than zero.
    /// * `normalize` – whether generated vectors are L2-normalized.
    /// * `seed` – seeds the `StdRng` that fills the projection matrix with
    ///   standard-normal entries.
    ///
    /// # Panics
    ///
    /// Panics if `noise_sigma` is positive infinity. Such a value cannot be
    /// turned into a noise distribution, so every later [`Self::generate`] call
    /// would fail; rejecting it here reports the bad configuration once, before
    /// any work has started.
    pub fn new(
        intrinsic_dim: usize,
        target_dim: usize,
        noise_sigma: f32,
        normalize: bool,
        seed: u64,
    ) -> Self {
        assert!(
            noise_sigma != f32::INFINITY,
            "noise_sigma must be finite, got {noise_sigma}"
        );

        let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
        let standard_normal =
            Normal::new(0.0f32, 1.0f32).expect("mean 0 / std-dev 1 are valid parameters");
        let projection: Vec<f32> = (0..intrinsic_dim * target_dim)
            .map(|_| standard_normal.sample(&mut rng))
            .collect();

        Self {
            projection,
            intrinsic_dim,
            target_dim,
            noise_sigma,
            normalize,
        }
    }

    /// Generate a structured dense vector of length `target_dim`.
    ///
    /// Steps, in the order they consume `rng`:
    /// 1. draw a point from the `intrinsic_dim`-dimensional unit ball,
    /// 2. multiply it by the projection matrix,
    /// 3. add `Normal(0, noise_sigma)` noise to every component (skipped unless
    ///    `noise_sigma > 0`),
    /// 4. L2-normalize if `normalize` is set (a zero vector is left as is).
    pub fn generate(&self, rng: &mut impl Rng) -> Vec<f32> {
        let latent = self.sample_poincare_disk(rng);

        // result = latent · projection, accumulated one matrix row at a time.
        let mut result = vec![0.0f32; self.target_dim];
        // `chunks_exact` panics on a chunk size of zero; with no output
        // components there is nothing to accumulate anyway.
        if self.target_dim > 0 {
            let rows = self.projection.chunks_exact(self.target_dim);
            for (&coord, row) in latent.iter().zip(rows) {
                for (acc, &weight) in result.iter_mut().zip(row) {
                    *acc += coord * weight;
                }
            }
        }

        // Add Gaussian noise. The comparison is false for NaN, so a NaN sigma
        // simply disables noise.
        if self.noise_sigma > 0.0 {
            let noise = Normal::new(0.0f32, self.noise_sigma)
                .expect("noise_sigma is positive here and `new` rejects infinity");
            for val in &mut result {
                *val += noise.sample(rng);
            }
        }

        // Optionally L2-normalize (for cosine distance).
        if self.normalize {
            Self::normalize_in_place(&mut result);
        }

        result
    }

    /// Sample a point uniformly from the unit ball of dimension `intrinsic_dim`.
    ///
    /// Despite the "Poincaré disk" name, the radius is distributed so that
    /// points are uniform in the Euclidean ball (no hyperbolic metric is
    /// involved):
    /// 1. Sample a direction from Normal(0,1) and normalize it to the unit sphere.
    /// 2. Scale by radius r = U^(1/k) with U uniform in [0, 1) and
    ///    k = `intrinsic_dim`.
    fn sample_poincare_disk(&self, rng: &mut impl Rng) -> Vec<f32> {
        let standard_normal =
            Normal::new(0.0f32, 1.0f32).expect("mean 0 / std-dev 1 are valid parameters");
        let mut point: Vec<f32> = (0..self.intrinsic_dim)
            .map(|_| standard_normal.sample(rng))
            .collect();

        // Direction on the unit sphere.
        Self::normalize_in_place(&mut point);

        // Radial scaling: U^(1/k) gives a uniform distribution in the k-ball.
        let u: f32 = rng.random_range(0.0..1.0);
        let radius = u.powf(1.0 / self.intrinsic_dim as f32);
        for val in &mut point {
            *val *= radius;
        }

        point
    }

    /// Euclidean (L2) norm of `values`.
    fn l2_norm(values: &[f32]) -> f32 {
        values.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    /// Divides every component by the L2 norm of `values`.
    ///
    /// A slice whose norm is not strictly positive (all zeros, or containing
    /// NaN) is left untouched to avoid dividing by zero.
    fn normalize_in_place(values: &mut [f32]) {
        let norm = Self::l2_norm(values);
        if norm > 0.0 {
            for val in values.iter_mut() {
                *val /= norm;
            }
        }
    }
}
8 changes: 7 additions & 1 deletion src/upload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ use crate::args::Args;
use crate::client::get_config;
use crate::common::throttler;
use crate::fbin_reader::FBinReader;
use crate::structured_vectors::StructuredVectorGenerator;
use crate::upsert::UpsertProcessor;

pub async fn upload_data(args: &Args, stopped: Arc<AtomicBool>) -> Result<()> {
pub async fn upload_data(
args: &Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
) -> Result<()> {
let mut clients = Vec::new();
for config in get_config(args) {
clients.push(qdrant_client::Qdrant::new(config)?);
Expand Down Expand Up @@ -47,6 +52,7 @@ pub async fn upload_data(args: &Args, stopped: Arc<AtomicBool>) -> Result<()> {
};
let upserter = UpsertProcessor::new(
args.clone(),
generator,
stopped.clone(),
clients,
sent_bar_arc.clone(),
Expand Down
8 changes: 6 additions & 2 deletions src/upsert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use crate::common::{
};
use crate::fbin_reader::FBinReader;
use crate::save_jsonl::save_timings_as_jsonl;
use crate::structured_vectors::StructuredVectorGenerator;

fn log_points(points: &[PointStruct]) -> impl FnOnce(QdrantError) -> QdrantError + use<'_> {
move |e| {
Expand All @@ -46,6 +47,7 @@ fn log_points(points: &[PointStruct]) -> impl FnOnce(QdrantError) -> QdrantError

pub struct UpsertProcessor {
args: Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
clients: Vec<Qdrant>,
progress_bar: Arc<ProgressBar>,
Expand All @@ -58,13 +60,15 @@ pub struct UpsertProcessor {
impl UpsertProcessor {
pub fn new(
args: Args,
generator: Option<Arc<StructuredVectorGenerator>>,
stopped: Arc<AtomicBool>,
clients: Vec<Qdrant>,
progress_bar: Arc<ProgressBar>,
reader: Option<FBinReader>,
) -> Self {
UpsertProcessor {
args,
generator,
stopped,
clients,
progress_bar,
Expand Down Expand Up @@ -116,13 +120,13 @@ impl UpsertProcessor {
let vectors_map: HashMap<_, _> = (0..self.args.vectors_per_point)
.map(|i| {
let vector_name = format!("{i}");
let vector = random_vector(&mut rng, &self.args);
let vector = random_vector(&mut rng, &self.args, self.generator.as_deref());
(vector_name, vector)
})
.collect();
vectors_map.into()
} else {
random_vector(&mut rng, &self.args).into()
random_vector(&mut rng, &self.args, self.generator.as_deref()).into()
};

let vectors: Vectors = if let Some(sparsity) = self.args.sparse_vectors {
Expand Down