From 4c839013b639a6e0bf5a5a6a703d7f5e0c007b4a Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Thu, 18 Jun 2026 20:59:26 +1000 Subject: [PATCH] perf(filtering): split METADATA into thin + fat tables (v2 schema) Re-sequencing after deletes now touches only a ~50 byte/row thin table instead of rewriting full ~27KB rows. Benchmarked at 8ms for 20K rows vs 210ms (v1) / 940ms (v0). - METADATA retains small filterable columns (_subset_, file, name, etc.) - METADATA_CONTENT holds large TEXT columns (code, signature, imports) - Linked by stable monotonic _content_id_ (never re-sequenced) - v0/v1 code paths preserved for backward compat reads - FTS rebuild/update_rows adapted to JOIN for v2 layout - INDEX_FORMAT_VERSION bumped to 2 (triggers full rebuild on upgrade) - Removed v0->v1 in-place migration; format mismatches now full-rebuild --- colgrep/src/index/mod.rs | 41 +- colgrep/src/index/state.rs | 2 +- next-plaid/src/filtering.rs | 1147 ++++++++++++++++++++++++++++++--- next-plaid/src/text_search.rs | 203 ++++-- 4 files changed, 1216 insertions(+), 177 deletions(-) diff --git a/colgrep/src/index/mod.rs b/colgrep/src/index/mod.rs index 85ce8c3..5ff4281 100644 --- a/colgrep/src/index/mod.rs +++ b/colgrep/src/index/mod.rs @@ -1634,7 +1634,7 @@ impl IndexBuilder { // instead of rebuilding from scratch. No-op for non-worktree projects. self.maybe_seed_from_worktree(force); - let mut state = IndexState::load(&self.index_dir)?; + let state = IndexState::load(&self.index_dir)?; let index_dir = get_vector_index_path(&self.index_dir); let index_path = index_dir.to_str().unwrap(); let index_exists = index_dir.join("metadata.json").exists(); @@ -1660,41 +1660,10 @@ impl IndexBuilder { return self.full_rebuild(languages); } - // Format version 0 → 1 did not change the on-disk index layout, only - // added version tracking and the FileInfo.size field (which defaults to - // 0 via serde). Migrate in place rather than discarding the entire index. - if format_mismatch && state.index_format_version == 0 { - state.index_format_version = INDEX_FORMAT_VERSION; - // Stat every still-present entry to fill in missing sizes and upgrade - // legacy second-precision mtimes, restoring the stat-only fast path - // without re-embedding. Entries whose file is gone are deliberately - // LEFT in the state: the incremental update below will see them - // missing from disk, mark them deleted, and remove them from every - // store (vector index + metadata + FTS5) via delete_files_from_index. - // Dropping them here would instead orphan their index rows — the - // deleted content would stay searchable until a later periodic cleanup. - for (path, info) in state.files.iter_mut() { - let full = self.project_root.join(path); - let Ok((mtime, size)) = file_stat(&full) else { - continue; // gone on disk → handled as a deletion by the update - }; - if info.size == 0 { - info.size = size; - } - // Legacy states store mtime in seconds; current code uses - // nanoseconds. Upgrade precision when the file hasn't been - // modified (same second), otherwise leave the value stale so the - // hash pass below catches the real change. - if info.mtime < 10_000_000_000_000 { - if info.mtime == mtime / 1_000_000_000 { - info.mtime = mtime; - } - } else if info.mtime == 0 { - info.mtime = mtime; - } - } - state.save(&self.index_dir)?; - } else if format_mismatch { + // Format version 0/1 to 2 changes the metadata DB schema (thin/fat table + // split), requiring a full rebuild. Drop any in-progress resumable-build + // marker so we don't try to resume an index we're discarding. + if format_mismatch { let _ = std::fs::remove_file(self.index_dir.join(BUILDING_MARKER)); return self.full_rebuild(languages); } diff --git a/colgrep/src/index/state.rs b/colgrep/src/index/state.rs index 47d237e..efeffdc 100644 --- a/colgrep/src/index/state.rs +++ b/colgrep/src/index/state.rs @@ -12,7 +12,7 @@ use super::paths::get_state_path; /// metadata schema). Bump ONLY for incompatible changes: a mismatch discards /// the index and re-embeds the entire project on the next run. Routine CLI /// releases must NOT bump this. -pub const INDEX_FORMAT_VERSION: u32 = 1; +pub const INDEX_FORMAT_VERSION: u32 = 2; #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct IndexState { diff --git a/next-plaid/src/filtering.rs b/next-plaid/src/filtering.rs index 82fa7e4..a27d7c5 100644 --- a/next-plaid/src/filtering.rs +++ b/next-plaid/src/filtering.rs @@ -62,6 +62,34 @@ const SUBSET_INDEX_NAME: &str = "idx_metadata_subset"; /// binaries (e.g. a deployed next-plaid-api) read a v1 index without modification. const METADATA_SCHEMA_V1: i64 = 1; +/// `PRAGMA user_version` value for the thin/fat split layout. METADATA holds only +/// small filterable columns + `_content_id_` FK; METADATA_CONTENT holds the large +/// TEXT columns (code, signature, etc). Re-sequencing on delete touches only the +/// thin table, making it position-independent regardless of row count. +pub(crate) const METADATA_SCHEMA_V2: i64 = 2; + +/// Content table name for the v2 split layout. +pub(crate) const CONTENT_TABLE: &str = "METADATA_CONTENT"; + +/// Column linking METADATA to METADATA_CONTENT in the v2 layout. +pub(crate) const CONTENT_ID_COLUMN: &str = "_content_id_"; + +/// Columns that belong in the thin METADATA table (v2). +/// All other columns go into METADATA_CONTENT. +const THIN_COLUMNS: &[&str] = &[ + "file", + "name", + "qualified_name", + "line", + "end_line", + "language", + "unit_type", + "complexity", + "has_loops", + "has_branches", + "has_error_handling", +]; + /// Validate that a column name is a safe SQL identifier. /// /// Column names must start with a letter or underscore, followed by @@ -508,10 +536,26 @@ impl<'a> ConditionValidator<'a> { /// Get column names from the database schema. fn get_schema_columns(conn: &Connection) -> Result> { let mut columns = HashSet::new(); + let split = is_split_schema(conn); let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; for row in rows { - columns.insert(row?); + let col = row?; + if split && col == CONTENT_ID_COLUMN { + continue; + } + columns.insert(col); + } + // For v2, also include columns from the content table. + if split { + let mut stmt = conn.prepare(&format!("PRAGMA table_info({})", CONTENT_TABLE))?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != CONTENT_ID_COLUMN { + columns.insert(col); + } + } } Ok(columns) } @@ -700,11 +744,21 @@ fn validate_fixed_columns(columns: &[(&str, &str)]) -> Result<()> { } /// Read the metadata schema version from `PRAGMA user_version` (0 = legacy). -fn metadata_schema_version(conn: &Connection) -> i64 { +pub(crate) fn metadata_schema_version(conn: &Connection) -> i64 { conn.query_row("PRAGMA user_version", [], |row| row.get(0)) .unwrap_or(0) } +/// Check if the DB uses the v2 split layout. +pub(crate) fn is_split_schema(conn: &Connection) -> bool { + metadata_schema_version(conn) >= METADATA_SCHEMA_V2 +} + +/// Determine if a column belongs to the thin (METADATA) or fat (METADATA_CONTENT) table. +fn is_thin_column(col: &str) -> bool { + col == SUBSET_COLUMN || col == CONTENT_ID_COLUMN || THIN_COLUMNS.contains(&col) +} + /// Create the (non-unique) index over `_subset_` used by the v1 layout. /// /// Non-unique on purpose: the dense-0..N-1 uniqueness of `_subset_` is guaranteed @@ -823,18 +877,36 @@ fn ensure_fast_subset_schema(conn: &Connection) -> Result<()> { } fn create_fixed_metadata_table(conn: &Connection, columns: &[(&str, &str)]) -> Result<()> { - // The caller has already committed to a stable schema, so we can skip the - // generic "discover columns as we go" logic and create the table directly. - // `_subset_` is a regular column (v1 layout) so delete re-sequencing never - // relocates fat rows; a separate index keeps lookups fast. - let mut col_defs = vec![format!("\"{}\" INTEGER NOT NULL", SUBSET_COLUMN)]; + // v2 split layout: thin METADATA table for fast re-sequencing, fat + // METADATA_CONTENT table for large columns that never move. + let mut thin_col_defs = vec![ + format!("\"{}\" INTEGER NOT NULL", SUBSET_COLUMN), + format!("\"{}\" INTEGER NOT NULL", CONTENT_ID_COLUMN), + ]; + let mut fat_col_defs = vec![format!("\"{}\" INTEGER PRIMARY KEY", CONTENT_ID_COLUMN)]; + for (name, sql_type) in columns { - col_defs.push(format!("\"{}\" {}", name, sql_type)); + if is_thin_column(name) { + thin_col_defs.push(format!("\"{}\" {}", name, sql_type)); + } else { + fat_col_defs.push(format!("\"{}\" {}", name, sql_type)); + } } - let create_sql = format!("CREATE TABLE METADATA ({})", col_defs.join(", ")); - conn.execute(&create_sql, [])?; + + conn.execute( + &format!("CREATE TABLE METADATA ({})", thin_col_defs.join(", ")), + [], + )?; + conn.execute( + &format!( + "CREATE TABLE {} ({})", + CONTENT_TABLE, + fat_col_defs.join(", ") + ), + [], + )?; create_subset_index(conn)?; - conn.execute_batch(&format!("PRAGMA user_version={}", METADATA_SCHEMA_V1))?; + conn.execute_batch(&format!("PRAGMA user_version={}", METADATA_SCHEMA_V2))?; Ok(()) } @@ -844,6 +916,9 @@ fn insert_fixed_metadata_rows( metadata: &[Value], doc_ids: &[i64], ) -> Result { + if is_split_schema(conn) { + return insert_fixed_metadata_rows_v2(conn, columns, metadata, doc_ids); + } let txn = conn.transaction()?; let mut column_names = vec![format!("\"{}\"", SUBSET_COLUMN)]; column_names.extend(columns.iter().map(|(name, _)| format!("\"{}\"", name))); @@ -861,8 +936,6 @@ fn insert_fixed_metadata_rows( })?; let mut values: Vec> = vec![Box::new(doc_ids[i])]; for (column_name, _) in columns { - // Reuse the generic JSON-to-SQL coercion so the fast path stores - // values exactly like the slower schema-discovery path. let value = obj.get(*column_name).unwrap_or(&Value::Null); values.push(json_to_sql(value)); } @@ -874,6 +947,86 @@ fn insert_fixed_metadata_rows( Ok(metadata.len()) } +fn insert_fixed_metadata_rows_v2( + conn: &mut Connection, + columns: &[(&str, &str)], + metadata: &[Value], + doc_ids: &[i64], +) -> Result { + let thin_cols: Vec<&(&str, &str)> = columns.iter().filter(|(n, _)| is_thin_column(n)).collect(); + let fat_cols: Vec<&(&str, &str)> = columns.iter().filter(|(n, _)| !is_thin_column(n)).collect(); + + let next_content_id: i64 = conn + .query_row( + &format!( + "SELECT COALESCE(MAX(\"{}\"), -1) + 1 FROM {}", + CONTENT_ID_COLUMN, CONTENT_TABLE + ), + [], + |row| row.get(0), + ) + .unwrap_or(0); + + let txn = conn.transaction()?; + + // Prepare fat table INSERT + let mut fat_col_names = vec![format!("\"{}\"", CONTENT_ID_COLUMN)]; + fat_col_names.extend(fat_cols.iter().map(|(name, _)| format!("\"{}\"", name))); + let fat_placeholders: Vec<&str> = std::iter::repeat_n("?", fat_cols.len() + 1).collect(); + let fat_insert_sql = format!( + "INSERT INTO {} ({}) VALUES ({})", + CONTENT_TABLE, + fat_col_names.join(", "), + fat_placeholders.join(", ") + ); + + // Prepare thin table INSERT + let mut thin_col_names = vec![ + format!("\"{}\"", SUBSET_COLUMN), + format!("\"{}\"", CONTENT_ID_COLUMN), + ]; + thin_col_names.extend(thin_cols.iter().map(|(name, _)| format!("\"{}\"", name))); + let thin_placeholders: Vec<&str> = std::iter::repeat_n("?", thin_cols.len() + 2).collect(); + let thin_insert_sql = format!( + "INSERT INTO METADATA ({}) VALUES ({})", + thin_col_names.join(", "), + thin_placeholders.join(", ") + ); + + { + let mut fat_stmt = txn.prepare_cached(&fat_insert_sql)?; + let mut thin_stmt = txn.prepare_cached(&thin_insert_sql)?; + + for (i, item) in metadata.iter().enumerate() { + let obj = item.as_object().ok_or_else(|| { + Error::Filtering("Expected metadata rows to be JSON objects".into()) + })?; + let content_id = next_content_id + i as i64; + + // Insert into fat table + let mut fat_values: Vec> = vec![Box::new(content_id)]; + for (column_name, _) in &fat_cols { + let value = obj.get(*column_name).unwrap_or(&Value::Null); + fat_values.push(json_to_sql(value)); + } + let fat_params: Vec<&dyn ToSql> = fat_values.iter().map(|v| v.as_ref()).collect(); + fat_stmt.execute(params_from_iter(fat_params))?; + + // Insert into thin table + let mut thin_values: Vec> = + vec![Box::new(doc_ids[i]), Box::new(content_id)]; + for (column_name, _) in &thin_cols { + let value = obj.get(*column_name).unwrap_or(&Value::Null); + thin_values.push(json_to_sql(value)); + } + let thin_params: Vec<&dyn ToSql> = thin_values.iter().map(|v| v.as_ref()).collect(); + thin_stmt.execute(params_from_iter(thin_params))?; + } + } + txn.commit()?; + Ok(metadata.len()) +} + fn try_fixed_schema_from_first_row( metadata: &[Value], ) -> Result>> { @@ -1027,7 +1180,6 @@ pub fn create(index_path: &str, metadata: &[Value], doc_ids: &[i64]) -> Result Result Result = columns.iter().filter(|c| is_thin_column(c)).collect(); + let fat_columns: Vec<&String> = columns.iter().filter(|c| !is_thin_column(c)).collect(); + + let mut thin_col_defs = vec![ + format!("\"{}\" INTEGER NOT NULL", SUBSET_COLUMN), + format!("\"{}\" INTEGER NOT NULL", CONTENT_ID_COLUMN), + ]; + for col in &thin_columns { + let sql_type = column_types.get(col.as_str()).copied().unwrap_or("TEXT"); + thin_col_defs.push(format!("\"{}\" {}", col, sql_type)); + } + + let mut fat_col_defs = vec![format!("\"{}\" INTEGER PRIMARY KEY", CONTENT_ID_COLUMN)]; + for col in &fat_columns { + let sql_type = column_types.get(col.as_str()).copied().unwrap_or("TEXT"); + fat_col_defs.push(format!("\"{}\" {}", col, sql_type)); } let txn = conn.transaction()?; - let create_sql = format!("CREATE TABLE METADATA ({})", col_defs.join(", ")); - txn.execute(&create_sql, [])?; + txn.execute( + &format!("CREATE TABLE METADATA ({})", thin_col_defs.join(", ")), + [], + )?; + txn.execute( + &format!( + "CREATE TABLE {} ({})", + CONTENT_TABLE, + fat_col_defs.join(", ") + ), + [], + )?; txn.execute( &format!( "CREATE INDEX IF NOT EXISTS \"{}\" ON METADATA (\"{}\")", @@ -1067,43 +1239,70 @@ pub fn create(index_path: &str, metadata: &[Value], doc_ids: &[i64]) -> Result = std::iter::repeat_n("?", columns.len() + 1).collect(); - let insert_sql = if columns.is_empty() { - format!("INSERT INTO METADATA (\"{}\") VALUES (?)", SUBSET_COLUMN,) - } else { - let col_names: Vec = columns.iter().map(|c| format!("\"{}\"", c)).collect(); - format!( - "INSERT INTO METADATA (\"{}\", {}) VALUES ({})", - SUBSET_COLUMN, - col_names.join(", "), - placeholders.join(", ") - ) - }; + // Prepare INSERT statements for both tables + let thin_col_names: Vec = std::iter::once(format!("\"{}\"", SUBSET_COLUMN)) + .chain(std::iter::once(format!("\"{}\"", CONTENT_ID_COLUMN))) + .chain(thin_columns.iter().map(|c| format!("\"{}\"", c))) + .collect(); + let thin_placeholders: Vec<&str> = std::iter::repeat_n("?", thin_columns.len() + 2).collect(); + let thin_insert_sql = format!( + "INSERT INTO METADATA ({}) VALUES ({})", + thin_col_names.join(", "), + thin_placeholders.join(", ") + ); + + let fat_col_names: Vec = std::iter::once(format!("\"{}\"", CONTENT_ID_COLUMN)) + .chain(fat_columns.iter().map(|c| format!("\"{}\"", c))) + .collect(); + let fat_placeholders: Vec<&str> = std::iter::repeat_n("?", fat_columns.len() + 1).collect(); + let fat_insert_sql = format!( + "INSERT INTO {} ({}) VALUES ({})", + CONTENT_TABLE, + fat_col_names.join(", "), + fat_placeholders.join(", ") + ); { - let mut stmt = txn.prepare(&insert_sql)?; + let mut thin_stmt = txn.prepare(&thin_insert_sql)?; + let mut fat_stmt = txn.prepare(&fat_insert_sql)?; for (i, item) in metadata.iter().enumerate() { - let mut values: Vec> = vec![Box::new(doc_ids[i])]; + let content_id = doc_ids[i]; + + // Insert into fat table + let mut fat_values: Vec> = vec![Box::new(content_id)]; if let Value::Object(obj) = item { - for col in &columns { - let value = obj.get(col).unwrap_or(&Value::Null); - values.push(json_to_sql(value)); + for col in &fat_columns { + let value = obj.get(col.as_str()).unwrap_or(&Value::Null); + fat_values.push(json_to_sql(value)); } } else { - // If not an object, insert nulls - for _ in &columns { - values.push(Box::new(None::)); + for _ in &fat_columns { + fat_values.push(Box::new(None::)); } } - let params: Vec<&dyn ToSql> = values.iter().map(|v| v.as_ref()).collect(); - stmt.execute(params_from_iter(params))?; + let fat_params: Vec<&dyn ToSql> = fat_values.iter().map(|v| v.as_ref()).collect(); + fat_stmt.execute(params_from_iter(fat_params))?; + + // Insert into thin table + let mut thin_values: Vec> = + vec![Box::new(doc_ids[i]), Box::new(content_id)]; + if let Value::Object(obj) = item { + for col in &thin_columns { + let value = obj.get(col.as_str()).unwrap_or(&Value::Null); + thin_values.push(json_to_sql(value)); + } + } else { + for _ in &thin_columns { + thin_values.push(Box::new(None::)); + } + } + let thin_params: Vec<&dyn ToSql> = thin_values.iter().map(|v| v.as_ref()).collect(); + thin_stmt.execute(params_from_iter(thin_params))?; } } txn.commit()?; - // Stamp the fast-delete layout so future deletes skip the migration check. - conn.execute_batch(&format!("PRAGMA user_version={}", METADATA_SCHEMA_V1))?; + conn.execute_batch(&format!("PRAGMA user_version={}", METADATA_SCHEMA_V2))?; Ok(metadata.len()) } @@ -1153,6 +1352,10 @@ pub fn update(index_path: &str, metadata: &[Value], doc_ids: &[i64]) -> Result = Vec::new(); { @@ -1259,6 +1462,170 @@ pub fn update(index_path: &str, metadata: &[Value], doc_ids: &[i64]) -> Result Result { + // Gather existing columns from both tables (excluding internal columns). + let mut thin_existing: Vec = Vec::new(); + { + let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != SUBSET_COLUMN && col != CONTENT_ID_COLUMN { + thin_existing.push(col); + } + } + } + let mut fat_existing: Vec = Vec::new(); + { + let mut stmt = conn.prepare(&format!("PRAGMA table_info({})", CONTENT_TABLE))?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != CONTENT_ID_COLUMN { + fat_existing.push(col); + } + } + } + + let mut all_existing: HashSet<&str> = HashSet::new(); + for c in &thin_existing { + all_existing.insert(c.as_str()); + } + for c in &fat_existing { + all_existing.insert(c.as_str()); + } + + // Discover new columns and ALTER TABLE as needed. + let mut column_types: HashMap = HashMap::new(); + let mut new_thin: Vec = Vec::new(); + let mut new_fat: Vec = Vec::new(); + + for item in metadata { + if let Value::Object(obj) = item { + for (key, value) in obj { + if !all_existing.contains(key.as_str()) + && !new_thin.contains(key) + && !new_fat.contains(key) + { + if !is_valid_column_name(key) { + return Err(Error::Filtering(format!( + "Invalid column name '{}'. Column names must start with a letter or \ + underscore, followed by letters, digits, or underscores.", + key + ))); + } + if is_thin_column(key) { + new_thin.push(key.clone()); + } else { + new_fat.push(key.clone()); + } + } + if !value.is_null() && !column_types.contains_key(key) { + column_types.insert(key.clone(), infer_sql_type(value)); + } + } + } + } + + let txn = conn.transaction()?; + for col in &new_thin { + let sql_type = column_types.get(col).copied().unwrap_or("TEXT"); + txn.execute( + &format!("ALTER TABLE METADATA ADD COLUMN \"{}\" {}", col, sql_type), + [], + )?; + } + for col in &new_fat { + let sql_type = column_types.get(col).copied().unwrap_or("TEXT"); + txn.execute( + &format!( + "ALTER TABLE {} ADD COLUMN \"{}\" {}", + CONTENT_TABLE, col, sql_type + ), + [], + )?; + } + + let all_thin: Vec = thin_existing.into_iter().chain(new_thin).collect(); + let all_fat: Vec = fat_existing.into_iter().chain(new_fat).collect(); + + // Next content ID + let next_content_id: i64 = txn + .query_row( + &format!( + "SELECT COALESCE(MAX(\"{}\"), -1) + 1 FROM {}", + CONTENT_ID_COLUMN, CONTENT_TABLE + ), + [], + |row| row.get(0), + ) + .unwrap_or(0); + + // Prepare fat INSERT + let mut fat_col_names = vec![format!("\"{}\"", CONTENT_ID_COLUMN)]; + fat_col_names.extend(all_fat.iter().map(|c| format!("\"{}\"", c))); + let fat_placeholders: Vec<&str> = std::iter::repeat_n("?", all_fat.len() + 1).collect(); + let fat_insert_sql = format!( + "INSERT INTO {} ({}) VALUES ({})", + CONTENT_TABLE, + fat_col_names.join(", "), + fat_placeholders.join(", ") + ); + + // Prepare thin INSERT + let mut thin_col_names = vec![ + format!("\"{}\"", SUBSET_COLUMN), + format!("\"{}\"", CONTENT_ID_COLUMN), + ]; + thin_col_names.extend(all_thin.iter().map(|c| format!("\"{}\"", c))); + let thin_placeholders: Vec<&str> = std::iter::repeat_n("?", all_thin.len() + 2).collect(); + let thin_insert_sql = format!( + "INSERT INTO METADATA ({}) VALUES ({})", + thin_col_names.join(", "), + thin_placeholders.join(", ") + ); + + { + let mut fat_stmt = txn.prepare(&fat_insert_sql)?; + let mut thin_stmt = txn.prepare(&thin_insert_sql)?; + + for (i, item) in metadata.iter().enumerate() { + let content_id = next_content_id + i as i64; + + let mut fat_values: Vec> = vec![Box::new(content_id)]; + if let Value::Object(obj) = item { + for col in &all_fat { + let value = obj.get(col).unwrap_or(&Value::Null); + fat_values.push(json_to_sql(value)); + } + } else { + for _ in &all_fat { + fat_values.push(Box::new(None::)); + } + } + let fat_params: Vec<&dyn ToSql> = fat_values.iter().map(|v| v.as_ref()).collect(); + fat_stmt.execute(params_from_iter(fat_params))?; + + let mut thin_values: Vec> = + vec![Box::new(doc_ids[i]), Box::new(content_id)]; + if let Value::Object(obj) = item { + for col in &all_thin { + let value = obj.get(col).unwrap_or(&Value::Null); + thin_values.push(json_to_sql(value)); + } + } else { + for _ in &all_thin { + thin_values.push(Box::new(None::)); + } + } + let thin_params: Vec<&dyn ToSql> = thin_values.iter().map(|v| v.as_ref()).collect(); + thin_stmt.execute(params_from_iter(thin_params))?; + } + } + txn.commit()?; + Ok(metadata.len()) +} + /// Delete rows by subset IDs and re-index the _subset_ column to be sequential. /// /// After deletion, remaining documents are re-indexed to maintain sequential @@ -1288,6 +1655,10 @@ pub fn delete(index_path: &str, subset: &[i64]) -> Result { let conn = open_db_write(&db_path)?; + if is_split_schema(&conn) { + return delete_v2(&conn, subset); + } + // Migrate a legacy (v0) index to the fast-delete layout on the first delete, so // the re-sequencing below updates a plain indexed column instead of relocating // each fat row. No-op once the index is v1. @@ -1324,12 +1695,113 @@ pub fn delete(index_path: &str, subset: &[i64]) -> Result { crate::text_search::drop_temp_table(&conn, name); } - // Re-sequence _subset_ IDs to be contiguous 0-based. - // Instead of copying the entire table (expensive for large tables with TEXT/BLOB - // columns), use range-based UPDATEs. Because `_subset_` is now a regular column - // (not the rowid), each UPDATE rewrites only the small integer value and its - // index entry — the fat row stays put. Process gaps in ascending order so - // decremented values never collide. + // Re-sequence _subset_ IDs to be contiguous 0-based. + // Instead of copying the entire table (expensive for large tables with TEXT/BLOB + // columns), use range-based UPDATEs. Because `_subset_` is now a regular column + // (not the rowid), each UPDATE rewrites only the small integer value and its + // index entry - the fat row stays put. Process gaps in ascending order so + // decremented values never collide. + let mut sorted_ids: Vec = subset.to_vec(); + sorted_ids.sort_unstable(); + sorted_ids.dedup(); + sorted_ids.retain(|&id| id >= 0 && id < original_count); + + let max_id: i64 = conn + .query_row( + &format!( + "SELECT COALESCE(MAX(\"{}\"), -1) FROM METADATA", + SUBSET_COLUMN + ), + [], + |row| row.get(0), + ) + .unwrap_or(-1); + + if max_id >= 0 && !sorted_ids.is_empty() { + // For each gap left by deleted rows, shift all subsequent rows down. + // Merge into contiguous ranges: if IDs 5,6,7 were deleted, rows >= 8 + // shift down by 3. We process from lowest gap upward. + // + // Build (range_start, range_end, shift) tuples. Consecutive deleted IDs + // form a single gap; the rows between two gaps all need the same shift + // (equal to the number of deleted IDs to their left). + let mut updates: Vec<(i64, i64, i64)> = Vec::new(); + let mut i = 0; + while i < sorted_ids.len() { + // Advance past consecutive deleted IDs + let mut j = i + 1; + while j < sorted_ids.len() && sorted_ids[j] == sorted_ids[j - 1] + 1 { + j += 1; + } + // Rows from sorted_ids[j-1]+1 up to (but not including) the next + // deleted ID need to shift down by j (total deletions so far). + let range_start = sorted_ids[j - 1] + 1; + let range_end = if j < sorted_ids.len() { + sorted_ids[j] + } else { + max_id + sorted_ids.len() as i64 + 1 + }; + if range_start < range_end { + updates.push((range_start, range_end, j as i64)); + } + i = j; + } + + for (from, to_excl, shift) in &updates { + conn.execute( + &format!( + "UPDATE METADATA SET \"{}\" = \"{}\" - ?1 WHERE \"{}\" >= ?2 AND \"{}\" < ?3", + SUBSET_COLUMN, SUBSET_COLUMN, SUBSET_COLUMN, SUBSET_COLUMN + ), + rusqlite::params![shift, from, to_excl], + )?; + } + } + + // Commit transaction + conn.execute("COMMIT", [])?; + + Ok(deleted) +} + +fn delete_v2(conn: &Connection, subset: &[i64]) -> Result { + conn.execute("BEGIN", [])?; + + let original_count: i64 = conn + .query_row( + &format!( + "SELECT COALESCE(MAX(\"{}\"), -1) FROM METADATA", + SUBSET_COLUMN + ), + [], + |row| row.get::<_, i64>(0), + ) + .unwrap_or(-1) + + 1; + + // Delete from thin table + let (in_clause, in_params, temp_table) = crate::text_search::build_in_clause(conn, subset)?; + let delete_sql = format!( + "DELETE FROM METADATA WHERE \"{}\" {}", + SUBSET_COLUMN, in_clause + ); + let param_refs: Vec<&dyn ToSql> = in_params.iter().map(|v| v.as_ref()).collect(); + let deleted = conn.execute(&delete_sql, params_from_iter(param_refs))?; + if let Some(ref name) = temp_table { + crate::text_search::drop_temp_table(conn, name); + } + + // Delete orphaned content rows + conn.execute( + &format!( + "DELETE FROM {} WHERE \"{}\" NOT IN (SELECT \"{}\" FROM METADATA)", + CONTENT_TABLE, CONTENT_ID_COLUMN, CONTENT_ID_COLUMN + ), + [], + )?; + + // Re-sequence _subset_ on the thin table (same algorithm as v1, but now + // the table is small so this is always fast). let mut sorted_ids: Vec = subset.to_vec(); sorted_ids.sort_unstable(); sorted_ids.dedup(); @@ -1347,23 +1819,13 @@ pub fn delete(index_path: &str, subset: &[i64]) -> Result { .unwrap_or(-1); if max_id >= 0 && !sorted_ids.is_empty() { - // For each gap left by deleted rows, shift all subsequent rows down. - // Merge into contiguous ranges: if IDs 5,6,7 were deleted, rows >= 8 - // shift down by 3. We process from lowest gap upward. - // - // Build (range_start, range_end, shift) tuples. Consecutive deleted IDs - // form a single gap; the rows between two gaps all need the same shift - // (equal to the number of deleted IDs to their left). let mut updates: Vec<(i64, i64, i64)> = Vec::new(); let mut i = 0; while i < sorted_ids.len() { - // Advance past consecutive deleted IDs let mut j = i + 1; while j < sorted_ids.len() && sorted_ids[j] == sorted_ids[j - 1] + 1 { j += 1; } - // Rows from sorted_ids[j-1]+1 up to (but not including) the next - // deleted ID need to shift down by j (total deletions so far). let range_start = sorted_ids[j - 1] + 1; let range_end = if j < sorted_ids.len() { sorted_ids[j] @@ -1387,9 +1849,7 @@ pub fn delete(index_path: &str, subset: &[i64]) -> Result { } } - // Commit transaction conn.execute("COMMIT", [])?; - Ok(deleted) } @@ -1435,10 +1895,17 @@ pub fn where_condition( let valid_columns = get_schema_columns(conn)?; validate_condition(condition, &valid_columns)?; - let query = format!( - "SELECT \"{}\" FROM METADATA WHERE {}", - SUBSET_COLUMN, condition - ); + let query = if is_split_schema(conn) && condition_references_fat_column(condition, conn) { + format!( + "SELECT M.\"{}\" FROM METADATA M JOIN {} C ON M.\"{}\" = C.\"{}\" WHERE {}", + SUBSET_COLUMN, CONTENT_TABLE, CONTENT_ID_COLUMN, CONTENT_ID_COLUMN, condition + ) + } else { + format!( + "SELECT \"{}\" FROM METADATA WHERE {}", + SUBSET_COLUMN, condition + ) + }; let params: Vec> = parameters.iter().map(json_to_sql).collect(); let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect(); @@ -1455,6 +1922,24 @@ pub fn where_condition( }) } +/// Check if a SQL condition references any fat (METADATA_CONTENT) column. +fn condition_references_fat_column(condition: &str, conn: &Connection) -> bool { + let mut fat_columns: Vec = Vec::new(); + if let Ok(mut stmt) = conn.prepare(&format!("PRAGMA table_info({})", CONTENT_TABLE)) { + if let Ok(rows) = stmt.query_map([], |row| row.get::<_, String>(1)) { + for row in rows.flatten() { + if row != CONTENT_ID_COLUMN { + fat_columns.push(row); + } + } + } + } + let cond_upper = condition.to_uppercase(); + fat_columns + .iter() + .any(|col| cond_upper.contains(&col.to_uppercase())) +} + /// Query document IDs with REGEXP support enabled. /// /// This function is similar to `where_condition` but registers a REGEXP @@ -1546,10 +2031,17 @@ pub fn where_condition_regexp( }, )?; - let query = format!( - "SELECT \"{}\" FROM METADATA WHERE {}", - SUBSET_COLUMN, condition - ); + let query = if is_split_schema(conn) && condition_references_fat_column(condition, conn) { + format!( + "SELECT M.\"{}\" FROM METADATA M JOIN {} C ON M.\"{}\" = C.\"{}\" WHERE {}", + SUBSET_COLUMN, CONTENT_TABLE, CONTENT_ID_COLUMN, CONTENT_ID_COLUMN, condition + ) + } else { + format!( + "SELECT \"{}\" FROM METADATA WHERE {}", + SUBSET_COLUMN, condition + ) + }; let params: Vec> = parameters.iter().map(json_to_sql).collect(); let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect(); @@ -1604,10 +2096,17 @@ pub fn get_distinct_strings(index_path: &str, column: &str) -> Result>(0))?; @@ -1665,6 +2164,10 @@ pub fn get( validate_condition(cond, &valid_columns)?; } + if is_split_schema(conn) { + return get_v2(conn, condition, parameters, subset); + } + // Get column names let mut columns: Vec = Vec::new(); { @@ -1746,6 +2249,128 @@ pub fn get( }) } +fn get_v2( + conn: &Connection, + condition: Option<&str>, + parameters: &[Value], + subset: Option<&[i64]>, +) -> Result> { + // Build the column list for the JOIN query, excluding _content_id_. + let mut thin_cols: Vec = Vec::new(); + { + let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != CONTENT_ID_COLUMN { + thin_cols.push(col); + } + } + } + let mut fat_cols: Vec = Vec::new(); + { + let mut stmt = conn.prepare(&format!("PRAGMA table_info({})", CONTENT_TABLE))?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != CONTENT_ID_COLUMN { + fat_cols.push(col); + } + } + } + + // Build SELECT columns: M.thin_col, ..., C.fat_col, ... + let mut select_parts: Vec = Vec::new(); + for col in &thin_cols { + select_parts.push(format!("M.\"{}\"", col)); + } + for col in &fat_cols { + select_parts.push(format!("C.\"{}\"", col)); + } + let select_clause = select_parts.join(", "); + + let from_clause = format!( + "METADATA M JOIN {} C ON M.\"{}\" = C.\"{}\"", + CONTENT_TABLE, CONTENT_ID_COLUMN, CONTENT_ID_COLUMN + ); + + // Merged column names for result construction + let all_cols: Vec<&String> = thin_cols.iter().chain(fat_cols.iter()).collect(); + + if let Some(ids) = subset { + if ids.is_empty() { + return Ok(Vec::new()); + } + + let mut results: Vec = Vec::new(); + for chunk in ids.chunks(SQLITE_PARAM_LIMIT) { + let placeholders: Vec<&str> = std::iter::repeat_n("?", chunk.len()).collect(); + let query = format!( + "SELECT {} FROM {} WHERE M.\"{}\" IN ({})", + select_clause, + from_clause, + SUBSET_COLUMN, + placeholders.join(", ") + ); + let params: Vec> = chunk + .iter() + .map(|&id| Box::new(id) as Box) + .collect(); + let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect(); + let mut stmt = conn.prepare(&query)?; + let mut rows = stmt.query(params_from_iter(param_refs))?; + + while let Some(row) = rows.next()? { + let mut obj = serde_json::Map::new(); + for (i, col) in all_cols.iter().enumerate() { + let value = row_to_json_value(row, i)?; + obj.insert((*col).clone(), value); + } + results.push(Value::Object(obj)); + } + } + + let mut results_map: HashMap = HashMap::new(); + for result in results { + if let Some(id) = result.get(SUBSET_COLUMN).and_then(|v| v.as_i64()) { + results_map.insert(id, result); + } + } + return Ok(ids.iter().filter_map(|id| results_map.remove(id)).collect()); + } + + let (query, params): (String, Vec>) = if let Some(cond) = condition { + let query = format!( + "SELECT {} FROM {} WHERE {} ORDER BY M.\"{}\"", + select_clause, from_clause, cond, SUBSET_COLUMN + ); + let params = parameters.iter().map(json_to_sql).collect(); + (query, params) + } else { + let query = format!( + "SELECT {} FROM {} ORDER BY M.\"{}\"", + select_clause, from_clause, SUBSET_COLUMN + ); + (query, Vec::new()) + }; + + let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect(); + let mut stmt = conn.prepare(&query)?; + let mut rows = stmt.query(params_from_iter(param_refs))?; + + let mut results: Vec = Vec::new(); + while let Some(row) = rows.next()? { + let mut obj = serde_json::Map::new(); + for (i, col) in all_cols.iter().enumerate() { + let value = row_to_json_value(row, i)?; + obj.insert((*col).clone(), value); + } + results.push(Value::Object(obj)); + } + + Ok(results) +} + /// Helper to convert a rusqlite row column to JSON value. fn row_to_json_value(row: &rusqlite::Row, idx: usize) -> SqliteResult { // Try to get the value in order of most likely types @@ -1884,14 +2509,20 @@ pub fn update_where( } } - // Keep the FTS mirror in sync with metadata updates by recording affected rows - // before executing the UPDATE. This stays on the generic path so any caller that - // updates searchable fields gets consistent search results. + // Find affected rows before the update (for FTS sync). + let affected_query = + if is_split_schema(&conn) && condition_references_fat_column(condition, &conn) { + format!( + "SELECT M.\"{}\" FROM METADATA M JOIN {} C ON M.\"{}\" = C.\"{}\" WHERE {}", + SUBSET_COLUMN, CONTENT_TABLE, CONTENT_ID_COLUMN, CONTENT_ID_COLUMN, condition + ) + } else { + format!( + "SELECT \"{}\" FROM METADATA WHERE {}", + SUBSET_COLUMN, condition + ) + }; let affected_ids: Vec = { - let affected_query = format!( - "SELECT \"{}\" FROM METADATA WHERE {}", - SUBSET_COLUMN, condition - ); let cond_params: Vec> = parameters.iter().map(json_to_sql).collect(); let cond_param_refs: Vec<&dyn ToSql> = cond_params.iter().map(|v| v.as_ref()).collect(); @@ -1902,6 +2533,17 @@ pub fn update_where( rows.filter_map(|row| row.ok()).collect() }; + if is_split_schema(&conn) { + return update_where_v2( + &conn, + index_path, + updates_obj, + &affected_ids, + condition, + parameters, + ); + } + // Build SET clause let set_parts: Vec = updates_obj .keys() @@ -1927,6 +2569,112 @@ pub fn update_where( Ok(updated) } +fn update_where_v2( + conn: &Connection, + index_path: &str, + updates_obj: &serde_json::Map, + affected_ids: &[i64], + _condition: &str, + _parameters: &[Value], +) -> Result { + if affected_ids.is_empty() { + return Ok(0); + } + + // Split updates into thin and fat columns. + let thin_updates: Vec<(&String, &Value)> = updates_obj + .iter() + .filter(|(k, _)| is_thin_column(k)) + .collect(); + let fat_updates: Vec<(&String, &Value)> = updates_obj + .iter() + .filter(|(k, _)| !is_thin_column(k)) + .collect(); + + conn.execute("BEGIN", [])?; + + let mut total_updated = 0usize; + + // Update thin table + if !thin_updates.is_empty() { + let set_parts: Vec = thin_updates + .iter() + .map(|(col, _)| format!("\"{}\" = ?", col)) + .collect(); + let set_clause = set_parts.join(", "); + + for chunk in affected_ids.chunks(SQLITE_PARAM_LIMIT) { + let placeholders: Vec<&str> = std::iter::repeat_n("?", chunk.len()).collect(); + let query = format!( + "UPDATE METADATA SET {} WHERE \"{}\" IN ({})", + set_clause, + SUBSET_COLUMN, + placeholders.join(", ") + ); + let mut params: Vec> = + thin_updates.iter().map(|(_, v)| json_to_sql(v)).collect(); + params.extend(chunk.iter().map(|&id| Box::new(id) as Box)); + let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect(); + total_updated += conn.execute(&query, params_from_iter(param_refs))?; + } + } + + // Update fat table + if !fat_updates.is_empty() { + let set_parts: Vec = fat_updates + .iter() + .map(|(col, _)| format!("\"{}\" = ?", col)) + .collect(); + let set_clause = set_parts.join(", "); + + // Get content IDs for affected rows + for chunk in affected_ids.chunks(SQLITE_PARAM_LIMIT) { + let placeholders: Vec<&str> = std::iter::repeat_n("?", chunk.len()).collect(); + let content_ids_query = format!( + "SELECT \"{}\" FROM METADATA WHERE \"{}\" IN ({})", + CONTENT_ID_COLUMN, + SUBSET_COLUMN, + placeholders.join(", ") + ); + let id_params: Vec> = chunk + .iter() + .map(|&id| Box::new(id) as Box) + .collect(); + let id_param_refs: Vec<&dyn ToSql> = id_params.iter().map(|v| v.as_ref()).collect(); + let mut stmt = conn.prepare(&content_ids_query)?; + let content_ids: Vec = stmt + .query_map(params_from_iter(id_param_refs), |row| row.get::<_, i64>(0))? + .filter_map(|r| r.ok()) + .collect(); + + if !content_ids.is_empty() { + let c_placeholders: Vec<&str> = + std::iter::repeat_n("?", content_ids.len()).collect(); + let query = format!( + "UPDATE {} SET {} WHERE \"{}\" IN ({})", + CONTENT_TABLE, + set_clause, + CONTENT_ID_COLUMN, + c_placeholders.join(", ") + ); + let mut params: Vec> = + fat_updates.iter().map(|(_, v)| json_to_sql(v)).collect(); + params.extend(content_ids.iter().map(|&id| Box::new(id) as Box)); + let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect(); + total_updated += conn.execute(&query, params_from_iter(param_refs))?; + } + } + } + + conn.execute("COMMIT", [])?; + + if total_updated > 0 && !affected_ids.is_empty() { + crate::text_search::update_rows(index_path, affected_ids)?; + } + + Ok(affected_ids.len()) +} + /// Get the number of documents in the metadata database. pub fn count(index_path: &str) -> Result { let db_path = get_db_path(index_path); @@ -2766,7 +3514,7 @@ mod tests { } #[test] - fn test_create_uses_v1_layout() { + fn test_create_uses_v2_layout() { let dir = setup_test_dir(); let path = dir.path().to_str().unwrap(); let meta: Vec = (0..5) @@ -2774,17 +3522,28 @@ mod tests { .collect(); create(path, &meta, &(0..5).collect::>()).unwrap(); - assert_eq!(user_version_of(path), 1); - assert!(!subset_is_pk(path), "v1: _subset_ must not be the PK/rowid"); + assert_eq!(user_version_of(path), 2); + assert!(!subset_is_pk(path), "v2: _subset_ must not be the PK/rowid"); - // Forward-compat guarantee: SELECT * exposes exactly _subset_ + user - // columns and leaks no internal column, so older binaries read it unchanged. + // Forward-compat: get() exposes _subset_ + user columns, hides _content_id_. let cols = select_star_columns(path); assert!(cols.iter().any(|c| c == SUBSET_COLUMN)); assert!(cols.iter().any(|c| c == "file") && cols.iter().any(|c| c == "code")); - assert!(!cols - .iter() - .any(|c| c == "rowid" || c.starts_with("_METADATA"))); + assert!(!cols.iter().any(|c| c == CONTENT_ID_COLUMN)); + + // Verify both tables exist + let c = meta_db(path); + let has_content: i64 = c + .query_row( + &format!( + "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='{}'", + CONTENT_TABLE + ), + [], + |row| row.get(0), + ) + .unwrap(); + assert_eq!(has_content, 1); } #[test] @@ -2868,4 +3627,206 @@ mod tests { assert_eq!(rows[3]["file"].as_str().unwrap(), "f4.rs"); assert_eq!(rows[3]["code"].as_str().unwrap(), "c4"); } + + // ---- v2 split layout tests ---- + + #[test] + fn test_v2_delete_resequences_thin_only() { + let dir = setup_test_dir(); + let path = dir.path().to_str().unwrap(); + let meta: Vec = (0..10) + .map(|i| { + json!({ + "file": format!("f{i}.rs"), + "code": format!("fn func_{i}() {{}}"), + }) + }) + .collect(); + create(path, &meta, &(0..10).collect::>()).unwrap(); + assert_eq!(user_version_of(path), 2); + + // Verify content IDs before delete + let c = meta_db(path); + let content_count_before: i64 = c + .query_row( + &format!("SELECT COUNT(*) FROM {}", CONTENT_TABLE), + [], + |row| row.get(0), + ) + .unwrap(); + assert_eq!(content_count_before, 10); + + // Delete rows 2, 5, 7 + assert_eq!(delete(path, &[2, 5, 7]).unwrap(), 3); + + // Thin table re-sequenced + let rows = get(path, None, &[], None).unwrap(); + assert_eq!(rows.len(), 7); + for (i, row) in rows.iter().enumerate() { + assert_eq!(row[SUBSET_COLUMN].as_i64().unwrap(), i as i64); + } + // Survivors in correct order + assert_eq!(rows[0]["file"].as_str().unwrap(), "f0.rs"); + assert_eq!(rows[2]["file"].as_str().unwrap(), "f3.rs"); + + // Content table had orphans removed + let c = meta_db(path); + let content_count_after: i64 = c + .query_row( + &format!("SELECT COUNT(*) FROM {}", CONTENT_TABLE), + [], + |row| row.get(0), + ) + .unwrap(); + assert_eq!(content_count_after, 7); + + // Content IDs are stable (not re-sequenced) - the remaining content_ids + // should be a subset of 0..10, not necessarily 0..7. + let max_content_id: i64 = c + .query_row( + &format!( + "SELECT MAX(\"{}\") FROM {}", + CONTENT_ID_COLUMN, CONTENT_TABLE + ), + [], + |row| row.get(0), + ) + .unwrap(); + assert!(max_content_id >= 7, "content IDs are stable, not compacted"); + } + + #[test] + fn test_v2_get_returns_all_columns() { + let dir = setup_test_dir(); + let path = dir.path().to_str().unwrap(); + let meta = vec![ + json!({ + "file": "src/main.rs", + "name": "main", + "line": 1, + "code": "fn main() { println!(\"hello\"); }", + "signature": "fn main()", + }), + json!({ + "file": "src/lib.rs", + "name": "helper", + "line": 10, + "code": "fn helper() -> i32 { 42 }", + "signature": "fn helper() -> i32", + }), + ]; + create(path, &meta, &[0, 1]).unwrap(); + + let rows = get(path, None, &[], None).unwrap(); + assert_eq!(rows.len(), 2); + + // Thin columns present + assert_eq!(rows[0]["file"], "src/main.rs"); + assert_eq!(rows[0]["name"], "main"); + assert_eq!(rows[0]["line"], 1); + + // Fat columns present + assert_eq!(rows[0]["code"], "fn main() { println!(\"hello\"); }"); + assert_eq!(rows[0]["signature"], "fn main()"); + + // _content_id_ hidden + assert!(rows[0].get(CONTENT_ID_COLUMN).is_none()); + + // _subset_ present + assert_eq!(rows[0][SUBSET_COLUMN], 0); + assert_eq!(rows[1][SUBSET_COLUMN], 1); + } + + #[test] + fn test_v2_where_condition_on_thin_column() { + let dir = setup_test_dir(); + let path = dir.path().to_str().unwrap(); + let meta = vec![ + json!({"file": "src/a.rs", "name": "foo", "code": "fn foo() {}"}), + json!({"file": "src/b.rs", "name": "bar", "code": "fn bar() {}"}), + json!({"file": "src/a.rs", "name": "baz", "code": "fn baz() {}"}), + ]; + create(path, &meta, &[0, 1, 2]).unwrap(); + + let ids = where_condition(path, "file = ?", &[json!("src/a.rs")]).unwrap(); + assert_eq!(ids, vec![0, 2]); + } + + #[test] + fn test_v2_where_condition_regexp_on_fat_column() { + let dir = setup_test_dir(); + let path = dir.path().to_str().unwrap(); + let meta = vec![ + json!({"file": "a.rs", "code": "fn alpha() { let x = async_fn().await; }"}), + json!({"file": "b.rs", "code": "fn beta() { println!(\"hi\"); }"}), + json!({"file": "c.rs", "code": "fn gamma() { let y = tokio::spawn(async {}); }"}), + ]; + create(path, &meta, &[0, 1, 2]).unwrap(); + + let ids = where_condition_regexp(path, "code REGEXP ?", &[json!("async")]).unwrap(); + let mut ids_sorted = ids.clone(); + ids_sorted.sort(); + assert_eq!(ids_sorted, vec![0, 2]); + } + + #[test] + fn test_v1_index_still_works() { + let dir = setup_test_dir(); + let path = dir.path().to_str().unwrap(); + + // Hand-build a v1 index (no METADATA_CONTENT table). + { + let c = meta_db(path); + c.execute_batch(&format!("PRAGMA user_version={};", METADATA_SCHEMA_V1)) + .unwrap(); + c.execute( + &format!( + "CREATE TABLE METADATA (\"{}\" INTEGER NOT NULL, file TEXT, code TEXT)", + SUBSET_COLUMN + ), + [], + ) + .unwrap(); + c.execute( + &format!( + "CREATE INDEX \"{}\" ON METADATA (\"{}\")", + SUBSET_INDEX_NAME, SUBSET_COLUMN + ), + [], + ) + .unwrap(); + for i in 0..5i64 { + c.execute( + "INSERT INTO METADATA VALUES (?, ?, ?)", + rusqlite::params![i, format!("f{i}.rs"), format!("c{i}")], + ) + .unwrap(); + } + } + + assert_eq!(user_version_of(path), 1); + assert_eq!(count(path).unwrap(), 5); + + // get works + let rows = get(path, None, &[], None).unwrap(); + assert_eq!(rows.len(), 5); + assert_eq!(rows[0]["file"], "f0.rs"); + assert_eq!(rows[0]["code"], "c0"); + + // where_condition works + let ids = where_condition(path, "file = ?", &[json!("f2.rs")]).unwrap(); + assert_eq!(ids, vec![2]); + + // delete works (uses v1 path) + assert_eq!(delete(path, &[1]).unwrap(), 1); + assert_eq!(count(path).unwrap(), 4); + let rows = get(path, None, &[], None).unwrap(); + assert_eq!(rows[1]["file"], "f2.rs"); + assert_eq!(rows[1][SUBSET_COLUMN], 1); + + // update works (v1 path) + let new_meta = vec![json!({"file": "f5.rs", "code": "c5"})]; + update(path, &new_meta, &[4]).unwrap(); + assert_eq!(count(path).unwrap(), 5); + } } diff --git a/next-plaid/src/text_search.rs b/next-plaid/src/text_search.rs index e1c3756..442e731 100644 --- a/next-plaid/src/text_search.rs +++ b/next-plaid/src/text_search.rs @@ -32,7 +32,9 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use crate::error::{Error, Result}; -use crate::filtering::{get_db_path, SUBSET_COLUMN}; +use crate::filtering::{ + get_db_path, is_split_schema, CONTENT_ID_COLUMN, CONTENT_TABLE, SUBSET_COLUMN, +}; use crate::search::QueryResult; /// FTS5 virtual table name. @@ -599,18 +601,8 @@ pub fn update_rows(index_path: &str, doc_ids: &[i64]) -> Result<()> { return Ok(()); } - // Get METADATA column names (to rebuild text from current row) - let mut columns: Vec = Vec::new(); - { - let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; - let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; - for row in rows { - let col = row?; - if col != SUBSET_COLUMN { - columns.push(col); - } - } - } + // Get all user-visible column names and build the per-row SELECT. + let (columns, meta_select_sql) = fts_update_row_select(&conn)?; // Prepare statements let read_old_sql = format!( @@ -630,21 +622,6 @@ pub fn update_rows(index_path: &str, doc_ids: &[i64]) -> Result<()> { FTS_TABLE, FTS_CONTENT_COLUMN ); - let col_refs: Vec = columns.iter().map(|c| format!("\"{}\"", c)).collect(); - let meta_select_sql = if columns.is_empty() { - format!( - "SELECT \"{}\" FROM METADATA WHERE \"{}\" = ?", - SUBSET_COLUMN, SUBSET_COLUMN - ) - } else { - format!( - "SELECT \"{}\", {} FROM METADATA WHERE \"{}\" = ?", - SUBSET_COLUMN, - col_refs.join(", "), - SUBSET_COLUMN - ) - }; - conn.execute_batch("BEGIN") .map_err(|e| Error::Filtering(format!("Failed to begin transaction: {}", e)))?; @@ -704,6 +681,155 @@ pub fn update_rows(index_path: &str, doc_ids: &[i64]) -> Result<()> { Ok(()) } +/// Build the column list and SELECT statement for populating FTS from metadata. +/// For v2 (split layout), JOINs METADATA and METADATA_CONTENT. +/// Returns (user_visible_columns, select_sql) where select_sql has _subset_ as col 0. +fn fts_rebuild_select(conn: &Connection) -> Result<(Vec, String)> { + if is_split_schema(conn) { + let mut thin_cols: Vec = Vec::new(); + { + let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != SUBSET_COLUMN && col != CONTENT_ID_COLUMN { + thin_cols.push(col); + } + } + } + let mut fat_cols: Vec = Vec::new(); + { + let mut stmt = conn.prepare(&format!("PRAGMA table_info({})", CONTENT_TABLE))?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != CONTENT_ID_COLUMN { + fat_cols.push(col); + } + } + } + + let mut select_parts = vec![format!("M.\"{}\"", SUBSET_COLUMN)]; + let mut columns = Vec::new(); + for col in &thin_cols { + select_parts.push(format!("M.\"{}\"", col)); + columns.push(col.clone()); + } + for col in &fat_cols { + select_parts.push(format!("C.\"{}\"", col)); + columns.push(col.clone()); + } + + let select_sql = format!( + "SELECT {} FROM METADATA M JOIN {} C ON M.\"{}\" = C.\"{}\" ORDER BY M.\"{}\"", + select_parts.join(", "), + CONTENT_TABLE, + CONTENT_ID_COLUMN, + CONTENT_ID_COLUMN, + SUBSET_COLUMN + ); + Ok((columns, select_sql)) + } else { + let mut columns: Vec = Vec::new(); + { + let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != SUBSET_COLUMN { + columns.push(col); + } + } + } + let col_refs: Vec = columns.iter().map(|c| format!("\"{}\"", c)).collect(); + let select_sql = format!( + "SELECT \"{}\", {} FROM METADATA ORDER BY \"{}\"", + SUBSET_COLUMN, + col_refs.join(", "), + SUBSET_COLUMN + ); + Ok((columns, select_sql)) + } +} + +/// Build the column list and per-row SELECT for update_rows. +/// For v2, JOINs METADATA and METADATA_CONTENT with a WHERE on _subset_. +/// Returns (user_visible_columns, select_sql_with_trailing_where_placeholder). +fn fts_update_row_select(conn: &Connection) -> Result<(Vec, String)> { + if is_split_schema(conn) { + let mut thin_cols: Vec = Vec::new(); + { + let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != SUBSET_COLUMN && col != CONTENT_ID_COLUMN { + thin_cols.push(col); + } + } + } + let mut fat_cols: Vec = Vec::new(); + { + let mut stmt = conn.prepare(&format!("PRAGMA table_info({})", CONTENT_TABLE))?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != CONTENT_ID_COLUMN { + fat_cols.push(col); + } + } + } + + let mut select_parts = vec![format!("M.\"{}\"", SUBSET_COLUMN)]; + let mut columns = Vec::new(); + for col in &thin_cols { + select_parts.push(format!("M.\"{}\"", col)); + columns.push(col.clone()); + } + for col in &fat_cols { + select_parts.push(format!("C.\"{}\"", col)); + columns.push(col.clone()); + } + + let select_sql = format!( + "SELECT {} FROM METADATA M JOIN {} C ON M.\"{}\" = C.\"{}\" WHERE M.\"{}\" = ?", + select_parts.join(", "), + CONTENT_TABLE, + CONTENT_ID_COLUMN, + CONTENT_ID_COLUMN, + SUBSET_COLUMN + ); + Ok((columns, select_sql)) + } else { + let mut columns: Vec = Vec::new(); + { + let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; + let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; + for row in rows { + let col = row?; + if col != SUBSET_COLUMN { + columns.push(col); + } + } + } + let meta_select_sql = if columns.is_empty() { + format!( + "SELECT \"{}\" FROM METADATA WHERE \"{}\" = ?", + SUBSET_COLUMN, SUBSET_COLUMN + ) + } else { + let col_refs: Vec = columns.iter().map(|c| format!("\"{}\"", c)).collect(); + format!( + "SELECT \"{}\", {} FROM METADATA WHERE \"{}\" = ?", + SUBSET_COLUMN, + col_refs.join(", "), + SUBSET_COLUMN + ) + }; + Ok((columns, meta_select_sql)) + } +} + /// Rebuild the FTS5 index and content table after `_subset_` IDs have been /// re-indexed (e.g. after a delete in the METADATA table). /// @@ -749,18 +875,8 @@ pub fn rebuild(index_path: &str) -> Result<()> { // Recreate both tables (preserving the stored tokenizer) ensure_tables(&conn, &tokenizer)?; - // Get all column names except _subset_ - let mut columns: Vec = Vec::new(); - { - let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?; - let rows = stmt.query_map([], |row| row.get::<_, String>(1))?; - for row in rows { - let col = row?; - if col != SUBSET_COLUMN { - columns.push(col); - } - } - } + // Get all user-visible column names and build the SELECT for populating FTS. + let (columns, select_sql) = fts_rebuild_select(&conn)?; // Populate content table from METADATA if columns.is_empty() { @@ -771,13 +887,6 @@ pub fn rebuild(index_path: &str) -> Result<()> { conn.execute(&sql, []) .map_err(|e| Error::Filtering(format!("Failed to populate content table: {}", e)))?; } else { - let col_refs: Vec = columns.iter().map(|c| format!("\"{}\"", c)).collect(); - let select_sql = format!( - "SELECT \"{}\", {} FROM METADATA ORDER BY \"{}\"", - SUBSET_COLUMN, - col_refs.join(", "), - SUBSET_COLUMN - ); let mut select_stmt = conn.prepare(&select_sql)?; let mut rows = select_stmt.query([])?;