diff --git a/Cargo.lock b/Cargo.lock index f4df53efe5..9ad731a212 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2420,8 +2420,10 @@ dependencies = [ "gix-path", "gix-pathspec", "gix-worktree", + "hashbrown 0.16.1", "portable-atomic", "thiserror 2.0.18", + "windows-sys 0.61.2", ] [[package]] diff --git a/gix-index/src/entry/mode.rs b/gix-index/src/entry/mode.rs index 3b29548485..54874748f0 100644 --- a/gix-index/src/entry/mode.rs +++ b/gix-index/src/entry/mode.rs @@ -46,22 +46,44 @@ impl Mode { stat: &crate::fs::Metadata, has_symlinks: bool, executable_bit: bool, + ) -> Option { + self.change_to_match_fs_with_values( + stat.is_file(), + stat.is_dir(), + stat.is_symlink(), + stat.is_executable(), + has_symlinks, + executable_bit, + ) + } + + /// Like [`change_to_match_fs`](Self::change_to_match_fs) but accepts pre-extracted + /// file-type and permission bits, for callers that already have them (e.g. cached + /// metadata from a batched directory enumeration). + pub fn change_to_match_fs_with_values( + self, + is_file: bool, + is_dir: bool, + is_symlink: bool, + is_executable: bool, + has_symlinks: bool, + executable_bit: bool, ) -> Option { match self { - Mode::FILE if !stat.is_file() => (), - Mode::SYMLINK if stat.is_symlink() => return None, - Mode::SYMLINK if has_symlinks && !stat.is_symlink() => (), - Mode::SYMLINK if !has_symlinks && !stat.is_file() => (), - Mode::COMMIT | Mode::DIR if !stat.is_dir() => (), - Mode::FILE if executable_bit && stat.is_executable() => return Some(Change::ExecutableBit), - Mode::FILE_EXECUTABLE if executable_bit && !stat.is_executable() => return Some(Change::ExecutableBit), + Mode::FILE if !is_file => (), + Mode::SYMLINK if is_symlink => return None, + Mode::SYMLINK if has_symlinks && !is_symlink => (), + Mode::SYMLINK if !has_symlinks && !is_file => (), + Mode::COMMIT | Mode::DIR if !is_dir => (), + Mode::FILE if executable_bit && is_executable => return Some(Change::ExecutableBit), + Mode::FILE_EXECUTABLE if executable_bit && 
!is_executable => return Some(Change::ExecutableBit), _ => return None, } - let new_mode = if stat.is_dir() { + let new_mode = if is_dir { Mode::COMMIT - } else if executable_bit && stat.is_executable() { + } else if executable_bit && is_executable { Mode::FILE_EXECUTABLE - } else if has_symlinks && stat.is_symlink() { + } else if has_symlinks && is_symlink { Mode::SYMLINK } else { Mode::FILE diff --git a/gix-status/Cargo.toml b/gix-status/Cargo.toml index da89cb9284..f4c1566b42 100644 --- a/gix-status/Cargo.toml +++ b/gix-status/Cargo.toml @@ -38,12 +38,21 @@ gix-diff = { version = "^0.62.0", path = "../gix-diff", default-features = false thiserror = "2.0.18" filetime = "0.2.27" bstr = { version = "1.12.0", default-features = false } +hashbrown = "0.16.0" document-features = { version = "0.2.0", optional = true } [target.'cfg(not(target_has_atomic = "64"))'.dependencies] portable-atomic = "1" +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.61.1", features = [ + "Win32_Foundation", + "Win32_Storage_FileSystem", + # For SECURITY_ATTRIBUTES in CreateFileW (used by metadata_cache::prepare on Windows). + "Win32_Security", +] } + [dev-dependencies] gix-hash = { path = "../gix-hash", features = ["sha1"] } diff --git a/gix-status/src/index_as_worktree/function.rs b/gix-status/src/index_as_worktree/function.rs index 8c2e48b11e..4a6b4a9133 100644 --- a/gix-status/src/index_as_worktree/function.rs +++ b/gix-status/src/index_as_worktree/function.rs @@ -21,6 +21,67 @@ use crate::{ }, is_dir_to_mode, AtomicU64, SymlinkCheck, }; +#[cfg(windows)] +use crate::metadata_cache::{CachedMetadata, MetadataCache}; + +/// Windows-only union of live `lstat` metadata and pre-cached metadata, so +/// `compute_status` sees one shape. Other platforms use `gix_index::fs::Metadata` +/// directly. 
+#[cfg(windows)] +enum FileMetadata<'a> { + Live(gix_index::fs::Metadata), + Cached(&'a CachedMetadata), +} + +#[cfg(windows)] +impl FileMetadata<'_> { + fn is_dir(&self) -> bool { + match self { + Self::Live(m) => m.is_dir(), + Self::Cached(c) => c.is_dir, + } + } + + fn is_symlink(&self) -> bool { + match self { + Self::Live(m) => m.is_symlink(), + Self::Cached(c) => c.is_symlink, + } + } + + fn len(&self) -> u64 { + match self { + Self::Live(m) => m.len(), + Self::Cached(c) => c.size, + } + } + + fn to_stat(&self) -> Result { + match self { + Self::Live(m) => gix_index::entry::Stat::from_fs(m), + Self::Cached(c) => Ok(c.to_stat()), + } + } + + fn mode_change( + &self, + entry_mode: gix_index::entry::Mode, + has_symlinks: bool, + executable_bit: bool, + ) -> Option { + match self { + Self::Live(m) => entry_mode.change_to_match_fs(m, has_symlinks, executable_bit), + Self::Cached(c) => entry_mode.change_to_match_fs_with_values( + !c.is_dir && !c.is_symlink, // is_file: regular file (not dir, not symlink) + c.is_dir, + c.is_symlink, + c.is_executable, + has_symlinks, + executable_bit, + ), + } + } +} /// Calculates the changes that need to be applied to an `index` to match the state of the `worktree` and makes them /// observable in `collector`, along with information produced by `compare` which gets to see blobs that may have changes, and @@ -63,6 +124,8 @@ pub fn index_as_worktree<'index, T, U, Find, E>( stack, filter, should_interrupt, + #[cfg(windows)] + metadata_cache, }: Context<'_>, options: Options, ) -> Result @@ -122,6 +185,8 @@ where path_backing, filter, options, + #[cfg(windows)] + metadata_cache, skipped_by_pathspec, skipped_by_entry_flags, @@ -228,6 +293,10 @@ struct State<'a, 'b> { filter: gix_filter::Pipeline, path_backing: &'b gix_index::PathStorageRef, options: &'a Options, + /// Optional pre-populated metadata cache for faster status checks on Windows. + /// Cache lookups happen before falling back to per-file syscalls. 
+ #[cfg(windows)] + metadata_cache: Option<&'a MetadataCache>, skipped_by_pathspec: &'a AtomicUsize, skipped_by_entry_flags: &'a AtomicUsize, @@ -374,53 +443,80 @@ impl<'index> State<'_, 'index> { } Err(err) => return Err(Error::Io(err.into())), }; - self.symlink_metadata_calls.fetch_add(1, Ordering::Relaxed); - let metadata = match gix_index::fs::Metadata::from_path_no_follow(worktree_path) { - Ok(metadata) if metadata.is_dir() => { - // index entries are normally only for files/symlinks - // if a file turned into a directory it was removed - // the only exception here are submodules which are - // part of the index despite being directories - if entry.mode.is_submodule() { - let status = submodule - .status(entry, rela_path) - .map_err(|err| Error::SubmoduleStatus { - rela_path: rela_path.into(), - source: Box::new(err), - })?; - return Ok(status.map(|status| Change::SubmoduleModification(status).into())); - } else { - return Ok(Some(Change::Removed.into())); + + // Acquire metadata. On Windows we consult the metadata cache first and + // only fall back to a syscall on miss; on other platforms per-file + // `lstat` is already fast, so we just do the syscall directly. 
+ #[cfg(windows)] + let metadata = if let Some(cached) = self.metadata_cache.and_then(|c| c.get(rela_path)) { + FileMetadata::Cached(cached) + } else { + self.symlink_metadata_calls.fetch_add(1, Ordering::Relaxed); + match gix_index::fs::Metadata::from_path_no_follow(worktree_path) { + Ok(m) => FileMetadata::Live(m), + Err(err) if gix_fs::io_err::is_not_found(err.kind(), err.raw_os_error()) => { + return Ok(Some(Change::Removed.into())) } + Err(err) => return Err(Error::Io(err.into())), } - Ok(metadata) => metadata, - Err(err) if gix_fs::io_err::is_not_found(err.kind(), err.raw_os_error()) => { - return Ok(Some(Change::Removed.into())) - } - Err(err) => { - return Err(Error::Io(err.into())); + }; + #[cfg(not(windows))] + let metadata = { + self.symlink_metadata_calls.fetch_add(1, Ordering::Relaxed); + match gix_index::fs::Metadata::from_path_no_follow(worktree_path) { + Ok(m) => m, + Err(err) if gix_fs::io_err::is_not_found(err.kind(), err.raw_os_error()) => { + return Ok(Some(Change::Removed.into())) + } + Err(err) => return Err(Error::Io(err.into())), } }; + + // Handle directory: index entries are normally only for files/symlinks. + // If a file turned into a directory it was removed. + // The only exception here are submodules which are part of the index despite being directories. 
+ if metadata.is_dir() { + if entry.mode.is_submodule() { + let status = submodule + .status(entry, rela_path) + .map_err(|err| Error::SubmoduleStatus { + rela_path: rela_path.into(), + source: Box::new(err), + })?; + return Ok(status.map(|status| Change::SubmoduleModification(status).into())); + } else { + return Ok(Some(Change::Removed.into())); + } + } + if entry.flags.contains(gix_index::entry::Flags::INTENT_TO_ADD) { return Ok(Some(EntryStatus::IntentToAdd)); } + + #[cfg(windows)] + let new_stat = metadata.to_stat()?; + #[cfg(not(windows))] let new_stat = gix_index::entry::Stat::from_fs(&metadata)?; - let executable_bit_changed = - match entry - .mode - .change_to_match_fs(&metadata, self.options.fs.symlink, self.options.fs.executable_bit) - { - Some(gix_index::entry::mode::Change::Type { new_mode }) => { - return Ok(Some( - Change::Type { - worktree_mode: new_mode, - } - .into(), - )) - } - Some(gix_index::entry::mode::Change::ExecutableBit) => true, - None => false, - }; + + #[cfg(windows)] + let mode_change = + metadata.mode_change(entry.mode, self.options.fs.symlink, self.options.fs.executable_bit); + #[cfg(not(windows))] + let mode_change = entry + .mode + .change_to_match_fs(&metadata, self.options.fs.symlink, self.options.fs.executable_bit); + let executable_bit_changed = match mode_change { + Some(gix_index::entry::mode::Change::Type { new_mode }) => { + return Ok(Some( + Change::Type { + worktree_mode: new_mode, + } + .into(), + )) + } + Some(gix_index::entry::mode::Change::ExecutableBit) => true, + None => false, + }; // We implement racy-git. See racy-git.txt in the git documentation for detailed documentation. 
// diff --git a/gix-status/src/index_as_worktree/types.rs b/gix-status/src/index_as_worktree/types.rs index 26d2698141..4485f69a30 100644 --- a/gix-status/src/index_as_worktree/types.rs +++ b/gix-status/src/index_as_worktree/types.rs @@ -3,6 +3,9 @@ use std::sync::atomic::AtomicBool; use bstr::{BStr, BString}; use gix_index::entry; +#[cfg(windows)] +use crate::metadata_cache::MetadataCache; + /// The error returned by [index_as_worktree()`](crate::index_as_worktree()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] @@ -55,6 +58,10 @@ pub struct Context<'a> { pub filter: gix_filter::Pipeline, /// A flag to query to learn if cancellation is requested. pub should_interrupt: &'a AtomicBool, + /// Windows-only pre-populated metadata cache. See [`crate::metadata_cache`]. + /// Transparent: `None`/empty/partial are all correct. + #[cfg(windows)] + pub metadata_cache: Option<&'a MetadataCache>, } /// Provide additional information collected during the runtime of [`index_as_worktree()`](crate::index_as_worktree()). diff --git a/gix-status/src/index_as_worktree_with_renames/mod.rs b/gix-status/src/index_as_worktree_with_renames/mod.rs index b362bc858b..a5d45ea7f1 100644 --- a/gix-status/src/index_as_worktree_with_renames/mod.rs +++ b/gix-status/src/index_as_worktree_with_renames/mod.rs @@ -153,6 +153,8 @@ pub(super) mod function { stack, filter, should_interrupt: ctx.should_interrupt, + #[cfg(windows)] + metadata_cache: ctx.metadata_cache, }, options.tracked_file_modifications, ) diff --git a/gix-status/src/index_as_worktree_with_renames/types.rs b/gix-status/src/index_as_worktree_with_renames/types.rs index d0e528c1e4..8a42948aea 100644 --- a/gix-status/src/index_as_worktree_with_renames/types.rs +++ b/gix-status/src/index_as_worktree_with_renames/types.rs @@ -342,6 +342,11 @@ pub struct Context<'a> { pub should_interrupt: &'a AtomicBool, /// The context for the directory walk. 
pub dirwalk: DirwalkContext<'a>, + /// An optional pre-populated metadata cache for faster status checks on Windows. + /// + /// See [`crate::index_as_worktree::Context::metadata_cache`] for details. + #[cfg(windows)] + pub metadata_cache: Option<&'a crate::metadata_cache::MetadataCache>, } /// All information that is required to perform a [dirwalk](gix_dir::walk()). diff --git a/gix-status/src/lib.rs b/gix-status/src/lib.rs index fd79017b89..ccc7df641f 100644 --- a/gix-status/src/lib.rs +++ b/gix-status/src/lib.rs @@ -37,6 +37,19 @@ use portable_atomic::AtomicU64; pub mod index_as_worktree; pub use index_as_worktree::function::index_as_worktree; +/// The metadata cache is a **Windows-only** optimization. Its job is to skip +/// per-file `lstat` calls by pre-populating stat results via one batched +/// directory enumeration. That trade only pays off where per-file stat is +/// expensive (Windows), not on Linux/macOS where `lstat` is sub-microsecond. +/// A Linux-friendly cache would almost certainly be keyed by *directory* (à la +/// git's `UNTRACKED_CACHE`) rather than by file path, so forcing this type to +/// exist there would encourage the wrong abstraction. Keep the two separate; +/// lift this gate if a cross-platform use case actually appears. +#[cfg(windows)] +pub mod metadata_cache; +#[cfg(windows)] +pub use metadata_cache::{CachedMetadata, MetadataCache}; + #[cfg(feature = "worktree-rewrites")] pub mod index_as_worktree_with_renames; #[cfg(feature = "worktree-rewrites")] diff --git a/gix-status/src/metadata_cache.rs b/gix-status/src/metadata_cache.rs new file mode 100644 index 0000000000..41dd0cfedc --- /dev/null +++ b/gix-status/src/metadata_cache.rs @@ -0,0 +1,546 @@ +//! Windows-only metadata cache — see the gate on `pub mod metadata_cache` in +//! [`crate`] for why this is Windows-only. +//! +//! [`prepare`] batches a parallel `GetFileInformationByHandleEx` walk of the +//! worktree (~30 ms / 90 k files) into a [`MetadataCache`] keyed by +//! 
worktree-relative path. `index_as_worktree` looks up each index entry +//! there instead of calling `lstat` (~1 s for the same tree). +//! +//! The cache is **transparent**: empty/partial/extra entries change speed +//! only, never correctness — misses fall through to a live syscall. + +use std::path::Path; + +use bstr::BString; + +/// Cached file metadata. +/// +/// Carries enough information to determine file type, detect mode changes, +/// build a [`gix_index::entry::Stat`] for comparison, and short-circuit content +/// reads via file size. +/// +/// All fields are platform-agnostic. When populating from a source that doesn't +/// provide some fields (e.g. Windows directory enumeration doesn't expose +/// `ino`/`uid`/`gid`), leave those as `0`/`false`. The status pipeline's stat +/// comparison on Windows treats zeros as "equal-by-default" for those fields. +#[derive(Debug, Clone, Default)] +pub struct CachedMetadata { + /// Whether this is a directory. + pub is_dir: bool, + /// Whether this is a symlink (or reparse point on Windows). + pub is_symlink: bool, + /// Whether the file has the executable bit set. + /// + /// Always `false` when populated from Windows batch enumeration — git on + /// Windows defaults to `core.filemode=false`, so the bit isn't tracked there. + pub is_executable: bool, + /// File size in bytes. + pub size: u64, + /// Modification time — seconds since Unix epoch. + pub mtime_secs: u32, + /// Modification time — nanoseconds component. + pub mtime_nsecs: u32, + /// Status/creation time — seconds since Unix epoch. + /// + /// On Windows this must be populated from the real `CreationTime`, not `mtime`: + /// the stat comparison in the status pipeline compares `ctime.secs` by default + /// (`trust_ctime=true`), and faking `ctime=mtime` causes spurious mismatches + /// for any file whose creation-time and modification-time differ. + pub ctime_secs: u32, + /// Status/creation time — nanoseconds component. 
+ pub ctime_nsecs: u32, + /// Device ID. Set to 0 if not available (always 0 on Windows). + pub dev: u64, + /// Inode number. Set to 0 if not available (always 0 on Windows). + pub ino: u64, + /// User ID. Set to 0 if not available (always 0 on Windows). + pub uid: u32, + /// Group ID. Set to 0 if not available (always 0 on Windows). + pub gid: u32, +} + +impl CachedMetadata { + /// Convert to gitoxide's [`Stat`](gix_index::entry::Stat) struct for index comparison. + /// + /// Truncates `dev`, `ino`, and `size` from 64 to 32 bits — matching what + /// [`gix_index::entry::stat::Stat::from_fs`] does on Unix, so both code + /// paths compare the same quantities. + pub fn to_stat(&self) -> gix_index::entry::Stat { + gix_index::entry::Stat { + mtime: gix_index::entry::stat::Time { + secs: self.mtime_secs, + nsecs: self.mtime_nsecs, + }, + ctime: gix_index::entry::stat::Time { + secs: self.ctime_secs, + nsecs: self.ctime_nsecs, + }, + dev: self.dev as u32, + ino: self.ino as u32, + uid: self.uid, + gid: self.gid, + size: self.size as u32, + } + } +} + +/// Metadata cache: maps worktree-relative paths (forward-slashed, in the exact +/// case as enumerated from disk) to cached metadata. +/// +/// Lookups are case-sensitive: callers must query with the same case the walker +/// emitted. On a case-insensitive worktree where the index path's case differs +/// from disk, the lookup misses and `index_as_worktree` falls back to a live +/// `lstat` — a few extra syscalls in a rare scenario. Folding cases together +/// would silently merge distinct files on case-sensitive volumes (Windows +/// per-directory case-sensitivity, NTFS POSIX mode), which would let the cache +/// return one file's stat for a query about another and silently misreport +/// tracked-file status. That's strictly worse than a few cache misses. 
+pub type MetadataCache = hashbrown::HashMap; + +/// Prepare a metadata cache by walking the worktree in parallel using +/// `GetFileInformationByHandleEx` with `FileIdBothDirectoryInfo`, skipping +/// subtrees flagged by the per-thread predicate produced by `make_excludes`. +/// +/// The returned cache can be attached to the status pipeline via +/// [`Context::metadata_cache`](crate::index_as_worktree::Context::metadata_cache) +/// — cache hits skip per-file syscalls. +/// +/// `thread_limit` caps parallelism. `None` uses all available cores; `Some(1)` +/// is single-threaded. +/// +/// `make_excludes` is called once on each worker thread and returns a predicate +/// that owns thread-local state (e.g. a `gix_worktree::Stack`). Each time the +/// walker is about to descend into a subdirectory, it calls the predicate with +/// the worktree-relative path; returning `true` skips that subtree. Callers +/// that don't need gitignore pruning can pass `|| |_: &bstr::BStr| false`, but +/// for typical projects with fat ignored dirs (`node_modules`, `target`) the +/// wasted enumeration makes the cache net-slower than plain per-file stats. +pub fn prepare( + worktree: &Path, + thread_limit: Option, + make_excludes: F, +) -> std::io::Result +where + F: Fn() -> E + Sync, + E: FnMut(&bstr::BStr) -> bool, +{ + windows::walk_worktree_parallel(worktree, thread_limit, make_excludes) +} + +/// Windows-specific implementation using `GetFileInformationByHandleEx` / +/// `FileIdBothDirectoryInfo`. Work-stealing across threads via `thread::scope`. 
+#[allow(unsafe_code)] +mod windows { + use super::*; + use std::collections::VecDeque; + use std::ffi::{c_void, OsString}; + use std::os::windows::ffi::{OsStrExt, OsStringExt}; + use std::sync::{Condvar, Mutex}; + use std::thread; + + use windows_sys::Win32::Foundation::{CloseHandle, INVALID_HANDLE_VALUE}; + use windows_sys::Win32::Storage::FileSystem::{ + CreateFileW, FileIdBothDirectoryInfo, GetFileInformationByHandleEx, FILE_ATTRIBUTE_DIRECTORY, + FILE_ATTRIBUTE_REPARSE_POINT, FILE_FLAG_BACKUP_SEMANTICS, FILE_ID_BOTH_DIR_INFO, + FILE_LIST_DIRECTORY, FILE_SHARE_DELETE, FILE_SHARE_READ, FILE_SHARE_WRITE, OPEN_EXISTING, + SYNCHRONIZE, + }; + + /// 64 KiB, u64-aligned — `FILE_ID_BOTH_DIR_INFO` contains LARGE_INTEGER fields that + /// require 8-byte alignment, and `Vec` guarantees it. Hoisted to the worker so + /// 6k+ directory walks reuse one allocation instead of allocating per call. + const BUFFER_U64S: usize = 8 * 1024; + + /// Work item for the parallel walker: (null-terminated UTF-16 absolute path, relative prefix). + /// + /// The path is stored pre-encoded so `CreateFileW` on the child can reuse the parent's + /// allocation without re-traversing `PathBuf`/`OsStr` each time. + type WorkItem = (Vec, String); + + /// Convert FILE_ID_BOTH_DIR_INFO to CachedMetadata. + fn cached_from_info(info: &FILE_ID_BOTH_DIR_INFO) -> CachedMetadata { + let size = info.EndOfFile as u64; + + // FILETIME values are LARGE_INTEGER holding 100ns intervals since 1601-01-01 UTC. + // `ctime` must come from `CreationTime` (not mtime): `gix_index::entry::stat::from_fs` + // on Windows populates ctime from `Metadata::created()`, which is CreationTime. If we + // faked ctime=mtime here, stat comparison would spuriously fail for any file where + // creation-time and modification-time differ, forcing an unnecessary content hash. 
+ let (mtime_secs, mtime_nsecs) = filetime_to_unix(info.LastWriteTime as u64); + let (ctime_secs, ctime_nsecs) = filetime_to_unix(info.CreationTime as u64); + + let is_dir = (info.FileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0; + let is_symlink = (info.FileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) != 0; + + // The executable bit, dev, ino, uid, and gid aren't exposed by Windows + // directory enumeration. Git on Windows defaults to core.filemode=false + // (so is_executable is ignored anyway); the remaining fields are only + // compared against matching zeros from `Stat::from_fs`'s Windows branch. + CachedMetadata { + is_dir, + is_symlink, + is_executable: false, + size, + mtime_secs, + mtime_nsecs, + ctime_secs, + ctime_nsecs, + dev: 0, + ino: 0, + uid: 0, + gid: 0, + } + } + + /// Convert a Windows FILETIME (100ns intervals since 1601-01-01 UTC) to Unix (secs, nsecs). + fn filetime_to_unix(ft: u64) -> (u32, u32) { + const EPOCH_DIFF: u64 = 116_444_736_000_000_000; + let unix_100ns = ft.saturating_sub(EPOCH_DIFF); + let secs = (unix_100ns / 10_000_000) as u32; + let nsecs = ((unix_100ns % 10_000_000) * 100) as u32; + (secs, nsecs) + } + + /// Build a null-terminated UTF-16 absolute path for `parent\name`. + fn join_utf16(parent: &[u16], name: &[u16]) -> Vec { + // Parent is null-terminated; drop the trailing NUL before joining. + let parent = parent.strip_suffix(&[0u16]).unwrap_or(parent); + let mut out = Vec::with_capacity(parent.len() + 1 + name.len() + 1); + out.extend_from_slice(parent); + if out.last().copied() != Some(b'\\' as u16) { + out.push(b'\\' as u16); + } + out.extend_from_slice(name); + out.push(0); + out + } + + /// Convert a filesystem path into a null-terminated UTF-16 buffer suitable for `CreateFileW`. + fn utf16_null_terminated(path: &Path) -> Vec { + let mut v: Vec = path.as_os_str().encode_wide().collect(); + v.push(0); + v + } + + /// Check if a UTF-16 name equals exactly ASCII ".git" (case-sensitive, matching the + /// prior behaviour). 
This is intentional: on Windows a mis-cased `.Git` is the same + /// file to the filesystem but conventionally never appears, and the cache is + /// look-through — a missed skip just means one extra cached entry that will be + /// ignored by the status pipeline. + fn name_is_dotgit(name: &[u16]) -> bool { + name.len() == 4 + && name[0] == b'.' as u16 + && name[1] == b'g' as u16 + && name[2] == b'i' as u16 + && name[3] == b't' as u16 + } + + /// Result type for directory walking to simplify the return type. + type WalkResult = (Vec<(BString, CachedMetadata)>, Vec); + + /// Walk a single directory using `GetFileInformationByHandleEx` with + /// `FileIdBothDirectoryInfo`. + /// + /// Returns (cacheable entries, subdirectories to recurse into). `buffer` is a + /// reusable 64 KiB u64-aligned scratch buffer; reusing it across calls avoids + /// a heap allocation per directory (6k+ per worktree on the Linux kernel). + fn walk_directory( + dir_path: &[u16], + rel_prefix: &str, + buffer: &mut [u64], + ) -> std::io::Result { + let mut files = Vec::new(); + let mut subdirs = Vec::new(); + + let handle = unsafe { + CreateFileW( + dir_path.as_ptr(), + FILE_LIST_DIRECTORY | SYNCHRONIZE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + std::ptr::null(), + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, + std::ptr::null_mut(), + ) + }; + + if handle == INVALID_HANDLE_VALUE { + // Directory doesn't exist or can't be read - not an error for a look-through cache. + return Ok((files, subdirs)); + } + + let buffer_bytes = (buffer.len() * 8) as u32; + + loop { + let success = unsafe { + GetFileInformationByHandleEx( + handle, + FileIdBothDirectoryInfo, + buffer.as_mut_ptr().cast::(), + buffer_bytes, + ) + }; + if success == 0 { + // End of enumeration (ERROR_NO_MORE_FILES) or access denied / similar. + // Either way, stop: the cache is best-effort and correctness falls back + // to per-file syscalls in `index_as_worktree`. 
+ break; + } + + let mut offset = 0usize; + loop { + let info_ptr = unsafe { + buffer.as_ptr().cast::().add(offset).cast::() + }; + let info = unsafe { &*info_ptr }; + + let name_len = (info.FileNameLength / 2) as usize; + let name_slice = + unsafe { std::slice::from_raw_parts(info.FileName.as_ptr(), name_len) }; + + let is_dot = name_len == 1 && name_slice[0] == b'.' as u16; + let is_dotdot = name_len == 2 + && name_slice[0] == b'.' as u16 + && name_slice[1] == b'.' as u16; + + if !is_dot && !is_dotdot && !name_is_dotgit(name_slice) { + let is_dir = (info.FileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0; + let is_reparse = (info.FileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) != 0; + + let name_str = OsString::from_wide(name_slice).to_string_lossy().into_owned(); + let rel_path = if rel_prefix.is_empty() { + name_str + } else { + format!("{rel_prefix}/{name_str}") + }; + + let meta = cached_from_info(info); + if is_dir && !is_reparse { + let child = join_utf16(dir_path, name_slice); + subdirs.push((child, rel_path.clone())); + } + files.push((rel_path.into_bytes().into(), meta)); + } + + if info.NextEntryOffset == 0 { + break; + } + offset += info.NextEntryOffset as usize; + } + } + + unsafe { CloseHandle(handle) }; + Ok((files, subdirs)) + } + + /// A directory the walk hasn't descended into yet, plus a count of workers + /// currently processing work so the last one out can tell the others to exit. + struct WorkQueue { + dirs: VecDeque, + active_workers: usize, + } + + /// Walk the worktree using work-stealing parallelism. 
+ pub fn walk_worktree_parallel( + worktree: &Path, + thread_limit: Option, + make_excludes: F, + ) -> std::io::Result + where + F: Fn() -> E + Sync, + E: FnMut(&bstr::BStr) -> bool, + { + let num_threads = thread_limit + .unwrap_or_else(|| { + std::thread::available_parallelism() + .map(std::num::NonZero::get) + .unwrap_or(4) + }) + .max(1); + + if num_threads == 1 { + return walk_worktree_single_threaded(worktree, make_excludes()); + } + + let queue_mutex = Mutex::new(WorkQueue { + dirs: VecDeque::from([(utf16_null_terminated(worktree), String::new())]), + active_workers: 0, + }); + let cvar = Condvar::new(); + let shared_cache = Mutex::new(MetadataCache::default()); + + thread::scope(|s| { + for _ in 0..num_threads { + let make_excludes = &make_excludes; + s.spawn(|| worker(&queue_mutex, &cvar, &shared_cache, make_excludes())); + } + }); + + Ok(shared_cache.into_inner().unwrap()) + } + + /// One worker of the parallel walker. Grabs batches of directories from the + /// shared queue, walks them into a thread-local cache, and pushes any discovered + /// subdirectories back onto the queue. Exits when the queue is drained and no + /// worker is still producing. + /// + /// `is_excluded` is a thread-local predicate that returns true for directories + /// whose contents should be skipped (gitignored). The excluded directory's own + /// metadata entry is still cached; only recursion is avoided. + fn worker bool>( + queue_mutex: &Mutex, + cvar: &Condvar, + shared_cache: &Mutex, + mut is_excluded: E, + ) { + let mut local_cache = MetadataCache::default(); + let mut local_stack: Vec = Vec::new(); + let mut buffer = vec![0u64; BUFFER_U64S]; + + loop { + // Claim work, or exit if the walk is done. + { + let mut queue = queue_mutex.lock().unwrap(); + loop { + // Steal up to half of the queue (capped) to reduce re-locking while + // still leaving work for other threads to pick up. 
+ let take = queue.dirs.len().div_ceil(2).min(32); + if take > 0 { + local_stack.extend(queue.dirs.drain(..take)); + queue.active_workers += 1; + break; + } + if queue.active_workers == 0 { + // Queue is empty and no one is producing more work: we're done. + cvar.notify_all(); + shared_cache.lock().unwrap().extend(local_cache); + return; + } + queue = cvar.wait(queue).unwrap(); + } + } + + // Process the claimed directories outside the lock. + let mut new_dirs: Vec = Vec::new(); + while let Some((dir, rel_prefix)) = local_stack.pop() { + if let Ok((files, subdirs)) = walk_directory(&dir, &rel_prefix, &mut buffer) { + local_cache.extend(files); + for (child_path, child_rel) in subdirs { + if !is_excluded(child_rel.as_bytes().into()) { + new_dirs.push((child_path, child_rel)); + } + } + } + } + + // Return discovered subdirectories; wake anyone waiting. + let mut queue = queue_mutex.lock().unwrap(); + queue.dirs.extend(new_dirs); + queue.active_workers -= 1; + cvar.notify_all(); + } + } + + /// Simple single-threaded walk for thread_limit=1. 
+ fn walk_worktree_single_threaded bool>( + worktree: &Path, + mut is_excluded: E, + ) -> std::io::Result { + let mut cache = MetadataCache::default(); + let mut dir_stack: Vec = vec![(utf16_null_terminated(worktree), String::new())]; + let mut buffer = vec![0u64; BUFFER_U64S]; + + while let Some((dir, rel_prefix)) = dir_stack.pop() { + if let Ok((files, subdirs)) = walk_directory(&dir, &rel_prefix, &mut buffer) { + cache.extend(files); + for (child_path, child_rel) in subdirs { + if !is_excluded(child_rel.as_bytes().into()) { + dir_stack.push((child_path, child_rel)); + } + } + } + } + + Ok(cache) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cached_metadata_to_stat() { + let cached = CachedMetadata { + is_dir: false, + is_symlink: false, + is_executable: true, + size: 1234, + mtime_secs: 1700000000, + mtime_nsecs: 500_000_000, + ctime_secs: 1699999999, + ctime_nsecs: 100_000_000, + dev: 123, + ino: 456, + uid: 1000, + gid: 1000, + }; + let stat = cached.to_stat(); + assert_eq!(stat.size, 1234); + assert_eq!(stat.mtime.secs, 1700000000); + assert_eq!(stat.mtime.nsecs, 500_000_000); + assert_eq!(stat.ctime.secs, 1699999999); + assert_eq!(stat.ctime.nsecs, 100_000_000); + assert_eq!(stat.dev, 123); + assert_eq!(stat.ino, 456); + assert_eq!(stat.uid, 1000); + assert_eq!(stat.gid, 1000); + } + + #[test] + fn test_lookup_is_case_sensitive() { + // The cache is keyed by the exact path bytes the walker emits. + // Mixed-case lookups miss rather than silently aliasing onto the wrong + // file — a case-insensitive worktree falls back to a live `lstat` on miss. 
+ let mut cache = MetadataCache::default(); + let meta = CachedMetadata { + size: 42, + ..Default::default() + }; + cache.insert(BString::from(b"src/foo.rs".as_slice()), meta.clone()); + + assert!(cache.get(&b"src/foo.rs"[..]).is_some()); + assert!(cache.get(&b"SRC/Foo.rs"[..]).is_none()); + + cache.insert(BString::from("ünïcode.txt".as_bytes()), meta); + assert!(cache.get("ünïcode.txt".as_bytes()).is_some()); + } + + #[test] + fn test_prepare_returns_cache() { + // Use a unique temp directory to avoid walking other files. + use std::time::{SystemTime, UNIX_EPOCH}; + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let temp_dir = std::env::temp_dir().join(format!("gix_status_test_{timestamp}")); + std::fs::create_dir_all(&temp_dir).unwrap(); + + let test_file = temp_dir.join("test.txt"); + std::fs::write(&test_file, b"hello").unwrap(); + + let subdir = temp_dir.join("subdir"); + std::fs::create_dir(&subdir).unwrap(); + let nested_file = subdir.join("nested.txt"); + std::fs::write(&nested_file, b"world").unwrap(); + + let cache = prepare(&temp_dir, Some(1), || |_: &bstr::BStr| false).unwrap(); + + assert!(!cache.is_empty()); + assert!(cache.contains_key(&b"test.txt"[..])); + assert!(cache.contains_key(&b"subdir/nested.txt"[..])); + + let _ = std::fs::remove_dir_all(&temp_dir); + } +} diff --git a/gix-status/tests/status/index_as_worktree.rs b/gix-status/tests/status/index_as_worktree.rs index 4e50ba2646..e49574c112 100644 --- a/gix-status/tests/status/index_as_worktree.rs +++ b/gix-status/tests/status/index_as_worktree.rs @@ -187,6 +187,8 @@ fn fixture_filtered_detailed( }, ), should_interrupt: &AtomicBool::default(), + #[cfg(windows)] + metadata_cache: None, }; let options = Options { fs: fs_capabilities.map_or_else(|| gix_fs::Capabilities::probe(&git_dir), |new| new(&git_dir)), @@ -1054,6 +1056,8 @@ fn racy_git() { stack, filter: Default::default(), should_interrupt: &AtomicBool::default(), + #[cfg(windows)] + 
metadata_cache: None,
     };
     let out = index_as_worktree(
         &index,
diff --git a/gix-status/tests/status/index_as_worktree_with_renames.rs b/gix-status/tests/status/index_as_worktree_with_renames.rs
index cc8e546831..65966ecc1f 100644
--- a/gix-status/tests/status/index_as_worktree_with_renames.rs
+++ b/gix-status/tests/status/index_as_worktree_with_renames.rs
@@ -330,6 +330,8 @@ fn fixture_filtered_detailed(
             current_dir: &cwd,
             ignore_case_index_lookup: None,
         },
+        #[cfg(windows)]
+        metadata_cache: None,
     };
     let options = Options {
         object_hash: gix_hash::Kind::Sha1,
diff --git a/gix/src/status/index_worktree.rs b/gix/src/status/index_worktree.rs
index 1aade3a478..a7933ced74 100644
--- a/gix/src/status/index_worktree.rs
+++ b/gix/src/status/index_worktree.rs
@@ -27,6 +27,12 @@ pub enum Error {
     StatOptions(#[from] config::stat_options::Error),
     #[error(transparent)]
     ResourceCache(#[from] crate::diff::resource_cache::Error),
+    #[cfg(windows)]
+    #[error("Failed to prepare metadata cache")]
+    PrepareMetadataCache(#[from] std::io::Error),
+    #[cfg(windows)]
+    #[error(transparent)]
+    OpenIndex(#[from] crate::worktree::open_index::Error),
 }
 
 /// Options for use with [Repository::index_worktree_status()].
@@ -81,6 +87,8 @@ impl Repository {
     ///   - A flag to stop the whole operation.
     /// * `options`
     ///   - Additional configuration for all parts of the operation.
+    /// * `metadata_cache` *(Windows only)*
+    ///   - Optional pre-populated metadata cache, see [`gix_status::metadata_cache`]; `None` runs without one.
     ///
     /// ### Note
     ///
@@ -100,6 +108,7 @@ impl Repository {
         progress: &mut dyn gix_features::progress::Progress,
         should_interrupt: &AtomicBool,
         options: Options,
+        #[cfg(windows)] metadata_cache: Option<&gix_status::MetadataCache>,
     ) -> Result
     where
         T: Send + Clone,
@@ -148,6 +157,8 @@ impl Repository {
                     current_dir: cwd,
                     ignore_case_index_lookup: accelerate_lookup.as_ref(),
                 },
+                #[cfg(windows)]
+                metadata_cache,
             },
             gix_status::index_as_worktree_with_renames::Options {
                 sorting: options.sorting,
diff --git a/gix/src/status/iter/mod.rs b/gix/src/status/iter/mod.rs
index 38f4aa51e9..363f73b860 100644
--- a/gix/src/status/iter/mod.rs
+++ b/gix/src/status/iter/mod.rs
@@ -65,6 +65,16 @@ where
             .unwrap_or_default();
         let should_interrupt = self.should_interrupt.clone().unwrap_or_default();
         let submodule = BuiltinSubmoduleStatus::new(self.repo.clone().into_sync(), self.submodules)?;
+        #[cfg(windows)]
+        let metadata_cache = match self.metadata_cache {
+            crate::status::MetadataCacheConfig::Provided(cache) => Some(cache),
+            crate::status::MetadataCacheConfig::Disabled => None,
+            // Best-effort: the cache is optional, so if the prep walk fails (missing workdir,
+            // syscall error) the error is discarded and status falls back to stat-based checks.
+            crate::status::MetadataCacheConfig::Auto => {
+                crate::status::build_metadata_cache(self.repo, None).ok()
+            }
+        };
         #[cfg(feature = "parallel")]
         {
             let (tx, rx) = std::sync::mpsc::channel();
@@ -134,6 +144,8 @@ where
                 &mut progress,
                 &should_interrupt,
                 options,
+                #[cfg(windows)]
+                metadata_cache.as_ref(),
             )?;
             Ok(Outcome {
                 index_worktree: out,
@@ -197,6 +209,8 @@ where
                 &mut progress,
                 &should_interrupt,
                 options,
+                #[cfg(windows)]
+                metadata_cache.as_ref(),
             )?;
             let mut iter = Iter {
                 items: Vec::new().into_iter(),
diff --git a/gix/src/status/mod.rs b/gix/src/status/mod.rs
index b2770478f5..8b0f2ca41d 100644
--- a/gix/src/status/mod.rs
+++ b/gix/src/status/mod.rs
@@ -15,6 +15,23 @@ where
     index_worktree_options: index_worktree::Options,
     tree_index_renames: tree_index::TrackRenames,
     should_interrupt: Option,
+    #[cfg(windows)]
+    metadata_cache: MetadataCacheConfig,
+}
+
+/// Windows-only: controls the metadata cache. `Auto` (default) trades a
+/// one-shot gitignore-aware worktree walk (~30 ms / 90 k files) for avoiding
+/// per-file `lstat` during status (~1 s for the same tree).
+#[cfg(windows)]
+#[derive(Default)]
+pub enum MetadataCacheConfig {
+    /// Prepare the cache inside `into_iter` using all cores; if that fails, status runs without a cache.
+    #[default]
+    Auto,
+    /// Skip the cache and rely on stat-based status.
+    Disabled,
+    /// Use this pre-built cache instead of preparing one.
+    Provided(gix_status::MetadataCache),
 }
 
 /// How to obtain a submodule's status.
@@ -114,6 +131,8 @@ impl Repository {
                 rewrites: None,
                 thread_limit: None,
             },
+            #[cfg(windows)]
+            metadata_cache: MetadataCacheConfig::default(),
         };
 
         let untracked = self
@@ -232,6 +251,51 @@ pub mod into_iter {
     }
 }
 
+/// Build a gitignore-aware Windows metadata cache for the worktree of `repo`, passing
+/// `thread_limit` on to [`gix_status::metadata_cache::prepare`]. Shared between the explicit
+/// `prepare_index_worktree_metadata_cache` and the Auto branch in `into_iter`.
+///
+/// Fails if `repo` has no worktree, if the index cannot be opened, or if `prepare` reports an error.
+#[cfg(windows)]
+pub(crate) fn build_metadata_cache(
+    repo: &Repository,
+    thread_limit: Option<usize>,
+) -> Result<gix_status::MetadataCache, crate::status::index_worktree::Error> {
+    let workdir = repo
+        .workdir()
+        .ok_or(crate::status::index_worktree::Error::MissingWorkDir)?;
+    let sync_repo = repo.clone().into_sync();
+    let index = repo.index_or_empty()?;
+    let index_state: &gix_index::State = &index;
+
+    // Factory handed to `prepare`: each call builds a fresh exclude predicate from its own
+    // thread-local repository handle.
+    let make_excludes = || -> Box<dyn FnMut(&crate::bstr::BStr) -> bool> {
+        let thread_repo = sync_repo.to_thread_local();
+        // Best-effort: if the exclude stack cannot be created, treat nothing as excluded.
+        let Ok(stack) = thread_repo.excludes(
+            index_state,
+            None,
+            gix_worktree::stack::state::ignore::Source::WorktreeThenIdMappingIfNotSkipped,
+        ) else {
+            return Box::new(|_| false);
+        };
+        let mut stack = stack.detach();
+        let objects = thread_repo.objects.clone();
+        // NOTE(review): every path is looked up as `Mode::DIR`; presumably `prepare` only
+        // queries directories - confirm against `gix_status::metadata_cache::prepare`.
+        Box::new(move |path: &crate::bstr::BStr| -> bool {
+            stack
+                .at_entry(path, Some(gix_index::entry::Mode::DIR), &objects)
+                .map(|p| p.is_excluded())
+                // A failed lookup counts as "not excluded".
+                .unwrap_or(false)
+        })
+    };
+
+    Ok(gix_status::metadata_cache::prepare(workdir, thread_limit, make_excludes)?)
+}
+
 mod platform;
 
 ///
diff --git a/gix/src/status/platform.rs b/gix/src/status/platform.rs
index ececa94611..5df0219f53 100644
--- a/gix/src/status/platform.rs
+++ b/gix/src/status/platform.rs
@@ -125,3 +125,36 @@ where
         self
     }
 }
+
+/// Windows-only metadata-cache builder methods. See
+/// [`crate::status::MetadataCacheConfig`] for the default-on behaviour.
+#[cfg(windows)]
+impl<Progress> Platform<'_, Progress>
+where
+    Progress: gix_features::progress::Progress,
+{
+    /// Use `cache` instead of building one. For out-of-band prep (e.g. file-
+    /// watcher refresh) reused across status calls.
+    pub fn index_worktree_metadata_cache(mut self, cache: gix_status::MetadataCache) -> Self {
+        self.metadata_cache = crate::status::MetadataCacheConfig::Provided(cache);
+        self
+    }
+
+    /// Skip the metadata cache. Prefer the Auto default unless measurements show it does not pay off.
+    pub fn disable_index_worktree_metadata_cache(mut self) -> Self {
+        self.metadata_cache = crate::status::MetadataCacheConfig::Disabled;
+        self
+    }
+
+    /// Eagerly prepare the cache with a specific `thread_limit` (`Some(1)` =
+    /// single-threaded, `None` = all cores). Use this to pick parallelism or
+    /// to fail-fast before building the iterator.
+    pub fn prepare_index_worktree_metadata_cache(
+        mut self,
+        thread_limit: Option<usize>,
+    ) -> Result<Self, crate::status::index_worktree::Error> {
+        let cache = crate::status::build_metadata_cache(self.repo, thread_limit)?;
+        self.metadata_cache = crate::status::MetadataCacheConfig::Provided(cache);
+        Ok(self)
+    }
+}