From 6c6781f3e526f56cefb3daceb5a045dc2f90985d Mon Sep 17 00:00:00 2001 From: yishuiliunian Date: Thu, 25 Jun 2026 09:34:24 +0800 Subject: [PATCH] fix(search): bound glob/grep tree-walk with cooperative + watchdog timeout A Glob over a path containing a network mount (rclone/OSS NFS) hung the agent loop forever: the single-threaded walker exhaustively traversed the whole tree for a zero-match pattern with no deadline, blocking the spawn_blocking thread that the agent loop awaits. Two-layer fix: - Inner cooperative deadline (ResourceLimits::walk_timeout, 30s) checked between walk entries; returns partial results + timed_out so the tool can tell the LLM to narrow `path`. - Outer hard floor via the runtime per-tool watchdog, now extended from Bash-only to all fs-read tools (Glob/Grep/Ls/Read/ReadPdf/ReadImage/ ReadHtml) with a typed StaleReason. The watchdog is applied at a single convergence point (execute_tool_watchdogged) that BOTH the streaming early-start path and the normal approval path route through, so a tool can never be bounded on one path and unbounded on the other. Also: parallelize the glob walker (mirrors grep's build_parallel), default follow_links to false (ripgrep parity, avoids cross-mount escape/cycles), decouple `truncated` from the timeout signal (no spurious overflow files), add a deterministic path tiebreak to glob output, dedup the timeout notice into loopal-tool-api, and clamp search `max` to >=1. Design notes under design/glob-traversal-hang/. 
--- crates/loopal-backend/src/limits.rs | 6 + crates/loopal-backend/src/search/glob.rs | 103 ++++--- crates/loopal-backend/src/search/grep.rs | 14 +- crates/loopal-backend/src/search/grep_file.rs | 2 + crates/loopal-backend/src/search/walker.rs | 4 +- crates/loopal-backend/tests/suite.rs | 4 + .../tests/suite/glob_parallel_test.rs | 76 +++++ .../tests/suite/search_timeout_test.rs | 118 ++++++++ .../src/agent_loop/streaming_tool_exec.rs | 5 +- .../src/agent_loop/tool_exec.rs | 35 ++- .../src/agent_loop/tool_watchdog.rs | 32 ++- crates/loopal-tool-api/src/backend_types.rs | 2 + crates/loopal-tool-api/src/lib.rs | 6 +- crates/loopal-tool-api/src/truncate.rs | 4 + crates/tools/filesystem/glob/src/lib.rs | 19 +- .../glob/tests/suite/glob_tool_edge_test.rs | 49 ++++ .../tools/filesystem/grep/src/grep_format.rs | 7 + crates/tools/filesystem/grep/tests/suite.rs | 2 + .../grep/tests/suite/grep_timeout_test.rs | 62 +++++ .../0001-root-cause-analysis.md | 261 ++++++++++++++++++ .../0002-parallel-glob-walker.md | 218 +++++++++++++++ .../0003-post-review-consolidation.md | 58 ++++ 22 files changed, 1018 insertions(+), 69 deletions(-) create mode 100644 crates/loopal-backend/tests/suite/glob_parallel_test.rs create mode 100644 crates/loopal-backend/tests/suite/search_timeout_test.rs create mode 100644 crates/tools/filesystem/grep/tests/suite/grep_timeout_test.rs create mode 100644 design/glob-traversal-hang/0001-root-cause-analysis.md create mode 100644 design/glob-traversal-hang/0002-parallel-glob-walker.md create mode 100644 design/glob-traversal-hang/0003-post-review-consolidation.md diff --git a/crates/loopal-backend/src/limits.rs b/crates/loopal-backend/src/limits.rs index edc60d3b..feb383fc 100644 --- a/crates/loopal-backend/src/limits.rs +++ b/crates/loopal-backend/src/limits.rs @@ -22,6 +22,11 @@ pub struct ResourceLimits { pub max_fetch_bytes: usize, /// Default shell command timeout. 
pub default_timeout: Duration, + /// Cooperative deadline checked between glob/grep walk entries: bounds + /// slow-but-responsive trees and returns partial results. A syscall stuck + /// on a dead mount can't be interrupted here — that case is bounded by the + /// runtime per-tool watchdog instead. + pub walk_timeout: Duration, /// HTTP fetch timeout. pub fetch_timeout: Duration, /// Maximum image file size in bytes. @@ -40,6 +45,7 @@ impl Default for ResourceLimits { max_grep_matches: 500, max_fetch_bytes: 5 * 1024 * 1024, // 5 MB default_timeout: Duration::from_secs(300), // 5 min + walk_timeout: Duration::from_secs(30), fetch_timeout: Duration::from_secs(30), image_max_bytes: IMAGE_MAX_BYTES, image_max_pixels: IMAGE_MAX_PIXELS, diff --git a/crates/loopal-backend/src/search/glob.rs b/crates/loopal-backend/src/search/glob.rs index ed212b45..3ccd7a69 100644 --- a/crates/loopal-backend/src/search/glob.rs +++ b/crates/loopal-backend/src/search/glob.rs @@ -1,17 +1,18 @@ -//! Glob pattern search with file-type filtering and modification time. - use std::path::Path; -use std::time::UNIX_EPOCH; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Instant, UNIX_EPOCH}; use globset::Glob; +use ignore::WalkState; use loopal_error::ToolIoError; use loopal_tool_api::backend_types::{GlobEntry, GlobOptions, GlobSearchResult}; use loopal_tool_api::save_to_overflow_file; +use parking_lot::Mutex; use crate::limits::ResourceLimits; use crate::search::{overflow_fmt, walker}; -/// Execute a glob search and return matching entries. 
pub fn glob_search( opts: &GlobOptions, cwd: &Path, @@ -25,50 +26,75 @@ pub fn glob_search( let glob = Glob::new(&opts.pattern).map_err(|e| ToolIoError::Other(format!("invalid glob: {e}")))?; - let matcher = glob.compile_matcher(); + let max = opts.max_results.min(limits.max_glob_results).max(1); - let max = opts.max_results.min(limits.max_glob_results); - let Some(walker) = walker::build_walker(&search_path, opts.type_filter.as_deref()) else { + let Some(w) = walker::build_walker(&search_path, opts.type_filter.as_deref()) else { return Ok(GlobSearchResult { entries: Vec::new(), truncated: false, + timed_out: false, overflow_path: None, }); }; - let mut entries = Vec::new(); - let mut truncated = false; - - for entry in walker.build().flatten() { - if !entry.file_type().is_some_and(|ft| ft.is_file()) { - continue; - } - let path = entry.path(); - let rel = match path.strip_prefix(&search_path) { - Ok(r) => r, - Err(_) => continue, - }; - if !matcher.is_match(rel) { - continue; - } - let modified_secs = entry - .metadata() - .ok() - .and_then(|m| m.modified().ok()) - .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) - .map(|d| d.as_secs()); - - entries.push(GlobEntry { - path: path.to_string_lossy().into_owned(), - modified_secs, - }); + let deadline = Instant::now() + limits.walk_timeout; + let done = Arc::new(AtomicBool::new(false)); + let timed_out = Arc::new(AtomicBool::new(false)); + let entries: Arc>> = Arc::new(Mutex::new(Vec::new())); + let search_path = Arc::new(search_path); + let matcher = Arc::new(glob.compile_matcher()); - if entries.len() >= max { - truncated = true; - break; - } - } + w.build_parallel().run(|| { + let done = Arc::clone(&done); + let timed_out = Arc::clone(&timed_out); + let entries = Arc::clone(&entries); + let search_path = Arc::clone(&search_path); + let matcher = Arc::clone(&matcher); + Box::new(move |entry| { + if done.load(Ordering::Relaxed) { + return WalkState::Quit; + } + if Instant::now() >= deadline { + done.store(true, 
Ordering::Relaxed); + timed_out.store(true, Ordering::Relaxed); + return WalkState::Quit; + } + let Ok(entry) = entry else { + return WalkState::Continue; + }; + if !entry.file_type().is_some_and(|ft| ft.is_file()) { + return WalkState::Continue; + } + let Ok(rel) = entry.path().strip_prefix(search_path.as_path()) else { + return WalkState::Continue; + }; + if !matcher.is_match(rel) { + return WalkState::Continue; + } + let modified_secs = entry + .metadata() + .ok() + .and_then(|m| m.modified().ok()) + .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) + .map(|d| d.as_secs()); + let n = { + let mut guard = entries.lock(); + guard.push(GlobEntry { + path: entry.path().to_string_lossy().into_owned(), + modified_secs, + }); + guard.len() + }; + if n >= max { + done.store(true, Ordering::Relaxed); + return WalkState::Quit; + } + WalkState::Continue + }) + }); + let entries = Arc::try_unwrap(entries).unwrap().into_inner(); + let truncated = entries.len() >= max; let overflow_path = if truncated { Some(save_to_overflow_file( &overflow_fmt::serialize_glob_results(&entries), @@ -81,6 +107,7 @@ pub fn glob_search( Ok(GlobSearchResult { entries, truncated, + timed_out: timed_out.load(Ordering::Relaxed), overflow_path, }) } diff --git a/crates/loopal-backend/src/search/grep.rs b/crates/loopal-backend/src/search/grep.rs index 62adb215..5ea7802d 100644 --- a/crates/loopal-backend/src/search/grep.rs +++ b/crates/loopal-backend/src/search/grep.rs @@ -1,6 +1,7 @@ use std::path::Path; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::time::Instant; use globset::Glob; use ignore::WalkState; @@ -55,12 +56,14 @@ pub fn grep_search( None => None, }; - let max = opts.max_matches.min(limits.max_grep_matches); + let max = opts.max_matches.min(limits.max_grep_matches).max(1); let ctx_before = opts.context_before; let ctx_after = opts.context_after; let multiline = opts.multiline; let total = Arc::new(AtomicUsize::new(0)); let done = 
Arc::new(AtomicBool::new(false)); + let timed_out = Arc::new(AtomicBool::new(false)); + let deadline = Instant::now() + limits.walk_timeout; let results: Arc>> = Arc::new(Mutex::new(Vec::new())); let search_path = Arc::new(search_path); let glob_matcher = Arc::new(glob_matcher); @@ -74,11 +77,17 @@ pub fn grep_search( let search_path = Arc::clone(&search_path); let total = Arc::clone(&total); let done = Arc::clone(&done); + let timed_out = Arc::clone(&timed_out); let results = Arc::clone(&results); Box::new(move |entry| { if done.load(Ordering::Relaxed) { return WalkState::Quit; } + if Instant::now() >= deadline { + done.store(true, Ordering::Relaxed); + timed_out.store(true, Ordering::Relaxed); + return WalkState::Quit; + } let entry = match entry { Ok(e) => e, Err(_) => return WalkState::Continue, @@ -100,9 +109,10 @@ pub fn grep_search( }); let file_matches = Arc::try_unwrap(results).unwrap().into_inner(); - let truncated = done.load(Ordering::Relaxed); + let truncated = total.load(Ordering::Relaxed) >= max; Ok(GrepSearchResult { total_match_count: total.load(Ordering::Relaxed), + timed_out: timed_out.load(Ordering::Relaxed), overflow_path: maybe_save_overflow(truncated, &file_matches), file_matches, }) diff --git a/crates/loopal-backend/src/search/grep_file.rs b/crates/loopal-backend/src/search/grep_file.rs index ec38fd58..3de97682 100644 --- a/crates/loopal-backend/src/search/grep_file.rs +++ b/crates/loopal-backend/src/search/grep_file.rs @@ -43,6 +43,7 @@ pub(crate) fn empty_result() -> GrepSearchResult { GrepSearchResult { file_matches: Vec::new(), total_match_count: 0, + timed_out: false, overflow_path: None, } } @@ -90,6 +91,7 @@ pub(crate) fn search_single_file( let overflow_path = maybe_save_overflow(truncated, &file_matches); Ok(GrepSearchResult { total_match_count: count, + timed_out: false, file_matches, overflow_path, }) diff --git a/crates/loopal-backend/src/search/walker.rs b/crates/loopal-backend/src/search/walker.rs index 283aa960..b9b4cb76 
100644 --- a/crates/loopal-backend/src/search/walker.rs +++ b/crates/loopal-backend/src/search/walker.rs @@ -7,7 +7,7 @@ use ignore::types::TypesBuilder; /// Build a `WalkBuilder` with shared defaults. /// -/// * Follows symlinks. +/// * Does not follow symlinks (ripgrep default) — avoids cross-mount escape and traversal cycles. /// * Respects `.gitignore` (ignore crate default). /// * Applies file-type filtering when `type_filter` is given. /// @@ -15,7 +15,7 @@ use ignore::types::TypesBuilder; /// caller should short-circuit with an empty result. pub fn build_walker(search_path: &Path, type_filter: Option<&str>) -> Option { let mut builder = WalkBuilder::new(search_path); - builder.follow_links(true); + builder.follow_links(false); if let Some(ty) = type_filter { let mut tb = TypesBuilder::new(); diff --git a/crates/loopal-backend/tests/suite.rs b/crates/loopal-backend/tests/suite.rs index 50241357..f3d1720f 100644 --- a/crates/loopal-backend/tests/suite.rs +++ b/crates/loopal-backend/tests/suite.rs @@ -5,6 +5,8 @@ mod approved_paths_test; mod batch_test; #[path = "suite/fetch_headers_test.rs"] mod fetch_headers_test; +#[path = "suite/glob_parallel_test.rs"] +mod glob_parallel_test; #[path = "suite/image_test.rs"] mod image_test; #[path = "suite/log_file_test.rs"] @@ -15,5 +17,7 @@ mod path_approval_test; mod process_group_test; #[path = "suite/resolve_checked_test.rs"] mod resolve_checked_test; +#[path = "suite/search_timeout_test.rs"] +mod search_timeout_test; #[path = "suite/tmp_cleanup_test.rs"] mod tmp_cleanup_test; diff --git a/crates/loopal-backend/tests/suite/glob_parallel_test.rs b/crates/loopal-backend/tests/suite/glob_parallel_test.rs new file mode 100644 index 00000000..1e420f02 --- /dev/null +++ b/crates/loopal-backend/tests/suite/glob_parallel_test.rs @@ -0,0 +1,76 @@ +use loopal_backend::ResourceLimits; +use loopal_backend::search::glob_search; +use loopal_tool_api::backend_types::GlobOptions; + +fn glob_opts(pattern: &str) -> GlobOptions { + 
GlobOptions { + pattern: pattern.to_string(), + path: None, + type_filter: None, + max_results: 10_000, + } +} + +#[test] +fn parallel_glob_finds_all_nested_matches() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("a.rs"), "").unwrap(); + let sub = tmp.path().join("sub/deep"); + std::fs::create_dir_all(&sub).unwrap(); + std::fs::write(tmp.path().join("sub/b.rs"), "").unwrap(); + std::fs::write(sub.join("c.rs"), "").unwrap(); + std::fs::write(tmp.path().join("skip.txt"), "").unwrap(); + + let res = glob_search( + &glob_opts("**/*.rs"), + tmp.path(), + &ResourceLimits::default(), + ) + .unwrap(); + + let paths: Vec<&str> = res.entries.iter().map(|e| e.path.as_str()).collect(); + assert_eq!(res.entries.len(), 3); + assert!(paths.iter().any(|p| p.ends_with("a.rs"))); + assert!(paths.iter().any(|p| p.ends_with("b.rs"))); + assert!(paths.iter().any(|p| p.ends_with("c.rs"))); + assert!(!res.truncated); + assert!(!res.timed_out); +} + +#[test] +fn parallel_glob_truncates_at_max_results_with_tolerance() { + let tmp = tempfile::tempdir().unwrap(); + for i in 0..50 { + std::fs::write(tmp.path().join(format!("f{i:02}.rs")), "").unwrap(); + } + let limits = ResourceLimits { + max_glob_results: 10, + ..ResourceLimits::default() + }; + + let res = glob_search(&glob_opts("**/*.rs"), tmp.path(), &limits).unwrap(); + + assert!(res.truncated); + assert!(!res.timed_out); + assert!(res.entries.len() >= 10); + assert!(res.entries.len() < 50); +} + +#[cfg(unix)] +#[test] +fn glob_does_not_follow_symlinks() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path().join("root"); + let outside = tmp.path().join("outside"); + std::fs::create_dir_all(&root).unwrap(); + std::fs::create_dir_all(&outside).unwrap(); + std::fs::write(root.join("real.rs"), "").unwrap(); + std::fs::write(outside.join("secret.rs"), "").unwrap(); + std::os::unix::fs::symlink(&outside, root.join("link")).unwrap(); + + let res = glob_search(&glob_opts("**/*.rs"), &root, 
&ResourceLimits::default()).unwrap(); + + let paths: Vec<&str> = res.entries.iter().map(|e| e.path.as_str()).collect(); + assert!(paths.iter().any(|p| p.ends_with("real.rs"))); + assert!(!paths.iter().any(|p| p.ends_with("secret.rs"))); +} diff --git a/crates/loopal-backend/tests/suite/search_timeout_test.rs b/crates/loopal-backend/tests/suite/search_timeout_test.rs new file mode 100644 index 00000000..f710ce88 --- /dev/null +++ b/crates/loopal-backend/tests/suite/search_timeout_test.rs @@ -0,0 +1,118 @@ +use std::time::Duration; + +use loopal_backend::ResourceLimits; +use loopal_backend::search::{glob_search, glob_search_async, grep_search, grep_search_async}; +use loopal_tool_api::backend_types::{GlobOptions, GrepOptions}; + +fn tmp_with_files() -> tempfile::TempDir { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("a.rs"), "fn alpha() {}").unwrap(); + std::fs::write(tmp.path().join("b.rs"), "fn beta() {}").unwrap(); + let sub = tmp.path().join("sub"); + std::fs::create_dir_all(&sub).unwrap(); + std::fs::write(sub.join("c.rs"), "fn gamma() {}").unwrap(); + tmp +} + +fn limits_with_walk(walk: Duration) -> ResourceLimits { + ResourceLimits { + walk_timeout: walk, + ..ResourceLimits::default() + } +} + +fn glob_opts(pattern: &str) -> GlobOptions { + GlobOptions { + pattern: pattern.to_string(), + path: None, + type_filter: None, + max_results: 10_000, + } +} + +fn grep_opts(pattern: &str) -> GrepOptions { + GrepOptions { + pattern: pattern.to_string(), + path: None, + glob_filter: None, + case_insensitive: false, + multiline: false, + fixed_strings: false, + context_before: 0, + context_after: 0, + type_filter: None, + max_matches: 500, + } +} + +#[test] +fn glob_search_times_out_with_zero_walk_budget() { + let tmp = tmp_with_files(); + let res = glob_search( + &glob_opts("**/*.rs"), + tmp.path(), + &limits_with_walk(Duration::ZERO), + ) + .unwrap(); + assert!(res.timed_out); + assert!(!res.truncated); +} + +#[test] +fn 
glob_search_completes_within_normal_budget() { + let tmp = tmp_with_files(); + let res = glob_search( + &glob_opts("**/*.rs"), + tmp.path(), + &ResourceLimits::default(), + ) + .unwrap(); + assert!(!res.timed_out); + assert_eq!(res.entries.len(), 3); +} + +#[test] +fn grep_search_times_out_with_zero_walk_budget() { + let tmp = tmp_with_files(); + let res = grep_search( + &grep_opts("fn"), + tmp.path(), + &limits_with_walk(Duration::ZERO), + ) + .unwrap(); + assert!(res.timed_out); +} + +#[test] +fn grep_search_completes_within_normal_budget() { + let tmp = tmp_with_files(); + let res = grep_search(&grep_opts("alpha"), tmp.path(), &ResourceLimits::default()).unwrap(); + assert!(!res.timed_out); + assert_eq!(res.total_match_count, 1); +} + +#[tokio::test] +async fn glob_search_async_returns_partial_before_hard_backstop() { + let tmp = tmp_with_files(); + let res = glob_search_async( + glob_opts("**/*.rs"), + tmp.path().to_path_buf(), + limits_with_walk(Duration::ZERO), + ) + .await + .unwrap(); + assert!(res.timed_out); +} + +#[tokio::test] +async fn grep_search_async_returns_partial_before_hard_backstop() { + let tmp = tmp_with_files(); + let res = grep_search_async( + grep_opts("fn"), + tmp.path().to_path_buf(), + limits_with_walk(Duration::ZERO), + ) + .await + .unwrap(); + assert!(res.timed_out); +} diff --git a/crates/loopal-runtime/src/agent_loop/streaming_tool_exec.rs b/crates/loopal-runtime/src/agent_loop/streaming_tool_exec.rs index efd47875..a7a20848 100644 --- a/crates/loopal-runtime/src/agent_loop/streaming_tool_exec.rs +++ b/crates/loopal-runtime/src/agent_loop/streaming_tool_exec.rs @@ -16,7 +16,6 @@ use tracing::{debug, info}; use crate::frontend::traits::EventEmitter; use crate::mode::AgentMode; -use crate::tool_pipeline::execute_tool; #[derive(Debug, Clone)] pub struct ToolUseArrived { @@ -111,7 +110,9 @@ pub fn feed_tool( .join_set .spawn(loopal_protocol::event_id::propagate_to_spawn(async move { let tool_start = Instant::now(); - let result = 
execute_tool(&kernel, &name, input, &tool_ctx, &mode).await; + let result = + super::tool_exec::execute_tool_watchdogged(&kernel, &name, input, &tool_ctx, &mode) + .await; let tool_duration = tool_start.elapsed(); let (block, event) = match result { diff --git a/crates/loopal-runtime/src/agent_loop/tool_exec.rs b/crates/loopal-runtime/src/agent_loop/tool_exec.rs index 7b686d40..32f8b8c4 100644 --- a/crates/loopal-runtime/src/agent_loop/tool_exec.rs +++ b/crates/loopal-runtime/src/agent_loop/tool_exec.rs @@ -4,7 +4,8 @@ use std::time::Instant; use loopal_kernel::Kernel; use loopal_protocol::AgentEventPayload; use loopal_provider_api::ContentBlock; -use loopal_tool_api::{OutputTail, ToolContext}; +use loopal_tool_api::{OutputTail, ToolContext, ToolResult}; +use serde_json::Value; use tracing::{Instrument, info}; use crate::frontend::traits::AgentFrontend; @@ -15,6 +16,26 @@ use super::cancel::TurnCancel; use super::tool_collect::collect_results; use super::tool_progress::maybe_spawn_progress; +/// Single convergence point for tool execution: applies the per-tool watchdog +/// deadline. Both the normal pipeline and the streaming early-start path +/// (`feed_tool`) route through here, so a tool can never run unbounded on one +/// path while bounded on the other. +pub(crate) async fn execute_tool_watchdogged( + kernel: &Kernel, + name: &str, + input: Value, + ctx: &ToolContext, + mode: &AgentMode, +) -> loopal_error::Result { + let Some(deadline) = super::tool_watchdog::watchdog_deadline(name, &input) else { + return execute_tool(kernel, name, input, ctx, mode).await; + }; + match tokio::time::timeout(deadline, execute_tool(kernel, name, input, ctx, mode)).await { + Ok(r) => r, + Err(_) => Ok(super::tool_watchdog::timeout_result(deadline)), + } +} + /// Execute approved tools in parallel via JoinSet, with cancellation support. /// /// Each tool runs concurrently; results are collected and sorted by original index. 
@@ -62,16 +83,8 @@ pub async fn execute_approved_tools( maybe_spawn_progress(&name, &input, id.clone(), progress_emitter, tail); let tool_start = Instant::now(); - let watchdog_deadline = super::tool_watchdog::watchdog_deadline(&name, &input); - let result = if let Some(deadline) = watchdog_deadline { - let exec = execute_tool(&kernel, &name, input, &tool_ctx, &mode); - match tokio::time::timeout(deadline, exec).await { - Ok(r) => r, - Err(_) => Ok(super::tool_watchdog::timeout_result(deadline)), - } - } else { - execute_tool(&kernel, &name, input, &tool_ctx, &mode).await - }; + let result = + execute_tool_watchdogged(&kernel, &name, input, &tool_ctx, &mode).await; let tool_duration = tool_start.elapsed(); if let Some(h) = progress { diff --git a/crates/loopal-runtime/src/agent_loop/tool_watchdog.rs b/crates/loopal-runtime/src/agent_loop/tool_watchdog.rs index 00db1a6f..8dca1416 100644 --- a/crates/loopal-runtime/src/agent_loop/tool_watchdog.rs +++ b/crates/loopal-runtime/src/agent_loop/tool_watchdog.rs @@ -6,21 +6,29 @@ use serde_json::Value; const GRACE: Duration = Duration::from_secs(30); const BASH_MAX_TIMEOUT: Duration = Duration::from_secs(600); +const FS_READ_TIMEOUT: Duration = Duration::from_secs(60); pub fn watchdog_deadline(tool_name: &str, input: &Value) -> Option { - if tool_name != "Bash" { - return None; + match tool_name { + "Bash" => { + let timeout = + TimeoutSecs::from_tool_input(input, 300).to_duration_clamped(BASH_MAX_TIMEOUT); + Some(timeout + GRACE) + } + // Read-only filesystem tools can hang on a dead/slow mount; bound them so + // a stuck syscall cannot wedge the agent loop (glob/grep also self-bound + // cooperatively via ResourceLimits::walk_timeout, this is the hard floor). 
+ "Glob" | "Grep" | "Ls" | "Read" | "ReadPdf" | "ReadImage" | "ReadHtml" => { + Some(FS_READ_TIMEOUT) + } + _ => None, } - let timeout = TimeoutSecs::from_tool_input(input, 300).to_duration_clamped(BASH_MAX_TIMEOUT); - Some(timeout + GRACE) } pub fn timeout_result(deadline: Duration) -> ToolResult { let secs = deadline.as_secs(); ToolResult { - content: format!( - "Watchdog timeout: tool did not return within {secs}s (timeout + 30s grace)", - ), + content: format!("Watchdog timeout: tool did not return within {secs}s"), images: Vec::new(), is_error: true, metadata: Some(ToolResultMetadata::stale(StaleReason::WatchdogTimeout)), @@ -33,8 +41,14 @@ mod tests { use serde_json::json; #[test] - fn watchdog_only_for_bash() { - assert!(watchdog_deadline("Read", &json!({})).is_none()); + fn watchdog_covers_bash_and_fs_read_tools() { + let want = Some(Duration::from_secs(60)); + for t in ["Glob", "Grep", "Ls", "Read"] { + assert_eq!(watchdog_deadline(t, &json!({})), want, "tool {t}"); + } + for t in ["ReadPdf", "ReadImage", "ReadHtml"] { + assert_eq!(watchdog_deadline(t, &json!({})), want, "tool {t}"); + } assert!(watchdog_deadline("Write", &json!({})).is_none()); assert!(watchdog_deadline("Edit", &json!({})).is_none()); assert!(watchdog_deadline("Agent", &json!({})).is_none()); diff --git a/crates/loopal-tool-api/src/backend_types.rs b/crates/loopal-tool-api/src/backend_types.rs index 0df36ef1..c8826297 100644 --- a/crates/loopal-tool-api/src/backend_types.rs +++ b/crates/loopal-tool-api/src/backend_types.rs @@ -81,6 +81,7 @@ pub struct GlobOptions { pub struct GlobSearchResult { pub entries: Vec, pub truncated: bool, + pub timed_out: bool, pub overflow_path: Option, } @@ -108,6 +109,7 @@ pub struct GrepOptions { pub struct GrepSearchResult { pub file_matches: Vec, pub total_match_count: usize, + pub timed_out: bool, pub overflow_path: Option, } diff --git a/crates/loopal-tool-api/src/lib.rs b/crates/loopal-tool-api/src/lib.rs index e69c4b7a..b8a9fdde 100644 --- 
a/crates/loopal-tool-api/src/lib.rs +++ b/crates/loopal-tool-api/src/lib.rs @@ -38,9 +38,9 @@ pub use stderr_buf::{STDERR_CAP_BYTES, StderrCappedBuffer}; pub use tool::{Tool, ToolDefinition, ToolDispatch, ToolResult}; pub use tool_context::ToolContext; pub use truncate::{ - DEFAULT_MAX_OUTPUT_BYTES, DEFAULT_MAX_OUTPUT_LINES, OverflowResult, extract_overflow_path, - handle_overflow, humanize_size, needs_truncation, save_to_overflow_file, truncate_output, - truncate_tail, + DEFAULT_MAX_OUTPUT_BYTES, DEFAULT_MAX_OUTPUT_LINES, OverflowResult, SEARCH_TIMEOUT_NOTICE, + extract_overflow_path, handle_overflow, humanize_size, needs_truncation, save_to_overflow_file, + truncate_output, truncate_tail, }; pub use truncate_middle::truncate_middle; pub use typed_bridge::TypedBridge; diff --git a/crates/loopal-tool-api/src/truncate.rs b/crates/loopal-tool-api/src/truncate.rs index dfde04f0..849616f6 100644 --- a/crates/loopal-tool-api/src/truncate.rs +++ b/crates/loopal-tool-api/src/truncate.rs @@ -1,6 +1,10 @@ pub const DEFAULT_MAX_OUTPUT_LINES: usize = 2_000; pub const DEFAULT_MAX_OUTPUT_BYTES: usize = 512_000; +pub const SEARCH_TIMEOUT_NOTICE: &str = "\n\n⚠️ Search timed out before scanning the whole tree — \ +results are incomplete. 
Narrow `path` to a specific project subdirectory and retry \ +(a broad `path` over a network mount or huge tree cannot finish in time)."; + pub fn truncate_output(output: &str, max_lines: usize, max_bytes: usize) -> String { if output.is_empty() { return String::new(); diff --git a/crates/tools/filesystem/glob/src/lib.rs b/crates/tools/filesystem/glob/src/lib.rs index 0fa7ffb8..a967af7e 100644 --- a/crates/tools/filesystem/glob/src/lib.rs +++ b/crates/tools/filesystem/glob/src/lib.rs @@ -1,6 +1,8 @@ use async_trait::async_trait; use loopal_error::LoopalError; -use loopal_tool_api::{GlobOptions, PermissionLevel, ToolContext, ToolResult, TypedTool}; +use loopal_tool_api::{ + GlobOptions, PermissionLevel, SEARCH_TIMEOUT_NOTICE, ToolContext, ToolResult, TypedTool, +}; use schemars::JsonSchema; use serde::Deserialize; @@ -72,11 +74,21 @@ impl TypedTool for GlobTool { LoopalError::Tool(loopal_error::ToolError::ExecutionFailed(e.to_string())) })?; + let timed_out = result.timed_out; let mut entries = result.entries; - entries.sort_by(|a, b| b.modified_secs.cmp(&a.modified_secs)); + entries.sort_by(|a, b| { + b.modified_secs + .cmp(&a.modified_secs) + .then_with(|| a.path.cmp(&b.path)) + }); let total_found = entries.len(); if total_found == 0 { + if timed_out { + return Ok(ToolResult::success(format!( + "No files matched before the search timed out.{SEARCH_TIMEOUT_NOTICE}" + ))); + } return Ok(ToolResult::success("No files matched the pattern.")); } @@ -98,6 +110,9 @@ impl TypedTool for GlobTool { if page_end < total_found { output.push_str(&format!("\n\n(Use offset={page_end} to see more.)")); } + if timed_out { + output.push_str(SEARCH_TIMEOUT_NOTICE); + } Ok(ToolResult::success(output)) } diff --git a/crates/tools/filesystem/glob/tests/suite/glob_tool_edge_test.rs b/crates/tools/filesystem/glob/tests/suite/glob_tool_edge_test.rs index 8309339f..89aca39a 100644 --- a/crates/tools/filesystem/glob/tests/suite/glob_tool_edge_test.rs +++ 
b/crates/tools/filesystem/glob/tests/suite/glob_tool_edge_test.rs @@ -84,3 +84,52 @@ async fn test_glob_default_limit_is_100() { assert!(result.content.contains("Found 105 files. Showing 1-100:")); assert!(result.content.contains("Use offset=100")); } + +fn make_ctx_zero_walk(cwd: &std::path::Path) -> ToolContext { + let limits = loopal_backend::ResourceLimits { + walk_timeout: std::time::Duration::ZERO, + ..Default::default() + }; + let backend = + loopal_backend::LocalBackend::new(cwd.to_path_buf(), None, limits, "test-session"); + ToolContext::new(backend, "test") +} + +#[tokio::test] +async fn test_glob_timeout_surfaces_notice() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("foo.rs"), "fn main() {}").unwrap(); + + let tool = make_tool(); + let ctx = make_ctx_zero_walk(tmp.path()); + + let result = tool + .execute(json!({"pattern": "**/*.rs"}), &ctx) + .await + .unwrap(); + + assert!(!result.is_error); + assert!(result.content.contains("timed out")); +} + +#[tokio::test] +async fn test_glob_output_order_is_deterministic() { + let tmp = tempfile::tempdir().unwrap(); + for i in 0..20 { + std::fs::write(tmp.path().join(format!("f{i:02}.rs")), "").unwrap(); + } + + let tool = make_tool(); + let ctx = make_ctx(tmp.path()); + + let r1 = tool + .execute(json!({"pattern": "*.rs"}), &ctx) + .await + .unwrap(); + let r2 = tool + .execute(json!({"pattern": "*.rs"}), &ctx) + .await + .unwrap(); + + assert_eq!(r1.content, r2.content); +} diff --git a/crates/tools/filesystem/grep/src/grep_format.rs b/crates/tools/filesystem/grep/src/grep_format.rs index 82202ae6..e1661d08 100644 --- a/crates/tools/filesystem/grep/src/grep_format.rs +++ b/crates/tools/filesystem/grep/src/grep_format.rs @@ -1,6 +1,7 @@ use std::fmt::Write; use loopal_error::LoopalError; +use loopal_tool_api::SEARCH_TIMEOUT_NOTICE; use loopal_tool_api::backend_types::{GrepSearchResult, MatchLine}; use crate::grep_format_summary::{format_count, format_files}; @@ -51,6 +52,9 @@ pub 
fn format_results( fmt_opts: &FormatOptions, ) -> String { if results.file_matches.is_empty() { + if results.timed_out { + return format!("No matches found before the search timed out.{SEARCH_TIMEOUT_NOTICE}"); + } return "No matches found.".to_string(); } @@ -67,6 +71,9 @@ pub fn format_results( ) .unwrap(); } + if results.timed_out { + output.push_str(SEARCH_TIMEOUT_NOTICE); + } output } diff --git a/crates/tools/filesystem/grep/tests/suite.rs b/crates/tools/filesystem/grep/tests/suite.rs index b7282903..b3d35bb4 100644 --- a/crates/tools/filesystem/grep/tests/suite.rs +++ b/crates/tools/filesystem/grep/tests/suite.rs @@ -5,6 +5,8 @@ mod grep_context_test; mod grep_fixed_strings_test; #[path = "suite/grep_options_test.rs"] mod grep_options_test; +#[path = "suite/grep_timeout_test.rs"] +mod grep_timeout_test; #[path = "suite/grep_tool_edge_test.rs"] mod grep_tool_edge_test; #[path = "suite/grep_tool_test.rs"] diff --git a/crates/tools/filesystem/grep/tests/suite/grep_timeout_test.rs b/crates/tools/filesystem/grep/tests/suite/grep_timeout_test.rs new file mode 100644 index 00000000..07cf1492 --- /dev/null +++ b/crates/tools/filesystem/grep/tests/suite/grep_timeout_test.rs @@ -0,0 +1,62 @@ +use loopal_tool_api::backend_types::{FileMatchResult, GrepSearchResult, MatchGroup, MatchLine}; +use loopal_tool_grep::grep_format::{FormatOptions, OutputMode, format_results}; + +fn timed_out_empty() -> GrepSearchResult { + GrepSearchResult { + file_matches: vec![], + total_match_count: 0, + timed_out: true, + overflow_path: None, + } +} + +fn timed_out_with_match() -> GrepSearchResult { + GrepSearchResult { + file_matches: vec![FileMatchResult { + path: "x.rs".to_string(), + groups: vec![MatchGroup { + lines: vec![MatchLine { + line_num: 1, + content: "fn main() {}".to_string(), + is_match: true, + }], + }], + }], + total_match_count: 1, + timed_out: true, + overflow_path: None, + } +} + +#[test] +fn timeout_notice_replaces_no_matches_message() { + let out = format_results( + 
&timed_out_empty(), + OutputMode::FilesWithMatches, + 50, + 500, + &FormatOptions::default(), + ); + assert!(out.contains("timed out")); +} + +#[test] +fn timeout_notice_appended_after_matches() { + let out = format_results( + &timed_out_with_match(), + OutputMode::Content, + 50, + 500, + &FormatOptions::default(), + ); + assert!(out.contains("x.rs")); + assert!(out.contains("timed out")); +} + +#[test] +fn no_timeout_notice_when_not_timed_out() { + let mut r = timed_out_with_match(); + r.timed_out = false; + let out = format_results(&r, OutputMode::Content, 50, 500, &FormatOptions::default()); + assert!(!out.contains("timed out")); +} diff --git a/design/glob-traversal-hang/0001-root-cause-analysis.md b/design/glob-traversal-hang/0001-root-cause-analysis.md new file mode 100644 index 00000000..c70b75df --- /dev/null +++ b/design/glob-traversal-hang/0001-root-cause-analysis.md @@ -0,0 +1,261 @@ +# Glob Traversal Hang — Root Cause Analysis + +状态:P0 遍历超时已实现。P1 glob 并行 walker + `follow_links(false)` 已实现,见 [0002](0002-parallel-glob-walker.md)。⚠️ 本文 §5.3 的"后端外层硬兜底"已被一次 code review **取代**——外层超时合并到 runtime 通用 watchdog(并覆盖 Ls/Read),详见 [0003](0003-post-review-consolidation.md)。P2 旁路加固未做。 + +| | | +|---|---| +| **故障会话** | `fa027124-a400-4a24-9299-3684945b83f2`(cwd `/Users/stone/Works/AgentsMesh/GTMApps`,model gpt-5.5)| +| **挂死时间** | `2026-06-24T09:36:55Z`(本地 17:36)| +| **现象** | 会话停止响应,无报错、无崩溃;agent loop 永久 `await` 一个不返回的工具结果 | +| **定性** | **不是崩溃**,是 Glob 工具在网络挂载点上做*无超时 + 单线程 + 0 命中*的穷举遍历,堵死 `spawn_blocking` 线程 | +| **触发器** | 单次 `Glob` 调用把 `path` 指到 `/Users/stone/Works`(覆盖了一个 rclone/OSS 网络挂载点)| + +--- + +## 1. 摘要 + +一次 `Glob { path: "/Users/stone/Works", pattern: "**/*YoloShell*" }` 调用: + +1. `path` 逃逸出项目目录,扩到整棵 `Works` 树; +2. `Works` 下挂着一个 rclone→阿里云 OSS 的 NFS 网络盘; +3. 模式 `**/*YoloShell*` **零命中**,于是遍历**永不触发提前退出**,被迫穷举整棵树(含网络盘); +4. 
Glob 的遍历**单线程、无 deadline、`follow_links(true)`**,runtime 也**没有 per-tool 超时**。 + +四者叠加 → `spawn_blocking` 线程在网络盘上无限期遍历 → agent loop 永远拿不到结果 → 会话静默假死。 + +--- + +## 2. 故障现场(日志证据) + +会话事件流 `turns.jsonl` 的**最后几行**,每个 `ToolBatch` 此前都在毫秒级收到 `StepUpdated → Done`,唯独最后一批没有: + +``` +422 09:35:35.211Z StepUpdated item_index=0 -> Done +423 09:35:35.214Z StepUpdated item_index=1 -> Done +424 09:35:35.220Z StepUpdated item_index=2 -> Done +... +431 09:36:55.618Z StepAppended LlmCall tools=['Glob','Grep'] +432 09:36:55.625Z StepAppended ToolBatch [ ('Glob','Pending'), ('Grep','Pending') ] + +``` + +第 431 行的 LLM 文本与调用参数: + +```jsonc +// LLM 自述: +"我缺 YoloShell 的 App Store 链接,先从本地项目/素材里查。" + +// 卡死的调用: +Glob { "path": "/Users/stone/Works", "pattern": "**/*YoloShell*", "limit": 80 } +Grep { "path": "/Users/stone/Works", "glob": "**/*.{json,md,txt,js,swift,plist}", ... } +``` + +网络挂载证据(`mount` 输出 + 挂载脚本): + +``` +localhost:/AppMaterials on /Users/stone/Works/AppsMeterial (nfs, nodev, nosuid, mounted by stone) +``` +```bash +# mount-appsmeterial.sh —— 把阿里云 OSS bucket 挂成 NFS +rclone nfsmount appmaterials:appmeterials "$HOME/Works/AppsMeterial" \ + --vfs-cache-mode full --vfs-cache-max-size 50G ... +``` + +> 对照:同一会话**更早**的 Glob `path=".../GTMApps"`(限定在项目内)秒回;唯独这次 `path="/Users/stone/Works"`(覆盖 OSS 挂载)挂死。差别只在"遍历范围是否覆盖到网络盘"。 + +--- + +## 3. 
调用链与根因代码 + +### 调用链 + +``` +LLM tool_use(Glob) + └─ GlobTool::execute crates/tools/filesystem/glob/src/lib.rs + └─ ctx.backend.glob() crates/loopal-backend/src/local_backend_impl.rs:57 + └─ glob_search_async crates/loopal-backend/src/search/mod.rs ← spawn_blocking,无 timeout + └─ glob_search crates/loopal-backend/src/search/glob.rs ← 单线程穷举,0 命中不退出 + └─ build_walker crates/.../search/walker.rs ← follow_links(true) +``` + +### 因素 ①:`path` 逃逸出项目,且绝对读路径不做包含检查 + +`crates/tools/filesystem/glob/src/lib.rs`: + +```rust +let search_path = match input.path.as_deref() { + // resolve_path 的第二个参数 false = 不做 cwd 包含检查(读路径直通) + Some(p) => Some(ctx.backend.resolve_path(p, false).map_err(/* ... */)?), + None => None, +}; +// ... +let result = ctx.backend.glob(&opts).await /* ← 整个 agent loop 在这里永久挂起 */ ?; +``` + +会话 cwd 是 `.../GTMApps`,但 LLM 传入的 `path` 是父目录 `/Users/stone/Works`。读路径不做 cwd 包含检查,遍历范围合法地扩大到整棵 `Works` 树——其中包含 `AppsMeterial` 这个 OSS 网络挂载点。 + +### 因素 ②③:单线程穷举遍历 + 0 命中永不提前退出 + +`crates/loopal-backend/src/search/glob.rs`(修复前): + +```rust +let max = opts.max_results.min(limits.max_glob_results); // = 10_000 +// ... +for entry in walker.build().flatten() { // ← 单线程迭代器(对比 grep 用 build_parallel) + if !entry.file_type().is_some_and(|ft| ft.is_file()) { continue; } + let rel = match entry.path().strip_prefix(&search_path) { Ok(r) => r, Err(_) => continue }; + if !matcher.is_match(rel) { continue; } + // ... + entries.push(GlobEntry { /* ... 
*/ }); + if entries.len() >= max { break; } // ← 唯一的提前退出:命中数达到 10_000 +} +``` + +**关键缺陷**:唯一的提前退出条件是"命中数达到 `max`"。而 `**/*YoloShell*` 在整棵 `Works` 树里**零命中**,`entries.len()` 恒为 0,永远 `break` 不了 → walker 必须**穷举遍历完每一个文件**才能返回空结果。这个穷举过程要钻进 OSS 网络盘,每次 `readdir`/`metadata` 都是网络往返。 + +### 因素 ④a:遍历跟随符号链接 + +`crates/loopal-backend/src/search/walker.rs`: + +```rust +let mut builder = WalkBuilder::new(search_path); +builder.follow_links(true); // ← 跟随符号链接,跨网络盘/iCloud 时可触发下载、跨设备、环路 +``` + +> 注:`ignore` 的 `.gitignore` 过滤在此**不生效**——`/Users/stone/Works` 本身不是 git 仓库,gitignore 规则只在各子仓库内部应用,挡不住对网络盘的下钻。 + +### 因素 ④b:遍历层无超时,runtime 也无 per-tool 超时 + +`crates/loopal-backend/src/search/mod.rs`(修复前): + +```rust +pub async fn glob_search_async(opts, cwd, limits) -> Result { + tokio::task::spawn_blocking(move || glob_search(&opts, &cwd, &limits)) // ← 无 timeout 包裹 + .await + .map_err(|e| ToolIoError::Other(e.to_string()))? +} +``` + +`crates/loopal-backend/src/limits.rs` 的 `default_timeout`(300s)只作用于 shell `exec`,glob/grep 遍历不受其约束;runtime 工具管线对 tool execute 也**没有任何 per-tool 超时**(仓库内仅有 LLM 重试退避与 compaction 的超时)。因此阻塞线程卡在网络盘 syscall 上时,**没有任何机制能打断它**,agent loop 只能无限期 `await`。 + +### 对照组:grep 为何相对不易挂(但同样有隐患) + +`crates/loopal-backend/src/search/grep.rs` 用的是**并行 walker**: + +```rust +w.build_parallel().run(|| { /* 每线程 visitor */ }); +``` + +并在 `search_one_file` 命中 `max_grep_matches`(500)时设 `done`,visitor 顶部 `if done.load() { return WalkState::Quit; }` 早停。grep 多线程 + 命中上限更低,"卡死"概率比 glob 小。但 grep **修复前同样没有遍历超时**,在零命中 + 网络盘场景下一样会长时间穷举——只是这次 ToolBatch 是两者并行、整批等最慢的 Glob,所以表面看是 Glob 挂死。 + +--- + +## 4. 影响面 + +- **严重度**:高。任何一次把 `path` 指向包含慢 IO(OSS/iCloud/坏 NFS/超大目录)的目录的 Glob/Grep,都能让**整个会话静默假死**,且日志只留下一个 `Pending` 的 ToolBatch,排查成本极高。 +- **不限于网络盘**:`/Users/stone/Works` 级别的超大本地目录树 + 零命中模式,也会造成数十秒到数分钟级的卡顿。 +- **后台线程泄漏**:外层 `tokio::time::timeout` 只能让 agent loop 不再 `await`;`spawn_blocking` 的阻塞线程**不会被 tokio 取消**,会继续在网络盘上空跑——因此还需要遍历内部的协作式停止。 + +--- + +## 5. 
已实施的修复(P0 遍历超时) + +主方案为"遍历超时",采用**双层**设计,缺一不可:协作式 deadline 负责"停掉后台线程并带回部分结果",外层 `timeout` 负责"无论如何解放 agent loop"。 + +### 5.1 `ResourceLimits` 新增 `walk_timeout` + +`crates/loopal-backend/src/limits.rs`:默认 `Duration::from_secs(30)`。所有既有 `ResourceLimits { .. }` 字面量都用 `..Default::default()`,故无需改动任何调用点。 + +### 5.2 遍历内协作式 deadline + +`glob.rs`(单线程循环顶部): + +```rust +let deadline = Instant::now() + limits.walk_timeout; +// ... +for entry in walker.build().flatten() { + if Instant::now() >= deadline { + timed_out = true; + truncated = true; + break; // 带着已收集的部分结果返回 + } + // ... +} +``` + +`grep.rs`(并行 visitor 顶部,复用已有 `done` 早停旗标): + +```rust +let deadline = Instant::now() + limits.walk_timeout; +// ... +Box::new(move |entry| { + if done.load(Ordering::Relaxed) { return WalkState::Quit; } + if Instant::now() >= deadline { + done.store(true, Ordering::Relaxed); + timed_out.store(true, Ordering::Relaxed); + return WalkState::Quit; + } + // ... +}) +``` + +> 协作式检查只在两次 entry 之间生效,无法打断正卡在单个 `readdir`/`metadata` syscall 里的线程——那种极端情况由 5.3 的外层硬兜底兜住。 + +### 5.3 外层硬兜底 `tokio::time::timeout` + +`crates/loopal-backend/src/search/mod.rs`: + +```rust +const WALK_TIMEOUT_GRACE: Duration = Duration::from_secs(10); + +pub async fn glob_search_async(opts, cwd, limits) -> Result { + let hard = limits.walk_timeout.saturating_add(WALK_TIMEOUT_GRACE); + let join = tokio::task::spawn_blocking(move || glob_search(&opts, &cwd, &limits)); + match tokio::time::timeout(hard, join).await { + Ok(joined) => joined.map_err(|e| ToolIoError::Other(e.to_string()))?, + Err(_) => Err(ToolIoError::Timeout(hard)), // ← agent loop 一定能在 hard 内解放 + } +} +``` + +正常情况下协作式 deadline(30s)先返回部分结果,外层(40s)只在内层卡死 syscall 时才触发并返回 `Timeout`。`grep_search_async` 同构。 + +### 5.4 向 LLM 暴露超时信号 + +`GlobSearchResult` / `GrepSearchResult` 各加 `timed_out: bool`。工具层据此输出明确提示(`glob/src/lib.rs`、`grep/src/grep_format.rs`),让 LLM 知道"结果不完整、请缩小 `path`",避免对一个超时返回的空结果误判为"确实没有": + +``` +⚠️ Search timed out before scanning the whole tree — results are 
incomplete. + Narrow `path` to a specific project subdirectory and retry ... +``` + +### 5.5 测试 + +- `crates/loopal-backend/tests/suite/search_timeout_test.rs`:`walk_timeout=0` 时 `glob_search`/`grep_search`/`*_async` 均返回 `timed_out=true`;默认预算时正常完成且命中正确。 +- `glob_tool_edge_test.rs`:零预算 backend 下工具输出含 "timed out"。 +- `grep_timeout_test.rs`:直接单测 `format_results` 的超时提示(空结果替换、有结果追加、未超时不追加)。 + +全量 `bazel build //...`、受影响 `bazel test`、`--config=clippy`、`--config=rustfmt` 均通过。 + +--- + +## 6. 未做项(待后续排期) + +- **P1 — glob 改并行 walker + `follow_links(false)`**:已按独立方案 [0002](0002-parallel-glob-walker.md) 实现。提速且消除符号链接环路,但非"防挂死"必需。 +- **P2 — 旁路加固**:walker 加 `same_file_system(true)` 不跨越文件系统边界进入 NFS/OSS 挂载;对读路径做"逃逸出 cwd 过多层"的软上限/提示。 + +--- + +## 附:涉及文件清单 + +| 文件 | 角色 / 改动 | +|---|---| +| `crates/loopal-tool-api/src/backend_types.rs` | `GlobSearchResult`/`GrepSearchResult` 加 `timed_out` | +| `crates/loopal-backend/src/limits.rs` | `ResourceLimits` 加 `walk_timeout`(默认 30s)| +| `crates/loopal-backend/src/search/glob.rs` | 单线程循环内协作式 deadline;构造点补 `timed_out` | +| `crates/loopal-backend/src/search/grep.rs` | 并行 visitor 内协作式 deadline;构造点补 `timed_out` | +| `crates/loopal-backend/src/search/grep_file.rs` | `empty_result`/`search_single_file` 构造点补 `timed_out` | +| `crates/loopal-backend/src/search/mod.rs` | `*_search_async` 外层 `tokio::time::timeout` 硬兜底 | +| `crates/tools/filesystem/glob/src/lib.rs` | `timed_out` 输出提示 | +| `crates/tools/filesystem/grep/src/grep_format.rs` | `timed_out` 输出提示 | +| `crates/loopal-backend/src/search/walker.rs` | 现状 `follow_links(true)`(P1 待改)| diff --git a/design/glob-traversal-hang/0002-parallel-glob-walker.md b/design/glob-traversal-hang/0002-parallel-glob-walker.md new file mode 100644 index 00000000..a1254146 --- /dev/null +++ b/design/glob-traversal-hang/0002-parallel-glob-walker.md @@ -0,0 +1,218 @@ +# Glob Traversal Hang — Parallel Walker & Symlink Policy (P1) + +状态:已实现(仅 P1:glob 并行 walker + `follow_links(false)` + glob 排序补 `path` 二级键)。前置 P0 遍历超时见 
[0001](0001-root-cause-analysis.md)。P2 旁路加固(`same_file_system`、读路径逃逸软上限)不在本方案范围。`GlobMatcher: Send + Sync` 已由编译期确认,§3.1 注的 clone 退化路径未触发。 + +| | | +|---|---| +| **需求** | 把 glob 的目录遍历从单线程改为并行(对齐 grep),并把共享 walker 的 `follow_links` 默认改为 `false` | +| **动机** | 单线程穷举是放大故障的因素之一;`follow_links(true)` 偏离 ripgrep 默认,是符号链接逃逸/环路的隐患 | +| **范围** | 仅 backend 搜索层(`crates/loopal-backend/src/search/`)+ glob 工具排序(`glob/src/lib.rs`)。不改工具入参、不改 IPC、不抽公共 framework | +| **非目标** | 不改 grep 行为(已并行);不引入 `same_file_system`/路径逃逸上限(P2);不加配置开关(YAGNI) | + +--- + +## 1. 背景与目标 + +P0 已用「协作式 deadline + 外层硬超时」消除了"会话假死"这一**致命**问题([0001](0001-root-cause-analysis.md) §5)。P1 处理 [0001](0001-root-cause-analysis.md) §3 暴露的两个**结构性弱点**,它们不致命但长期有害: + +1. **glob 单线程穷举**(`search/glob.rs` 的 `for entry in walker.build().flatten()`)。grep 早已是 `build_parallel().run()`,glob 是仓库内唯一仍走单线程的 tree-walk。慢 IO 下单线程会被单个慢目录串行堵住,整体 wall-time = 所有目录耗时之和。 +2. **`build_walker` 的 `follow_links(true)`**(`search/walker.rs`,glob/grep 共用)。跟随符号链接会跨设备、触发 iCloud 下载、并可能产生遍历环路;且偏离了 ripgrep(Grep 工具自我描述的底座)的默认 `false`。 + +**目标** + +- glob 遍历并行化,wall-time 在大目录上随核数下降,且单个慢目录不再串行堵死全局。 +- glob 与 grep 的遍历骨架在风格上对齐(同一套 `done`/`timed_out`/`deadline` 纪律),降低维护心智。 +- `follow_links` 回到 `false`,与 ripgrep 默认一致,关闭符号链接逃逸/环路这一类隐患。 +- 保持对外行为等价:相同 `pattern`/`path` 下的结果集合不变(除"符号链接目标不再被遍历"这一明确语义变更)。 + +**显式非目标**:见上表。特别地,**不**把 glob/grep 的并行骨架抽成共享 runner(理由见 §3.3)。 + +--- + +## 2. 
现状基线(P0 之后) + +| 组件 | 现状 | +|---|---| +| `search/glob.rs` | 单线程 `walker.build().flatten()`;收集 `Vec<GlobEntry>`;`entries.len() >= max` 时 `break`;P0 在循环顶部加了 `Instant::now() >= deadline` 协作式停止 | +| `search/grep.rs` | **已并行**:`w.build_parallel().run(\|\| visitor)`;`Arc<AtomicUsize>`(命中数)、`Arc<AtomicBool> done`(早停)、`Arc<AtomicBool> timed_out`(超时)、`Arc<Mutex<Vec<FileMatchResult>>>`(收集);visitor 顶部检查 `done`/`deadline` | +| `search/walker.rs` | `WalkBuilder::new(path).follow_links(true)`,可选 type filter;**glob/grep 共用** | +| `glob/src/lib.rs` | 工具层对 backend 返回的 `entries` 按 `modified_secs` **降序排序**,再按 `offset/limit` 分页(backend 返回未排序) | +| 结果类型 | `GlobSearchResult { entries, truncated, timed_out, overflow_path }` | + +> 关键:glob 的最终顺序由**工具层的 mtime 排序**决定,不依赖遍历顺序——这是并行化能保持兼容的前提(详见 §4.2)。 + +--- + +## 3. 设计与备选取舍 + +### 3.1 glob 并行化(对齐 grep 骨架) + +把单线程循环替换为 `WalkParallel`,visitor 内完成"判文件 → 相对路径匹配 → 收集"。结构与 grep 一一对应: + +```text +deadline = now + walk_timeout +count : Arc<AtomicUsize> // 已收集条数 +done : Arc<AtomicBool> // 早停(命中 max 或超时) +timed_out: Arc<AtomicBool> // 早停原因 = 超时 +entries : Arc<Mutex<Vec<GlobEntry>>> +matcher : Arc<GlobMatcher> // 见 3.1 注 +search_path: Arc<PathBuf> + +build_parallel().run(per-thread): + visitor(entry): + if done -> Quit + if now >= deadline -> done=1; timed_out=1; Quit // 协作式超时(与 P0 同义) + if !is_file -> Continue + rel = strip_prefix(search_path); if no rel -> Continue + if !matcher.is_match(rel) -> Continue + push GlobEntry{ path, modified_secs } + if count.fetch_add(1)+1 >= max -> done=1; Quit // 命中上限早停 + +entries = Arc::try_unwrap(entries) // run() 已 join 所有线程 +truncated = done.load() // 超时或上限都算截断(语义同 P0 单线程版) +timed_out = timed_out.load() +``` + +**3.1 注 — matcher 的跨线程共享**:`globset::GlobMatcher` 为 `Send + Sync`,故用 `Arc<GlobMatcher>` 跨线程共享只读匹配(无需每线程克隆)。若编译期发现某版本不满足 `Sync`,退化为每线程 `matcher.clone()`(`GlobMatcher: Clone`,内部 Arc 化,克隆廉价)——与 grep 对 `regex::Regex` 的 `.clone()` 做法同源。此判定在实现时由编译器兜底,不是运行期风险。 + +**3.1 注 — `Arc::try_unwrap` 的前提**:`WalkParallel::run()` 返回前会 join 全部 worker 线程,所有 visitor 闭包已析构,`entries` 只剩外层一个强引用,`try_unwrap().unwrap()` 必然成功。此前提与 grep 现有代码一致。 + +### 3.2 `follow_links` 策略:改为 `false` + 
+`search/walker.rs` 的 `build_walker` 把 `follow_links(true)` 改为 `false`。 + +- **为什么是 `false`**:ripgrep 默认 `follow_links=false`(`-L` 才 opt-in)。Grep 工具描述自称"built on ripgrep",当前 `true` 实际是**偏离**底座默认。改 `false` 是**回归**预期,不是新增限制。 +- **可配置性决策**:**硬编码 `false`,不引入设置项**。理由:(a) 与 ripgrep 默认对齐后,"需要跟随符号链接搜索"是少数派需求;(b) YAGNI——在出现真实诉求前不加配置面(符合仓库 Principles)。若未来确有需要,再补 `search.follow_symlinks`(默认 false)的逃逸阀,届时是纯增量。 +- **作用边界(重要,避免误读)**:本变更关闭的是"经由**符号链接**进入外部树/网络盘/产生环路"这一类。它**修不了** [0001](0001-root-cause-analysis.md) 的原始事故——那里的 `AppsMeterial` 是一个**真实目录**(NFS 挂载点),遍历是直接下钻、不涉及符号链接。原始 vector 由 P0(超时)兜底、由 P2(`same_file_system`)根除。三者**互补**,详见 §4.5。 + +### 3.3 备选方案对比 + +| 方案 | 描述 | 取舍 | +|---|---|---| +| **A(采纳)** | glob 内联并行化 + `follow_links(false)`,glob/grep 各自独立但骨架对齐 | 提速、消除环路隐患、与 grep 风格统一;改动可控 | +| B | 仅保留 P0 单线程+超时,不并行化 | 已能防假死,但慢 IO 下仍串行堵塞、wall-time 高;放着唯一单线程 walk 是技术债 | +| C | 抽 `parallel_walk_with_deadline(visitor)` 公共 runner,glob/grep 共用 | **暂不做**:仅 2 个消费者,且 visitor 主体(文件名匹配 vs 读文件正则)差异大,共享部分只有 ~15 行样板;过早抽象违背既有偏好(tool 优化按 tool 单独做、不造横切 framework)。**触发条件**:出现第 3 个并行搜索消费者时再抽取 | +| D | `follow_links` 加配置开关 | YAGNI,见 §3.2 | + +--- + +## 4. 
正确性与一致性分析 + +### 4.1 `max` 早停在并行下是"近似"的 + +单线程版精确在 `entries.len() == max` 处 `break`。并行版中,多个线程可能在 `done` 传播前各自完成一次 `push`,最终 `entries.len()` 可能**轻微越过** `max`(最坏约 `max + (线程数 - 1)`)。 + +- **是否可接受**:可接受。`max = max_glob_results = 10_000`,越过几十条对工具层分页(`DEFAULT_LIMIT=100`/用户 `limit`)无影响;`truncated=true` 照常置位、overflow 文件照常落盘。 +- **与 grep 一致**:grep 的 `total_match_count` 早停同样是 `fetch_add` 后判断,本就允许轻微越界。glob 采用同一近似,是**一致性收敛**而非新引入的不确定。 + +### 4.2 结果顺序与确定性(本方案唯一需要主动补强的点) + +并行遍历的**收集顺序非确定**(线程交错)。但 glob 的对外顺序由**工具层 mtime 降序排序**决定,因此: + +- **不同 mtime 的文件**:顺序完全不受影响(由 mtime 决定)。 +- **相同 mtime 的文件**(同一秒 checkout/解压极常见):`sort_by` 稳定,但**输入顺序**变成非确定,于是平局项的相对顺序在多次运行间会**抖动**——当"同 mtime 文件数 > 分页 limit"时,首页落入哪些文件可能逐次不同。这是并行化引入的**真实(虽轻微)行为变化**。 + +**对策(纳入 P1)**:在 `glob/src/lib.rs` 的排序里加 `path` 作为**二级排序键**,把平局打破从"遍历顺序"改为"路径字典序",恢复跨运行确定性: + +```text +sort by (modified_secs desc, path asc) +``` + +成本一行、零风险,且让 glob 比 grep 更确定(grep 当前对 `file_matches` 不排序、本就非确定——见 §4.4 脚注)。 + +### 4.3 `truncated` / `timed_out` 语义保持 + +- `truncated = done.load()`:`done` 由"命中 max"或"超时"任一置位,与 P0 单线程版"两种情况都置 `truncated=true`"等价。 +- `timed_out` 独立标记"截断原因=超时",工具层据此输出"搜索超时、请缩小 path"提示([0001](0001-root-cause-analysis.md) §5.4)。语义不变。 +- overflow:`truncated` 为真时落盘,行为不变。 + +### 4.4 与 grep 的行为对齐 + +P1 后 glob 与 grep 共享同一套遍历纪律(`build_walker` → `build_parallel` → `done`/`timed_out`/`deadline` → `Arc` 收集),**对称且各自独立**。这降低维护心智,同时不引入共享抽象(§3.3-C)。 + +> 脚注:grep 目前对 `file_matches` **不做排序**,其文件顺序本就非确定。把 grep 也改成确定顺序属 P1 范围外的独立改进,本方案仅记录该观察,不在此处实施。 + +### 4.5 `follow_links=false` 与原始事故的关系(诚实说明) + +| 逃逸 vector | 经由 | P0 超时 | P1 follow_links=false | P2 same_file_system | +|---|---|---|---|---| +| 直接下钻真实挂载目录(**原始事故** `AppsMeterial`) | 真实目录 | ✅ 兜底(限时返回) | ❌ 无关 | ✅ 根除(不跨设备) | +| 经符号链接进入外部树/网络盘 | symlink | ✅ 兜底 | ✅ 关闭 | ✅ 关闭 | +| 符号链接环路 | symlink | ✅ 兜底 | ✅ 关闭 | — | + +结论:P1 的 `follow_links=false` 是**防御纵深**,覆盖与原始事故**相邻**的一类,与 P0/P2 互补;它**不**单独修复原始 vector,文档不夸大其作用。 + +--- + +## 5. 影响面 / 兼容 / 迁移 + +### 5.1 行为变化清单 + +1. 
**符号链接目标不再被遍历**(语义变更,glob+grep 同时生效)。仅经由符号链接可达的文件不再出现在结果中。 + - 风险评估:pnpm 等的 `node_modules` 符号链接通常被 `.gitignore`,walker 本就不进入,影响小;源码级 `src -> ../shared` 类符号链接较罕见但存在。 + - 缓解:与 ripgrep 默认一致,行为可预期;在 release note 注明。 +2. **glob 平局顺序改为路径字典序**(§4.2)。对"同 mtime 文件数 > limit"的场景,首页内容更稳定——属**改善**,但与并行化前的具体首页可能不同。 +3. **glob `truncated` 时 `entries` 数量可能轻微越过 `max`**(§4.1)。对外分页无感。 + +### 5.2 现有测试影响 + +- glob 现有用例(`glob_tool_test.rs`/`glob_tool_edge_test.rs`/`glob_type_filter_test.rs`)断言均为**集合包含**与**计数**(如 `contains("foo.rs")`、`Found 105 files. Showing 1-100`),不依赖具体平局顺序 → 预期继续通过。 +- P0 超时用例(`search_timeout_test.rs` + glob 工具超时用例):deadline 检查从"循环顶部"移到"visitor 顶部",语义不变 → 预期继续通过。 +- grep 全部用例:`build_walker` 改 `follow_links` 影响 grep,但现有 grep 用例不构造符号链接 → 预期不受影响。 + +### 5.3 兼容 / 迁移 + +- **无配置迁移**:不新增/不删除任何 settings 字段。 +- **无工具入参变更**:`Glob`/`Grep` 的 JSON schema 不变,LLM 侧无感。 +- **无 IPC/存储格式变更**。 +- **release note**:注明"搜索默认不再跟随符号链接(与 ripgrep 对齐)"。 + +--- + +## 6. 测试与性能验收 + +### 6.1 测试计划(目标覆盖率 ≥95% 改动代码) + +| 用例 | 验证点 | 备注 | +|---|---|---| +| 并行正确性 | 含多层嵌套子目录的树,`**/*.rs` 命中全部目标,与单线程结果**集合相等** | 顺序无关断言 | +| 符号链接不跟随 | 根内放一个指向外部文件/目录的 symlink,断言结果**不含**该目标 | `#[cfg(unix)]`,用 `std::os::unix::fs::symlink`;Windows 跳过 | +| 截断越界容忍 | `ResourceLimits{ max_glob_results: 10, .. 
}` + 50 个匹配文件,断言 `truncated==true` 且 `entries.len()` ∈ `[10, 10+N线程]` | 复用 P0 测试里的自定义 limits 手法 | +| 平局确定性 | 同时写入多个同 mtime 文件,**两次**调用结果顺序一致 | 验证 §4.2 二级 path 排序 | +| 超时仍生效 | `walk_timeout=0` 时 glob 返回 `timed_out=true` | P0 用例迁移确认不回归 | + +### 6.2 性能验收 + +- **基准方法**:选一个大目录(如 loopal 仓库自身或合成 N=10⁵ 文件树),对"零命中模式"(最坏穷举)与"高命中模式"各跑单线程基线 vs 并行,测 wall-time。 +- **验收标准**: + - 大目录并行 wall-time **显著低于**单线程(数量级期望 ≈ `min(核数, ignore 默认线程数)` 倍加速,IO 受限时打折)。 + - 小目录(<100 文件)并行因线程启动开销**不出现可感知回归**(容差 < 1ms 级,用户无感)。 + - 无功能回归:并行结果集合 == 单线程结果集合。 +- 不在文档中预填基准数字(尚未实测),仅固化方法与门槛。 + +### 6.3 风险登记 + +| 风险 | 等级 | 缓解 | +|---|---|---| +| `GlobMatcher` 非 `Sync` | 低 | 编译期暴露;退化为每线程 `clone()`(§3.1 注) | +| 平局顺序抖动 | 低 | §4.2 二级 path 排序 | +| `follow_links=false` 漏搜符号链接目标 | 中 | 与 ripgrep 默认一致;release note;必要时后续加逃逸阀 | +| `max` 轻微越界 | 低 | 与 grep 一致,分页无感(§4.1) | + +--- + +## 7. 落地步骤(实现期参照,本方案不含源码改动) + +1. `search/walker.rs`:`follow_links(true)` → `false`。 +2. `search/glob.rs`:单线程循环 → `build_parallel().run(visitor)`,按 §3.1 骨架;引入 `parking_lot::Mutex`、`ignore::WalkState`、`Arc` 原子量。 +3. `glob/src/lib.rs`:排序键改为 `(modified_secs desc, path asc)`(§4.2)。 +4. 测试:按 §6.1 增/改用例;跑 `bazel test` 受影响 target + `--config=clippy` + `--config=rustfmt`。 +5. 全量 `bazel build //...` 确认结构无回归。 + +--- + +## 8. 
未决项 + +- **二级排序键是否默认开启**:建议默认开启(§4.2),零成本恢复确定性。若评审认为"对外顺序变化"也需避免,可讨论。 +- **grep 文件顺序是否也确定化**:范围外(§4.4 脚注),可另开条目。 +- **符号链接逃逸阀**:暂不做(§3.2),出现真实诉求再增量。 diff --git a/design/glob-traversal-hang/0003-post-review-consolidation.md b/design/glob-traversal-hang/0003-post-review-consolidation.md new file mode 100644 index 00000000..9f09efae --- /dev/null +++ b/design/glob-traversal-hang/0003-post-review-consolidation.md @@ -0,0 +1,58 @@ +# Glob Traversal Hang — Post-Review Consolidation + +状态:已实现。本文记录 0001(P0)/0002(P1) 落地后,一次 max-effort code review 驱动的修正。取代 [0001](0001-root-cause-analysis.md) §5.3 的"后端外层硬兜底"设计。 + +| | | +|---|---| +| **触发** | 对 P0+P1 diff 的 10-angle code review | +| **核心发现** | 后端自建的外层 `tokio::time::timeout` backstop 与 runtime 已有的 `watchdog_deadline` 通用机制**重复**,且只覆盖 glob/grep;`truncated` 与超时**语义混淆**导致超时写空 overflow 文件 | +| **范围** | `loopal-backend/src/search/`、`loopal-tool-api`、`loopal-runtime` watchdog、glob/grep 工具层 | + +--- + +## 1. 背景:评审发现了什么 + +P0([0001](0001-root-cause-analysis.md))为 glob/grep 加了**双层**超时:内层协作式 `walk_timeout` + 后端外层 `tokio::time::timeout(walk_timeout+10s)`。评审指出: + +1. **重复造轮子(altitude)**:runtime 早有通用 per-tool 看门狗 `tool_watchdog::watchdog_deadline()`(`tool_exec.rs` 对其返回 `Some` 的工具套 `tokio::time::timeout`,超时返回**带 typed `StaleReason::WatchdogTimeout`** 的结果)——但它**只对 Bash 生效**,其余工具返回 `None`。P0 没有扩展它,而是在后端另起一套外层超时,且只保护 glob/grep。原始事故类("任意慢 IO 挂死 agent loop")只关了一半:**Ls / Read 走死挂载依旧会挂**。 +2. **`truncated` 语义混淆**:glob/grep 的 `truncated = done.load()`,而 `done` 同时被"命中 max"和"超时"置位,导致**每次超时都误判为截断并写一个无人读取的 overflow 文件**(工具层只读 `entries`/`timed_out`)。 +3. **`Err(Timeout)` 误处理**:后端外层超时返回 `Err(ToolIoError::Timeout)`,被工具 `execute` 的 `.map_err(...)?` 转成硬错误,绕过了精心设计的 `timed_out` 部分结果 + 提示路径。 +4. 次要:`TIMEOUT_NOTICE` 在两个工具里逐字重复;glob 的 `count` 原子与 `entries.len()` 冗余;`walk_timeout` doc 注释夸大"dead NFS"保证;两处 grace 常量(10s vs watchdog 30s)。 + +## 2. 
合并后的分层(取代 0001 §5.3) + +``` +内层(后端,协作式) ResourceLimits::walk_timeout = 30s + └─ 在 walk 的两次 entry 之间检查 Instant >= deadline + └─ 命中 → 返回【部分结果】+ timed_out=true(Ok),工具层追加 SEARCH_TIMEOUT_NOTICE 文本 + └─ 局限:打不断卡在单个 readdir/stat syscall 的线程 + +外层(runtime,硬兜底) tool_watchdog::watchdog_deadline = 60s + └─ 收敛点 tool_exec::execute_tool_watchdogged 对 Bash + 7 个 fs-read 工具 + (Glob/Grep/Ls/Read/ReadPdf/ReadImage/ReadHtml) 套 tokio::time::timeout + └─ 命中 → 返回带【typed StaleReason::WatchdogTimeout】的 is_error 结果,解放 agent loop + └─ 覆盖 Ls/Read 等(P0 没覆盖的死挂载向量) +``` + +正常慢树:内层 30s 先返回部分结果,外层 60s 永不触发。死挂载:内层打不断,外层 60s 兜底。后端**不再**自建外层超时。 + +> **二次评审修正(关键)**:watchdog 起初只加在 `tool_exec::execute_approved_tools`,但 ReadOnly 工具(正是这 7 个 fs-read)会在 LLM 流式阶段被 `streaming_tool_exec::feed_tool` **提前启动**,该路径**绕过** watchdog——等于早启动的 glob/grep 又回到"无外层超时"的原始挂死。修复:抽出单一收敛点 `execute_tool_watchdogged`,**两条执行路径(早启动 + 正常审批)都经它**,杜绝"一条路有界、另一条无界"。并补全 ReadPdf/ReadImage/ReadHtml(同为 ReadOnly、同样早启动)。 + +## 3. 实施的改动 + +| 改动 | 文件 | 对应发现 | +|---|---|---| +| 后端外层 `tokio::time::timeout` + `WALK_TIMEOUT_GRACE` **删除**,`*_search_async` 回归纯 `spawn_blocking` | `search/mod.rs` | #1 #5 #12 | +| `watchdog_deadline` 扩展覆盖 7 个 fs-read 工具(固定 `FS_READ_TIMEOUT=60s`) | `tool_watchdog.rs` | #1 | +| 抽 `execute_tool_watchdogged` 单一收敛点,早启动路径(`feed_tool`)与审批路径都经它 | `tool_exec.rs` `streaming_tool_exec.rs` | 二次评审(早启动绕过) | +| glob/grep `max` 钳到 `≥1`,消除 `max==0` 时 truncated 恒真 + 写空 overflow | `search/glob.rs` `search/grep.rs` | 二次评审(max==0) | +| `truncated` 改由"命中 cap"派生(glob `entries.len()>=max`;grep `total>=max`),与超时解耦 → 超时不再写空 overflow | `search/glob.rs` `search/grep.rs` | #3 | +| `SEARCH_TIMEOUT_NOTICE` 提到 `loopal-tool-api`,两个工具共享 | `tool-api/truncate.rs` 等 | #9 | +| glob 删除冗余 `count` 原子,cap 检查折进 push 锁 | `search/glob.rs` | #11 | +| `walk_timeout` doc 注释改准,不再夸大 dead-NFS | `limits.rs` | #4 | + +## 4. 
行为变化与未决 + +- **Ls/Read 新增 60s 看门狗**:健康本地盘上 Ls/Read 毫秒级完成,60s 极宽松;死挂载下从"永久挂起"变为"60s 后硬错误"。这是**有意的行为变化**,关闭了原始事故类。 +- **`timed_out` 的 typed 信号**:硬超时现由 watchdog 提供 typed `StaleReason`;软协作式超时仍以文本提示面向 LLM(部分结果场景,文本是合适的指引而非控制信号)。 +- 未做:follow_links 可配置开关([0002](0002-parallel-glob-walker.md) §3.2,YAGNI);`>max` 截断时并行结果集非确定([0002](0002-parallel-glob-walker.md) §4.2,固有,已记录)。