diff --git a/.github/workflows/autofix.yml b/.github/workflows/autofix.yml index 2e401ea87a..f4862d1039 100644 --- a/.github/workflows/autofix.yml +++ b/.github/workflows/autofix.yml @@ -54,7 +54,7 @@ jobs: - name: Cargo Clippy run: cargo +nightly clippy --all-features --workspace --all-targets --fix --allow-dirty -- -D warnings - name: Cargo Clippy String Safety - run: cargo +nightly clippy --all-features --workspace -- -D clippy::string_slice -D clippy::indexing_slicing + run: cargo +nightly clippy --all-features --workspace -- -D clippy::string_slice -D clippy::indexing_slicing -D clippy::disallowed_methods - name: Autofix uses: autofix-ci/action@7a166d7532b277f34e16238930461bf77f9d7ed8 concurrency: diff --git a/Cargo.lock b/Cargo.lock index e4e2bd5786..d5b8771a1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2185,6 +2185,7 @@ name = "forge_fs" version = "0.1.0" dependencies = [ "anyhow", + "bstr", "forge_domain", "hex", "infer", @@ -2203,6 +2204,7 @@ dependencies = [ "async-trait", "backon", "base64 0.22.1", + "bstr", "bytes", "cacache", "chrono", @@ -2263,6 +2265,7 @@ dependencies = [ "anyhow", "arboard", "async-recursion", + "bstr", "chrono", "clap", "clap_complete", @@ -2421,6 +2424,7 @@ dependencies = [ "async-trait", "backon", "base64 0.22.1", + "bstr", "bytes", "chrono", "dashmap 7.0.0-rc2", @@ -2537,6 +2541,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "bstr", "chrono", "convert_case 0.11.0", "derive_more", diff --git a/Cargo.toml b/Cargo.toml index 34370be295..908484a78c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ aws-smithy-runtime-api = "1.11.3" aws-smithy-async = { version = "1.2.11", features = ["rt-tokio"] } aws-smithy-runtime = { version = "1.10", features = ["connector-hyper-0-14-x", "tls-rustls"] } base64 = "0.22.1" +bstr = "1.12.1" bytes = "1.11.1" chrono = { version = "0.4.44", features = ["serde"] } clap = { version = "4.6.0", features = ["derive"] } diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000000..3c8e1b975f --- /dev/null +++ b/clippy.toml @@ -0,0 +1,3 @@ +disallowed-methods = [ + { path = "std::string::String::from_utf8_lossy", reason = "Prefer bstr ByteSlice decoding so lossy UTF-8 handling stays explicit and consistent across byte-oriented paths.", replacement = "bstr::ByteSlice::to_str_lossy" }, +] diff --git a/crates/forge_ci/src/jobs/lint.rs b/crates/forge_ci/src/jobs/lint.rs index 8e0c50f776..b3f95b4900 100644 --- a/crates/forge_ci/src/jobs/lint.rs +++ b/crates/forge_ci/src/jobs/lint.rs @@ -60,6 +60,8 @@ pub fn clippy_string_safety_cmd(fix: bool) -> String { "clippy::string_slice", "-D", "clippy::indexing_slicing", + "-D", + "clippy::disallowed_methods", ]); cargo_cmd(&parts) diff --git a/crates/forge_fs/Cargo.toml b/crates/forge_fs/Cargo.toml index e228ec3fe8..e6e2bf7dab 100644 --- a/crates/forge_fs/Cargo.toml +++ b/crates/forge_fs/Cargo.toml @@ -11,6 +11,7 @@ sha2.workspace = true hex.workspace = true infer = "0.19.0" # For binary file detection thiserror = "2.0" +bstr.workspace = true forge_domain.workspace = true diff --git a/crates/forge_fs/src/read.rs b/crates/forge_fs/src/read.rs index b9e2a2f0cb..b96c3c82e4 100644 --- a/crates/forge_fs/src/read.rs +++ b/crates/forge_fs/src/read.rs @@ -1,12 +1,13 @@ use std::path::Path; use anyhow::{Context, Result}; +use bstr::ByteSlice; impl crate::ForgeFS { pub async fn read_utf8>(path: T) -> Result { Self::read(path) .await - .map(|bytes| String::from_utf8_lossy(&bytes).to_string()) + .map(|bytes| bytes.to_str_lossy().to_string()) } pub async fn read>(path: T) -> Result> { diff --git a/crates/forge_fs/src/read_range.rs b/crates/forge_fs/src/read_range.rs index 5dd2071687..ed48f5de1b 100644 --- a/crates/forge_fs/src/read_range.rs +++ b/crates/forge_fs/src/read_range.rs @@ -2,6 +2,7 @@ use std::cmp; use std::path::Path; use anyhow::{Context, Result}; +use bstr::ByteSlice; use forge_domain::FileInfo; use crate::error::Error; @@ -50,7 +51,7 @@ impl crate::ForgeFS { let content = tokio::fs::read(path_ref) .await .with_context(|| format!("Failed to read file content from {}", path_ref.display()))?; - let content = String::from_utf8_lossy(&content); + let content = content.to_str_lossy(); // Hash the full file content so callers get a stable, whole-file hash // that matches what the external-change detector reads back from disk. diff --git a/crates/forge_infra/Cargo.toml b/crates/forge_infra/Cargo.toml index 232926dbdb..0efe074ec4 100644 --- a/crates/forge_infra/Cargo.toml +++ b/crates/forge_infra/Cargo.toml @@ -18,6 +18,7 @@ tokio.workspace = true serde_json.workspace = true reqwest.workspace = true +bstr.workspace = true bytes.workspace = true pretty_assertions.workspace = true forge_select.workspace = true diff --git a/crates/forge_infra/src/console.rs b/crates/forge_infra/src/console.rs index 552e638e6c..b7c84142f9 100644 --- a/crates/forge_infra/src/console.rs +++ b/crates/forge_infra/src/console.rs @@ -72,6 +72,8 @@ mod tests { use std::io::Cursor; use std::thread; + use bstr::ByteSlice; + use super::*; #[test] @@ -112,7 +114,7 @@ mod tests { assert!( valid_orderings.contains(&actual), "Output was interleaved: {:?}", - String::from_utf8_lossy(&actual) + actual.as_slice().to_str_lossy() ); } diff --git a/crates/forge_infra/src/executor.rs b/crates/forge_infra/src/executor.rs index 34e1327648..af8718738d 100644 --- a/crates/forge_infra/src/executor.rs +++ b/crates/forge_infra/src/executor.rs @@ -2,6 +2,7 @@ use std::io::{self, Write}; use std::path::{Path, PathBuf}; use std::sync::Arc; +use bstr::ByteSlice; use forge_app::CommandInfra; use forge_domain::{CommandOutput, ConsoleWriter as OutputPrinterTrait, Environment}; use tokio::io::AsyncReadExt; @@ -140,8 +141,8 @@ impl ForgeCommandExecutorService { drop(ready); Ok(CommandOutput { - stdout: String::from_utf8_lossy(&stdout_buffer).into_owned(), - stderr: String::from_utf8_lossy(&stderr_buffer).into_owned(), + stdout: stdout_buffer.to_str_lossy().into_owned(), + stderr: stderr_buffer.to_str_lossy().into_owned(), exit_code: status.code(), command, }) @@ -190,20 +191,52 @@ async fn stream( let mut output = Vec::new(); if let Some(io) = io.as_mut() { let mut buff = [0; 1024]; + // Carry incomplete trailing UTF-8 codepoint bytes across reads — Windows + // console stdio rejects even one byte of a split codepoint. + let mut pending = Vec::::new(); loop { let n = io.read(&mut buff).await?; if n == 0 { break; } - writer.write_all(buff.get(..n).unwrap_or(&[]))?; + let chunk = buff.get(..n).unwrap_or(&[]); + output.extend_from_slice(chunk); + + let mut working = std::mem::take(&mut pending); + working.extend_from_slice(chunk); + pending = write_lossy_utf8(&mut writer, &working)?; // note: flush is necessary else we get the cursor could not be found error. writer.flush()?; - output.extend_from_slice(buff.get(..n).unwrap_or(&[])); + } + // Flush dangling bytes from a stream that ended mid-codepoint. + if !pending.is_empty() { + writer.write_all(pending.to_str_lossy().as_bytes())?; + writer.flush()?; } } Ok(output) } +/// Writes `buf` as valid UTF-8 (invalid bytes → `U+FFFD`) and returns any +/// incomplete trailing codepoint bytes for the caller to carry into the next +/// chunk. +fn write_lossy_utf8(writer: &mut W, buf: &[u8]) -> io::Result> { + let mut chunks = ByteSlice::utf8_chunks(buf).peekable(); + + while let Some(chunk) = chunks.next() { + writer.write_all(chunk.valid().as_bytes())?; + + if !chunk.invalid().is_empty() { + if chunk.incomplete() && chunks.peek().is_none() { + return Ok(chunk.invalid().to_vec()); + } + writer.write_all("\u{FFFD}".as_bytes())?; + } + } + + Ok(Vec::new()) +} + /// The implementation for CommandExecutorService #[async_trait::async_trait] impl CommandInfra for ForgeCommandExecutorService { @@ -429,4 +462,77 @@ mod tests { assert_eq!(actual.stderr, expected.stderr); assert_eq!(actual.success(), expected.success()); } + + mod write_lossy_utf8 { + use pretty_assertions::assert_eq; + + use super::super::write_lossy_utf8; + + fn run(buf: &[u8]) -> (Vec, Vec) { + let mut out = Vec::::new(); + let pending = write_lossy_utf8(&mut out, buf).unwrap(); + (out, pending) + } + + #[test] + fn valid_ascii_passes_through() { + let (out, pending) = run(b"hello"); + assert_eq!(out, b"hello"); + assert!(pending.is_empty()); + } + + #[test] + fn valid_multibyte_passes_through() { + // "héllo ✓" — mixed 2-byte and 3-byte codepoints. + let input = "héllo ✓".as_bytes(); + let (out, pending) = run(input); + assert_eq!(out, input); + assert!(pending.is_empty()); + } + + #[test] + fn incomplete_trailing_codepoint_is_buffered() { + // "é" is 0xC3 0xA9 — leading byte alone must be held back. + let (out, pending) = run(&[b'a', 0xC3]); + assert_eq!(out, b"a"); + assert_eq!(pending, vec![0xC3]); + } + + #[test] + fn multibyte_split_across_two_chunks_emits_once_whole() { + let mut out = Vec::::new(); + let pending = write_lossy_utf8(&mut out, &[b'a', 0xC3]).unwrap(); + assert_eq!(pending, vec![0xC3]); + assert_eq!(out, b"a"); + + let mut working = pending; + working.push(0xA9); + let pending = write_lossy_utf8(&mut out, &working).unwrap(); + assert!(pending.is_empty()); + assert_eq!(out, "aé".as_bytes()); + } + + #[test] + fn invalid_byte_in_middle_becomes_replacement() { + let (out, pending) = run(&[b'a', 0xFF, b'b']); + assert_eq!(out, "a\u{FFFD}b".as_bytes()); + assert!(pending.is_empty()); + } + + #[test] + fn lone_continuation_byte_becomes_replacement() { + let (out, pending) = run(&[b'a', 0x80, b'b']); + assert_eq!(out, "a\u{FFFD}b".as_bytes()); + assert!(pending.is_empty()); + } + + #[test] + fn windows_1252_smart_quote_becomes_replacement() { + // Regression: 0x91/0x92 land as bare continuation bytes and broke + // console stdio on Windows before this fix. + let (out, pending) = run(b"quote: \x91hi\x92"); + assert_eq!(out, "quote: \u{FFFD}hi\u{FFFD}".as_bytes()); + assert!(pending.is_empty()); + } + } } diff --git a/crates/forge_infra/src/mcp_client.rs b/crates/forge_infra/src/mcp_client.rs index d8a044e64c..511771833f 100644 --- a/crates/forge_infra/src/mcp_client.rs +++ b/crates/forge_infra/src/mcp_client.rs @@ -5,6 +5,7 @@ use std::str::FromStr; use std::sync::{Arc, OnceLock, RwLock}; use backon::{ExponentialBuilder, Retryable}; +use bstr::ByteSlice; use forge_app::McpClientInfra; use forge_domain::{ Environment, Image, McpHttpServer, McpServerConfig, ToolDefinition, ToolName, ToolOutput, @@ -453,7 +454,7 @@ impl ForgeMcpClient { // Read the HTTP request let mut buf = vec![0u8; 4096]; let n = stream.read(&mut buf).await?; - let request = String::from_utf8_lossy(buf.get(..n).unwrap_or(&[])); + let request = buf.get(..n).unwrap_or(&[]).to_str_lossy(); let first_line = request.lines().next().unwrap_or(""); let path = first_line.split_whitespace().nth(1).unwrap_or("/"); @@ -681,7 +682,7 @@ pub async fn mcp_auth(server_url: &str, env: &Environment) -> anyhow::Result<()> use tokio::io::{AsyncReadExt, AsyncWriteExt}; let mut buf = vec![0u8; 4096]; let n = stream.read(&mut buf).await?; - let request = String::from_utf8_lossy(buf.get(..n).unwrap_or(&[])); + let request = buf.get(..n).unwrap_or(&[]).to_str_lossy(); let first_line = request.lines().next().unwrap_or(""); let path = first_line.split_whitespace().nth(1).unwrap_or("/"); let query_start = path.find('?').unwrap_or(path.len()); diff --git a/crates/forge_main/Cargo.toml b/crates/forge_main/Cargo.toml index 2660fac793..e136bc306d 100644 --- a/crates/forge_main/Cargo.toml +++ b/crates/forge_main/Cargo.toml @@ -21,6 +21,7 @@ forge_config.workspace = true forge_walker.workspace = true forge_display.workspace = true forge_tracker.workspace = true +bstr.workspace = true forge_spinner.workspace = true forge_select.workspace = true diff --git a/crates/forge_main/src/sandbox.rs b/crates/forge_main/src/sandbox.rs index 6f77e0ad68..eee8daeadd 100644 --- a/crates/forge_main/src/sandbox.rs +++ b/crates/forge_main/src/sandbox.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use std::process::Command; use anyhow::{Context, Result, bail}; +use bstr::ByteSlice; use forge_domain::TitleFormat; use crate::title_display::TitleDisplayExt; @@ -120,7 +121,7 @@ impl<'a> Sandbox<'a> { .context("Failed to create git worktree")?; if !worktree_output.status.success() { - let stderr = String::from_utf8_lossy(&worktree_output.stderr); + let stderr = worktree_output.stderr.to_str_lossy(); bail!("Failed to create git worktree: {stderr}"); } diff --git a/crates/forge_main/src/stream_renderer.rs b/crates/forge_main/src/stream_renderer.rs index cc12cfa445..eac0794b96 100644 --- a/crates/forge_main/src/stream_renderer.rs +++ b/crates/forge_main/src/stream_renderer.rs @@ -1,7 +1,9 @@ +use std::borrow::Cow; use std::io; use std::sync::{Arc, Mutex}; use anyhow::Result; +use bstr::ByteSlice; use colored::Colorize; use forge_domain::ConsoleWriter; use forge_markdown_stream::StreamdownRenderer; @@ -198,11 +200,11 @@ impl io::Write for StreamDirectWriter

{ fn write(&mut self, buf: &[u8]) -> io::Result { self.pause_spinner(); - let content = match std::str::from_utf8(buf) { - Ok(s) => s.to_string(), - Err(_) => String::from_utf8_lossy(buf).into_owned(), + let content = match buf.to_str() { + Ok(content) => Cow::Borrowed(content), + Err(_) => buf.to_str_lossy(), }; - let styled = self.style.apply(content); + let styled = self.style.apply(content.into_owned()); self.printer.write(styled.as_bytes())?; self.printer.flush()?; diff --git a/crates/forge_services/Cargo.toml b/crates/forge_services/Cargo.toml index 6784647880..8fcc553f39 100644 --- a/crates/forge_services/Cargo.toml +++ b/crates/forge_services/Cargo.toml @@ -41,6 +41,7 @@ serde_yml.workspace = true gray_matter.workspace = true merge.workspace = true strip-ansi-escapes.workspace = true +bstr.workspace = true forge_app.workspace = true url.workspace = true reqwest-eventsource.workspace = true diff --git a/crates/forge_services/src/tool_services/fs_search.rs b/crates/forge_services/src/tool_services/fs_search.rs index f208b80fbb..efa2c50865 100644 --- a/crates/forge_services/src/tool_services/fs_search.rs +++ b/crates/forge_services/src/tool_services/fs_search.rs @@ -2,6 +2,7 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use anyhow::{Context, anyhow}; +use bstr::ByteSlice; use forge_app::{ FileInfoInfra, FileReaderInfra, FsSearchService, Match, MatchResult, SearchResult, Walker, WalkerInfra, @@ -386,7 +387,7 @@ impl Sink for ContextSink { // Store the current match (before_context is already accumulated, after_context // will be added via context() calls) let line_num = mat.line_number().unwrap_or(0) as usize; - let line = String::from_utf8_lossy(mat.bytes()).trim_end().to_string(); + let line = mat.bytes().to_str_lossy().trim_end().to_string(); self.current_match = Some((line_num, line)); Ok(true) @@ -397,7 +398,7 @@ impl Sink for ContextSink { _searcher: &Searcher, ctx: &SinkContext<'_>, ) -> Result { - let line = String::from_utf8_lossy(ctx.bytes()).trim_end().to_string(); + let line = ctx.bytes().to_str_lossy().trim_end().to_string(); match ctx.kind() { SinkContextKind::Before => { diff --git a/crates/forge_services/src/tool_services/shell.rs b/crates/forge_services/src/tool_services/shell.rs index 05a671de71..2745e9b423 100644 --- a/crates/forge_services/src/tool_services/shell.rs +++ b/crates/forge_services/src/tool_services/shell.rs @@ -2,13 +2,14 @@ use std::path::PathBuf; use std::sync::Arc; use anyhow::bail; +use bstr::ByteSlice; use forge_app::domain::Environment; use forge_app::{CommandInfra, EnvironmentInfra, ShellOutput, ShellService}; use strip_ansi_escapes::strip; // Strips out the ansi codes from content. fn strip_ansi(content: String) -> String { - String::from_utf8_lossy(&strip(content.as_bytes())).into_owned() + strip(content.as_bytes()).to_str_lossy().into_owned() } /// Prevents potentially harmful operations like absolute path execution and diff --git a/crates/forge_tracker/Cargo.toml b/crates/forge_tracker/Cargo.toml index 2cc6c5c9f4..2ea2af50ce 100644 --- a/crates/forge_tracker/Cargo.toml +++ b/crates/forge_tracker/Cargo.toml @@ -23,6 +23,7 @@ regex.workspace = true tracing-appender.workspace = true tracing-subscriber.workspace = true anyhow.workspace = true +bstr.workspace = true forge_domain.workspace = true lazy_static.workspace = true dirs.workspace = true diff --git a/crates/forge_tracker/src/dispatch.rs b/crates/forge_tracker/src/dispatch.rs index e3fcdbbb78..bbec64e4f3 100644 --- a/crates/forge_tracker/src/dispatch.rs +++ b/crates/forge_tracker/src/dispatch.rs @@ -3,6 +3,7 @@ use std::process::Output; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, LazyLock}; +use bstr::ByteSlice; use chrono::{DateTime, Utc}; use forge_domain::Conversation; use sysinfo::System; @@ -177,7 +178,7 @@ async fn system_info() -> HashSet { fn parse(output: Output) -> Option { if output.status.success() { - let text = String::from_utf8_lossy(&output.stdout).trim().to_string(); + let text = output.stdout.to_str_lossy().trim().to_string(); if !text.is_empty() { return Some(text); } diff --git a/crates/forge_tracker/src/event.rs b/crates/forge_tracker/src/event.rs index 3544315000..ed09c919d4 100644 --- a/crates/forge_tracker/src/event.rs +++ b/crates/forge_tracker/src/event.rs @@ -1,5 +1,6 @@ use std::ops::Deref; +use bstr::ByteSlice; use chrono::{DateTime, Utc}; use convert_case::{Case, Casing}; use forge_domain::Conversation; @@ -96,7 +97,7 @@ impl EventKind { Self::Prompt(content) => content.to_string(), Self::Error(content) => content.to_string(), Self::ToolCall(payload) => serde_json::to_string(&payload).unwrap_or_default(), - Self::Trace(trace) => String::from_utf8_lossy(trace).to_string(), + Self::Trace(trace) => trace.to_str_lossy().to_string(), Self::Login(id) => id.login.to_owned(), } }