From a0f9a15b4a916c92d51131418e9fe080c83f9d3c Mon Sep 17 00:00:00 2001 From: Andreas Liljeqvist Date: Sat, 17 Jan 2026 11:36:25 +0100 Subject: [PATCH 1/4] Fix is_ascii performance regression on AVX-512 CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `[u8]::is_ascii()` is compiled with `-C target-cpu=native` on AVX-512 CPUs, LLVM generates inefficient code. Because `is_ascii` is marked `#[inline]`, it gets inlined and recompiled with the user's target settings. The previous implementation used a counting loop that LLVM auto-vectorizes to `pmovmskb` on SSE2, but with AVX-512 enabled, LLVM uses k-registers and extracts bits individually with ~31 `kshiftrd` instructions. This fix replaces the counting loop with explicit SSE2 intrinsics (`_mm_loadu_si128`, `_mm_or_si128`, `_mm_movemask_epi8`) for x86_64. `_mm_movemask_epi8` compiles to `pmovmskb`, forcing efficient codegen regardless of CPU features. Benchmark results on AMD Ryzen 5 7500F (Zen 4 with AVX-512): - Default build: ~73 GB/s → ~74 GB/s (no regression) - With -C target-cpu=native: ~3 GB/s → ~67 GB/s (22x improvement) The loongarch64 implementation retains the original counting loop since it doesn't have this issue. 
Regression from: https://github.com/rust-lang/rust/pull/130733 --- library/core/src/slice/ascii.rs | 86 +++++++++++++++++--- tests/assembly-llvm/slice-is-ascii-avx512.rs | 18 ++++ tests/codegen-llvm/slice-is-ascii.rs | 9 +- 3 files changed, 98 insertions(+), 15 deletions(-) create mode 100644 tests/assembly-llvm/slice-is-ascii-avx512.rs diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 3e8c553f9f159..c9e168d6cbf83 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -3,10 +3,7 @@ use core::ascii::EscapeDefault; use crate::fmt::{self, Write}; -#[cfg(not(any( - all(target_arch = "x86_64", target_feature = "sse2"), - all(target_arch = "loongarch64", target_feature = "lsx") -)))] +#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))] use crate::intrinsics::const_eval_select; use crate::{ascii, iter, ops}; @@ -463,19 +460,84 @@ const fn is_ascii(s: &[u8]) -> bool { ) } -/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the -/// `vmskltz.b` instruction on `loongarch64`. +/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to +/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops. +/// +/// # Safety +/// Requires SSE2 support (guaranteed on x86_64). +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +#[target_feature(enable = "sse2")] +unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool { + use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128}; + + const CHUNK_SIZE: usize = 32; + + let mut i = 0; + + while i + CHUNK_SIZE <= bytes.len() { + // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`. + let ptr = unsafe { bytes.as_ptr().add(i) }; + + // Load two 16-byte chunks and combine them. + // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes. + // `_mm_loadu_si128` allows unaligned loads. 
+ let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; + // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range. + let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; + + // OR them together - if any byte has the high bit set, the result will too + let combined = _mm_or_si128(chunk1, chunk2); + + // Create a mask from the MSBs of each byte. + // If any byte is >= 128, its MSB is 1, so the mask will be non-zero. + let mask = _mm_movemask_epi8(combined); + + if mask != 0 { + return false; + } + + i += CHUNK_SIZE; + } + + // Handle remaining bytes with simple loop + while i < bytes.len() { + if !bytes[i].is_ascii() { + return false; + } + i += 1; + } + + true +} + +/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`. +/// +/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with +/// broken AVX-512 code that extracts mask bits one-by-one. +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +#[inline] +#[rustc_allow_const_fn_unstable(const_eval_select)] +const fn is_ascii(bytes: &[u8]) -> bool { + const_eval_select!( + @capture { bytes: &[u8] } -> bool: + if const { + is_ascii_simple(bytes) + } else { + // SAFETY: SSE2 is guaranteed available on x86_64 + unsafe { is_ascii_sse2(bytes) } + } + ) +} + +/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`. /// /// Other platforms are not likely to benefit from this code structure, so they /// use SWAR techniques to test for ASCII in `usize`-sized chunks. -#[cfg(any( - all(target_arch = "x86_64", target_feature = "sse2"), - all(target_arch = "loongarch64", target_feature = "lsx") -))] +#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))] #[inline] const fn is_ascii(bytes: &[u8]) -> bool { // Process chunks of 32 bytes at a time in the fast path to enable - // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers + // auto-vectorization and use of `vmskltz.b`. 
Two 128-bit vector registers // can be OR'd together and then the resulting vector can be tested for // non-ASCII bytes. const CHUNK_SIZE: usize = 32; @@ -485,7 +547,7 @@ const fn is_ascii(bytes: &[u8]) -> bool { while i + CHUNK_SIZE <= bytes.len() { let chunk_end = i + CHUNK_SIZE; - // Get LLVM to produce a `pmovmskb` instruction on x86-64 which + // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which // creates a mask from the most significant bit of each byte. // ASCII bytes are less than 128 (0x80), so their most significant // bit is unset. diff --git a/tests/assembly-llvm/slice-is-ascii-avx512.rs b/tests/assembly-llvm/slice-is-ascii-avx512.rs new file mode 100644 index 0000000000000..d3a441fec96cb --- /dev/null +++ b/tests/assembly-llvm/slice-is-ascii-avx512.rs @@ -0,0 +1,18 @@ +//@ only-x86_64 +//@ compile-flags: -C opt-level=3 -C target-cpu=znver4 +//@ compile-flags: -C llvm-args=-x86-asm-syntax=intel +//@ assembly-output: emit-asm +#![crate_type = "lib"] + +// Verify is_ascii uses pmovmskb/vpmovmskb instead of kshiftrd with AVX-512. +// The fix uses explicit SSE2 intrinsics to avoid LLVM's broken auto-vectorization. +// +// See: https://github.com/rust-lang/rust/issues/129293 + +// CHECK-LABEL: test_is_ascii +#[no_mangle] +pub fn test_is_ascii(s: &[u8]) -> bool { + // CHECK-NOT: kshiftrd + // CHECK-NOT: kshiftrq + s.is_ascii() +} diff --git a/tests/codegen-llvm/slice-is-ascii.rs b/tests/codegen-llvm/slice-is-ascii.rs index 67537c871a0a3..1f41b69e43966 100644 --- a/tests/codegen-llvm/slice-is-ascii.rs +++ b/tests/codegen-llvm/slice-is-ascii.rs @@ -1,10 +1,13 @@ -//@ only-x86_64 -//@ compile-flags: -C opt-level=3 -C target-cpu=x86-64 +//@ only-loongarch64 +//@ compile-flags: -C opt-level=3 #![crate_type = "lib"] -/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction. +/// Check that the fast-path of `is_ascii` uses a `vmskltz.b` instruction. 
/// Platforms lacking an equivalent instruction use other techniques for /// optimizing `is_ascii`. +/// +/// Note: x86_64 uses explicit SSE2 intrinsics instead of relying on +/// auto-vectorization. See `slice-is-ascii-avx512.rs`. // CHECK-LABEL: @is_ascii_autovectorized #[no_mangle] pub fn is_ascii_autovectorized(s: &[u8]) -> bool { From 08432c892758a06a6bab9fa0584effb7e7881303 Mon Sep 17 00:00:00 2001 From: Andreas Liljeqvist Date: Sun, 18 Jan 2026 22:49:37 +0100 Subject: [PATCH 2/4] Optimize small input path for is_ascii on x86_64 For inputs smaller than 32 bytes, use usize-at-a-time processing instead of calling the SSE2 function. This avoids function call overhead from #[target_feature(enable = "sse2")] which prevents inlining. Also moves CHUNK_SIZE to module level so it can be shared between is_ascii and is_ascii_sse2. --- library/core/src/slice/ascii.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index c9e168d6cbf83..25b8a10af3555 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -460,6 +460,10 @@ const fn is_ascii(s: &[u8]) -> bool { ) } +/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers). +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +const CHUNK_SIZE: usize = 32; + /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops. 
/// @@ -470,8 +474,6 @@ const fn is_ascii(s: &[u8]) -> bool { unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool { use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128}; - const CHUNK_SIZE: usize = 32; - let mut i = 0; while i + CHUNK_SIZE <= bytes.len() { @@ -518,11 +520,27 @@ unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool { #[inline] #[rustc_allow_const_fn_unstable(const_eval_select)] const fn is_ascii(bytes: &[u8]) -> bool { + const USIZE_SIZE: usize = size_of::<usize>(); + const NONASCII_MASK: usize = usize::MAX / 255 * 0x80; + const_eval_select!( @capture { bytes: &[u8] } -> bool: if const { is_ascii_simple(bytes) } else { + // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead. + if bytes.len() < CHUNK_SIZE { + let chunks = bytes.chunks_exact(USIZE_SIZE); + let remainder = chunks.remainder(); + for chunk in chunks { + let word = usize::from_ne_bytes(chunk.try_into().unwrap()); + if (word & NONASCII_MASK) != 0 { + return false; + } + } + return remainder.iter().all(|b| b.is_ascii()); + } + // SAFETY: SSE2 is guaranteed available on x86_64 unsafe { is_ascii_sse2(bytes) } } From c609cce8cf6c6a5a1b6e081a671c7ba9f1165fdd Mon Sep 17 00:00:00 2001 From: Andreas Liljeqvist Date: Thu, 22 Jan 2026 22:18:00 +0100 Subject: [PATCH 3/4] Merge is_ascii codegen tests using revisions Combine the x86_64 and loongarch64 is_ascii tests into a single file using compiletest revisions. 
Both now test assembly output: - X86_64: Verifies no broken kshiftrd/kshiftrq instructions (AVX-512 fix) - LA64: Verifies vmskltz.b instruction is used (auto-vectorization) --- tests/assembly-llvm/slice-is-ascii-avx512.rs | 18 ----------- tests/assembly-llvm/slice-is-ascii.rs | 32 ++++++++++++++++++++ tests/codegen-llvm/slice-is-ascii.rs | 19 ------------ 3 files changed, 32 insertions(+), 37 deletions(-) delete mode 100644 tests/assembly-llvm/slice-is-ascii-avx512.rs create mode 100644 tests/assembly-llvm/slice-is-ascii.rs delete mode 100644 tests/codegen-llvm/slice-is-ascii.rs diff --git a/tests/assembly-llvm/slice-is-ascii-avx512.rs b/tests/assembly-llvm/slice-is-ascii-avx512.rs deleted file mode 100644 index d3a441fec96cb..0000000000000 --- a/tests/assembly-llvm/slice-is-ascii-avx512.rs +++ /dev/null @@ -1,18 +0,0 @@ -//@ only-x86_64 -//@ compile-flags: -C opt-level=3 -C target-cpu=znver4 -//@ compile-flags: -C llvm-args=-x86-asm-syntax=intel -//@ assembly-output: emit-asm -#![crate_type = "lib"] - -// Verify is_ascii uses pmovmskb/vpmovmskb instead of kshiftrd with AVX-512. -// The fix uses explicit SSE2 intrinsics to avoid LLVM's broken auto-vectorization. 
-// -// See: https://github.com/rust-lang/rust/issues/129293 - -// CHECK-LABEL: test_is_ascii -#[no_mangle] -pub fn test_is_ascii(s: &[u8]) -> bool { - // CHECK-NOT: kshiftrd - // CHECK-NOT: kshiftrq - s.is_ascii() -} diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs new file mode 100644 index 0000000000000..d01b321bf460a --- /dev/null +++ b/tests/assembly-llvm/slice-is-ascii.rs @@ -0,0 +1,32 @@ +//@ revisions: X86_64 LA64 +//@ assembly-output: emit-asm +//@ compile-flags: -C opt-level=3 +// +//@ [X86_64] only-x86_64 +//@ [X86_64] compile-flags: -C target-cpu=znver4 +//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel +// +//@ [LA64] only-loongarch64 + +#![crate_type = "lib"] + +/// Verify `is_ascii` generates efficient code on different architectures: +/// +/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization). +/// The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`). +/// See: https://github.com/llvm/llvm-project/issues/176906 +/// +/// - loongarch64: Should use `vmskltz.b` instruction for the fast-path. +/// This architecture still relies on LLVM auto-vectorization. + +// X86_64-LABEL: test_is_ascii +// X86_64-NOT: kshiftrd +// X86_64-NOT: kshiftrq + +// LA64-LABEL: test_is_ascii +// LA64: vmskltz.b + +#[no_mangle] +pub fn test_is_ascii(s: &[u8]) -> bool { + s.is_ascii() +} diff --git a/tests/codegen-llvm/slice-is-ascii.rs b/tests/codegen-llvm/slice-is-ascii.rs deleted file mode 100644 index 1f41b69e43966..0000000000000 --- a/tests/codegen-llvm/slice-is-ascii.rs +++ /dev/null @@ -1,19 +0,0 @@ -//@ only-loongarch64 -//@ compile-flags: -C opt-level=3 -#![crate_type = "lib"] - -/// Check that the fast-path of `is_ascii` uses a `vmskltz.b` instruction. -/// Platforms lacking an equivalent instruction use other techniques for -/// optimizing `is_ascii`. -/// -/// Note: x86_64 uses explicit SSE2 intrinsics instead of relying on -/// auto-vectorization. 
See `slice-is-ascii-avx512.rs`. -// CHECK-LABEL: @is_ascii_autovectorized -#[no_mangle] -pub fn is_ascii_autovectorized(s: &[u8]) -> bool { - // CHECK: load <32 x i8> - // CHECK-NEXT: icmp slt <32 x i8> - // CHECK-NEXT: bitcast <32 x i1> - // CHECK-NEXT: icmp eq i32 - s.is_ascii() -} From 890c0fd4e8a3cbd49b761db2432114b147ae3487 Mon Sep 17 00:00:00 2001 From: Andreas Liljeqvist Date: Thu, 22 Jan 2026 22:41:57 +0100 Subject: [PATCH 4/4] Make is_ascii_sse2 a safe function Remove the `#[target_feature(enable = "sse2")]` attribute and make the function safe to call. The SSE2 requirement is already enforced by the `#[cfg(target_feature = "sse2")]` predicate. Individual unsafe blocks are used for intrinsic calls with appropriate SAFETY comments. Also adds FIXME reference to llvm#176906 for tracking when this workaround can be removed. --- library/core/src/slice/ascii.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 25b8a10af3555..459c826f40646 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -467,11 +467,9 @@ const CHUNK_SIZE: usize = 32; /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops. /// -/// # Safety -/// Requires SSE2 support (guaranteed on x86_64). +/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code. #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -#[target_feature(enable = "sse2")] -unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool { +fn is_ascii_sse2(bytes: &[u8]) -> bool { use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128}; let mut i = 0; @@ -487,12 +485,14 @@ unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool { // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range. 
let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; - // OR them together - if any byte has the high bit set, the result will too - let combined = _mm_or_si128(chunk1, chunk2); + // OR them together - if any byte has the high bit set, the result will too. + // SAFETY: SSE2 is guaranteed by the cfg predicate. + let combined = unsafe { _mm_or_si128(chunk1, chunk2) }; // Create a mask from the MSBs of each byte. // If any byte is >= 128, its MSB is 1, so the mask will be non-zero. - let mask = _mm_movemask_epi8(combined); + // SAFETY: SSE2 is guaranteed by the cfg predicate. + let mask = unsafe { _mm_movemask_epi8(combined) }; if mask != 0 { return false; @@ -541,8 +541,7 @@ const fn is_ascii(bytes: &[u8]) -> bool { return remainder.iter().all(|b| b.is_ascii()); } - // SAFETY: SSE2 is guaranteed available on x86_64 - unsafe { is_ascii_sse2(bytes) } + is_ascii_sse2(bytes) } ) }