From a0f9a15b4a916c92d51131418e9fe080c83f9d3c Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Sat, 17 Jan 2026 11:36:25 +0100
Subject: [PATCH 1/4] Fix is_ascii performance regression on AVX-512 CPUs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When `[u8]::is_ascii()` is compiled with `-C target-cpu=native` on
AVX-512 CPUs, LLVM generates inefficient code. Because `is_ascii` is
marked `#[inline]`, it gets inlined and recompiled with the user's
target settings. The previous implementation used a counting loop that
LLVM auto-vectorizes to `pmovmskb` on SSE2, but with AVX-512 enabled,
LLVM uses k-registers and extracts bits individually with ~31
`kshiftrd` instructions.

This fix replaces the counting loop with explicit SSE2 intrinsics
(`_mm_loadu_si128`, `_mm_or_si128`, `_mm_movemask_epi8`) for x86_64.
`_mm_movemask_epi8` compiles to `pmovmskb` (or `vpmovmskb` when AVX is
enabled), forcing efficient codegen regardless of CPU features.

Benchmark results on AMD Ryzen 5 7500F (Zen 4 with AVX-512):
- Default build: ~73 GB/s → ~74 GB/s (no regression)
- With -C target-cpu=native: ~3 GB/s → ~67 GB/s (22x improvement)

The loongarch64 implementation retains the original counting loop
since it doesn't have this issue.

Regression from: https://github.com/rust-lang/rust/pull/130733
---
 library/core/src/slice/ascii.rs              | 86 +++++++++++++++++---
 tests/assembly-llvm/slice-is-ascii-avx512.rs | 18 ++++
 tests/codegen-llvm/slice-is-ascii.rs         |  9 +-
 3 files changed, 98 insertions(+), 15 deletions(-)
 create mode 100644 tests/assembly-llvm/slice-is-ascii-avx512.rs

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 3e8c553f9f159..c9e168d6cbf83 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -3,10 +3,7 @@
 use core::ascii::EscapeDefault;
 
 use crate::fmt::{self, Write};
-#[cfg(not(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-)))]
+#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))]
 use crate::intrinsics::const_eval_select;
 use crate::{ascii, iter, ops};
 
@@ -463,19 +460,84 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the
-/// `vmskltz.b` instruction on `loongarch64`.
+/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
+/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
+///
+/// # Safety
+/// Requires SSE2 support (guaranteed on x86_64).
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[target_feature(enable = "sse2")]
+unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool {
+    use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
+
+    const CHUNK_SIZE: usize = 32;
+
+    let mut i = 0;
+
+    while i + CHUNK_SIZE <= bytes.len() {
+        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
+        let ptr = unsafe { bytes.as_ptr().add(i) };
+
+        // Load two 16-byte chunks and combine them.
+        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
+        // `_mm_loadu_si128` allows unaligned loads.
+        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
+        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
+        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
+
+        // OR them together - if any byte has the high bit set, the result will too
+        let combined = _mm_or_si128(chunk1, chunk2);
+
+        // Create a mask from the MSBs of each byte.
+        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+        let mask = _mm_movemask_epi8(combined);
+
+        if mask != 0 {
+            return false;
+        }
+
+        i += CHUNK_SIZE;
+    }
+
+    // Handle remaining bytes with simple loop
+    while i < bytes.len() {
+        if !bytes[i].is_ascii() {
+            return false;
+        }
+        i += 1;
+    }
+
+    true
+}
+
+/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
+///
+/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with
+/// broken AVX-512 code that extracts mask bits one-by-one.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
+#[rustc_allow_const_fn_unstable(const_eval_select)]
+const fn is_ascii(bytes: &[u8]) -> bool {
+    const_eval_select!(
+        @capture { bytes: &[u8] } -> bool:
+        if const {
+            is_ascii_simple(bytes)
+        } else {
+            // SAFETY: SSE2 is guaranteed available on x86_64
+            unsafe { is_ascii_sse2(bytes) }
+        }
+    )
+}
+
+/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`.
 ///
 /// Other platforms are not likely to benefit from this code structure, so they
 /// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-))]
+#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))]
 #[inline]
 const fn is_ascii(bytes: &[u8]) -> bool {
     // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+    // auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers
     // can be OR'd together and then the resulting vector can be tested for
     // non-ASCII bytes.
     const CHUNK_SIZE: usize = 32;
@@ -485,7 +547,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
     while i + CHUNK_SIZE <= bytes.len() {
         let chunk_end = i + CHUNK_SIZE;
 
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+        // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which
         // creates a mask from the most significant bit of each byte.
         // ASCII bytes are less than 128 (0x80), so their most significant
         // bit is unset.
diff --git a/tests/assembly-llvm/slice-is-ascii-avx512.rs b/tests/assembly-llvm/slice-is-ascii-avx512.rs
new file mode 100644
index 0000000000000..d3a441fec96cb
--- /dev/null
+++ b/tests/assembly-llvm/slice-is-ascii-avx512.rs
@@ -0,0 +1,18 @@
+//@ only-x86_64
+//@ compile-flags: -C opt-level=3 -C target-cpu=znver4
+//@ compile-flags: -C llvm-args=-x86-asm-syntax=intel
+//@ assembly-output: emit-asm
+#![crate_type = "lib"]
+
+// Verify is_ascii uses pmovmskb/vpmovmskb instead of kshiftrd with AVX-512.
+// The fix uses explicit SSE2 intrinsics to avoid LLVM's broken auto-vectorization.
+//
+// See: https://github.com/rust-lang/rust/issues/129293
+
+// CHECK-LABEL: test_is_ascii
+#[no_mangle]
+pub fn test_is_ascii(s: &[u8]) -> bool {
+    // CHECK-NOT: kshiftrd
+    // CHECK-NOT: kshiftrq
+    s.is_ascii()
+}
diff --git a/tests/codegen-llvm/slice-is-ascii.rs b/tests/codegen-llvm/slice-is-ascii.rs
index 67537c871a0a3..1f41b69e43966 100644
--- a/tests/codegen-llvm/slice-is-ascii.rs
+++ b/tests/codegen-llvm/slice-is-ascii.rs
@@ -1,10 +1,13 @@
-//@ only-x86_64
-//@ compile-flags: -C opt-level=3 -C target-cpu=x86-64
+//@ only-loongarch64
+//@ compile-flags: -C opt-level=3
 #![crate_type = "lib"]
 
-/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
+/// Check that the fast-path of `is_ascii` uses a `vmskltz.b` instruction.
 /// Platforms lacking an equivalent instruction use other techniques for
 /// optimizing `is_ascii`.
+///
+/// Note: x86_64 uses explicit SSE2 intrinsics instead of relying on
+/// auto-vectorization. See `slice-is-ascii-avx512.rs`.
 // CHECK-LABEL: @is_ascii_autovectorized
 #[no_mangle]
 pub fn is_ascii_autovectorized(s: &[u8]) -> bool {

From 08432c892758a06a6bab9fa0584effb7e7881303 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Sun, 18 Jan 2026 22:49:37 +0100
Subject: [PATCH 2/4] Optimize small input path for is_ascii on x86_64

For inputs smaller than 32 bytes, use usize-at-a-time processing
instead of calling the SSE2 function. This avoids the function call
overhead that arises because `#[target_feature(enable = "sse2")]`
prevents `is_ascii_sse2` from being inlined into `is_ascii`.

Also moves CHUNK_SIZE to module level so it can be shared between
is_ascii and is_ascii_sse2.
---
 library/core/src/slice/ascii.rs | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index c9e168d6cbf83..25b8a10af3555 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -460,6 +460,10 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
+/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+const CHUNK_SIZE: usize = 32;
+
 /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
 /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
 ///
@@ -470,8 +474,6 @@ const fn is_ascii(s: &[u8]) -> bool {
 unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
-    const CHUNK_SIZE: usize = 32;
-
     let mut i = 0;
 
     while i + CHUNK_SIZE <= bytes.len() {
@@ -518,11 +520,27 @@ unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool {
 #[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)]
 const fn is_ascii(bytes: &[u8]) -> bool {
+    const USIZE_SIZE: usize = size_of::<usize>();
+    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;
+
     const_eval_select!(
         @capture { bytes: &[u8] } -> bool:
         if const {
             is_ascii_simple(bytes)
         } else {
+            // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
+            if bytes.len() < CHUNK_SIZE {
+                let chunks = bytes.chunks_exact(USIZE_SIZE);
+                let remainder = chunks.remainder();
+                for chunk in chunks {
+                    let word = usize::from_ne_bytes(chunk.try_into().unwrap());
+                    if (word & NONASCII_MASK) != 0 {
+                        return false;
+                    }
+                }
+                return remainder.iter().all(|b| b.is_ascii());
+            }
+
             // SAFETY: SSE2 is guaranteed available on x86_64
             unsafe { is_ascii_sse2(bytes) }
         }

From c609cce8cf6c6a5a1b6e081a671c7ba9f1165fdd Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Thu, 22 Jan 2026 22:18:00 +0100
Subject: [PATCH 3/4] Merge is_ascii codegen tests using revisions

Combine the x86_64 and loongarch64 is_ascii tests into a single file
using compiletest revisions. Both now test assembly output:

- X86_64: Verifies that no kshiftrd/kshiftrq instructions are emitted
  (AVX-512 fix)
- LA64: Verifies that the vmskltz.b instruction is used
  (auto-vectorization)
---
 tests/assembly-llvm/slice-is-ascii-avx512.rs | 18 -----------
 tests/assembly-llvm/slice-is-ascii.rs        | 32 ++++++++++++++++++++
 tests/codegen-llvm/slice-is-ascii.rs         | 19 ------------
 3 files changed, 32 insertions(+), 37 deletions(-)
 delete mode 100644 tests/assembly-llvm/slice-is-ascii-avx512.rs
 create mode 100644 tests/assembly-llvm/slice-is-ascii.rs
 delete mode 100644 tests/codegen-llvm/slice-is-ascii.rs

diff --git a/tests/assembly-llvm/slice-is-ascii-avx512.rs b/tests/assembly-llvm/slice-is-ascii-avx512.rs
deleted file mode 100644
index d3a441fec96cb..0000000000000
--- a/tests/assembly-llvm/slice-is-ascii-avx512.rs
+++ /dev/null
@@ -1,18 +0,0 @@
-//@ only-x86_64
-//@ compile-flags: -C opt-level=3 -C target-cpu=znver4
-//@ compile-flags: -C llvm-args=-x86-asm-syntax=intel
-//@ assembly-output: emit-asm
-#![crate_type = "lib"]
-
-// Verify is_ascii uses pmovmskb/vpmovmskb instead of kshiftrd with AVX-512.
-// The fix uses explicit SSE2 intrinsics to avoid LLVM's broken auto-vectorization.
-//
-// See: https://github.com/rust-lang/rust/issues/129293
-
-// CHECK-LABEL: test_is_ascii
-#[no_mangle]
-pub fn test_is_ascii(s: &[u8]) -> bool {
-    // CHECK-NOT: kshiftrd
-    // CHECK-NOT: kshiftrq
-    s.is_ascii()
-}
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
new file mode 100644
index 0000000000000..d01b321bf460a
--- /dev/null
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -0,0 +1,32 @@
+//@ revisions: X86_64 LA64
+//@ assembly-output: emit-asm
+//@ compile-flags: -C opt-level=3
+//
+//@ [X86_64] only-x86_64
+//@ [X86_64] compile-flags: -C target-cpu=znver4
+//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
+//
+//@ [LA64] only-loongarch64
+
+#![crate_type = "lib"]
+
+/// Verify `is_ascii` generates efficient code on different architectures:
+///
+/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
+///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
+///   See: https://github.com/llvm/llvm-project/issues/176906
+///
+/// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
+///   This architecture still relies on LLVM auto-vectorization.
+
+// X86_64-LABEL: test_is_ascii
+// X86_64-NOT: kshiftrd
+// X86_64-NOT: kshiftrq
+
+// LA64-LABEL: test_is_ascii
+// LA64: vmskltz.b
+
+#[no_mangle]
+pub fn test_is_ascii(s: &[u8]) -> bool {
+    s.is_ascii()
+}
diff --git a/tests/codegen-llvm/slice-is-ascii.rs b/tests/codegen-llvm/slice-is-ascii.rs
deleted file mode 100644
index 1f41b69e43966..0000000000000
--- a/tests/codegen-llvm/slice-is-ascii.rs
+++ /dev/null
@@ -1,19 +0,0 @@
-//@ only-loongarch64
-//@ compile-flags: -C opt-level=3
-#![crate_type = "lib"]
-
-/// Check that the fast-path of `is_ascii` uses a `vmskltz.b` instruction.
-/// Platforms lacking an equivalent instruction use other techniques for
-/// optimizing `is_ascii`.
-///
-/// Note: x86_64 uses explicit SSE2 intrinsics instead of relying on
-/// auto-vectorization. See `slice-is-ascii-avx512.rs`.
-// CHECK-LABEL: @is_ascii_autovectorized
-#[no_mangle]
-pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
-    // CHECK: load <32 x i8>
-    // CHECK-NEXT: icmp slt <32 x i8>
-    // CHECK-NEXT: bitcast <32 x i1>
-    // CHECK-NEXT: icmp eq i32
-    s.is_ascii()
-}

From 890c0fd4e8a3cbd49b761db2432114b147ae3487 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Thu, 22 Jan 2026 22:41:57 +0100
Subject: [PATCH 4/4] Make is_ascii_sse2 a safe function

Remove the `#[target_feature(enable = "sse2")]` attribute and make the
function safe to call. The SSE2 requirement is already enforced by the
`#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]`
predicate, so the function is only compiled when SSE2 is enabled.

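Because the function no longer carries the `target_feature` attribute,
the `_mm_or_si128` and `_mm_movemask_epi8` calls are now wrapped in
individual unsafe blocks with appropriate SAFETY comments.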

Also adds FIXME reference to llvm#176906 for tracking when this
workaround can be removed.
---
 library/core/src/slice/ascii.rs | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 25b8a10af3555..459c826f40646 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -467,11 +467,9 @@ const CHUNK_SIZE: usize = 32;
 /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
 /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
 ///
-/// # Safety
-/// Requires SSE2 support (guaranteed on x86_64).
+/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-#[target_feature(enable = "sse2")]
-unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool {
+fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
     let mut i = 0;
@@ -487,12 +485,14 @@ unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool {
         // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
         let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
 
-        // OR them together - if any byte has the high bit set, the result will too
-        let combined = _mm_or_si128(chunk1, chunk2);
+        // OR them together - if any byte has the high bit set, the result will too.
+        // SAFETY: SSE2 is guaranteed by the cfg predicate.
+        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
 
         // Create a mask from the MSBs of each byte.
         // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        let mask = _mm_movemask_epi8(combined);
+        // SAFETY: SSE2 is guaranteed by the cfg predicate.
+        let mask = unsafe { _mm_movemask_epi8(combined) };
 
         if mask != 0 {
             return false;
@@ -541,8 +541,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
                 return remainder.iter().all(|b| b.is_ascii());
             }
 
-            // SAFETY: SSE2 is guaranteed available on x86_64
-            unsafe { is_ascii_sse2(bytes) }
+            is_ascii_sse2(bytes)
         }
     )
 }