diff --git a/src/lib.rs b/src/lib.rs index 2804e8a..03897a9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,11 +117,22 @@ static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0); /// the arena, so library state that outlives a phase doesn't land in /// recycled memory. /// -/// Defaults to 4096 (one page) — covers the known phase-crossing patterns: -/// crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber Registry -/// slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core job -/// stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override +/// Defaults to 4096 (one page) on x86_64 — covers the known phase-crossing +/// patterns: crossbeam_deque::Injector blocks (~1.5 KB), tracing-subscriber +/// Registry slot data (sub-KB), hashbrown HashMap entries (sub-KB), rayon-core +/// job stack frames (sub-KB). Set ZK_ALLOC_MIN_BYTES=0 to disable, or override /// to a different threshold. +/// +/// On aarch64 the default drops to 256: with the iter-8 32 MiB-THP-backed +/// arena, allocs in the arena hit a hugepage TLB entry while System allocs +/// land on 16 KiB base pages, so routing the 256..4095 size band into the +/// arena buys the hugepage TLB benefit for those allocations (-1.30% of +/// cycles in glibc helpers observed on the original zk-alloc M2 profile). +/// The rayon-flush feature (default-on) keeps Injector blocks safe across +/// phase boundaries; sticky-System realloc protects grown Vecs. +#[cfg(target_arch = "aarch64")] +const DEFAULT_MIN_ARENA_BYTES: usize = 256; +#[cfg(not(target_arch = "aarch64"))] const DEFAULT_MIN_ARENA_BYTES: usize = 4096; static MIN_ARENA_BYTES: AtomicUsize = AtomicUsize::new(DEFAULT_MIN_ARENA_BYTES); @@ -159,16 +170,81 @@ fn ensure_region() -> usize { let max_threads = cpus + SLACK; let region_size = slab_size * max_threads; + // On aarch64 Linux (M2/Asahi) THP page size is 32 MiB. 
Over-allocate by + // THP_SIZE so we can round REGION_BASE up to a 32 MiB boundary — required + // for khugepaged to collapse base pages into hugepages. With alignment + // + an eager touch (one write per 32 MiB) the kernel collapses the touched + // region into THP synchronously instead of relying on async khugepaged. + #[cfg(target_arch = "aarch64")] + const THP_SIZE: usize = 32 << 20; + + #[cfg(target_arch = "aarch64")] + let mmap_size = region_size + THP_SIZE; + #[cfg(not(target_arch = "aarch64"))] + let mmap_size = region_size; // SAFETY: mmap_anonymous returns a page-aligned pointer or null. // MAP_NORESERVE means no physical memory is committed until pages are touched. - let ptr = unsafe { syscall::mmap_anonymous(region_size) }; - if ptr.is_null() { + let raw = unsafe { syscall::mmap_anonymous(mmap_size) }; + if raw.is_null() { std::process::abort(); } - unsafe { syscall::madvise(ptr, region_size, syscall::MADV_NOHUGEPAGE) }; + + #[cfg(target_arch = "aarch64")] + let aligned_base = (raw as usize).next_multiple_of(THP_SIZE); + #[cfg(not(target_arch = "aarch64"))] + let aligned_base = raw as usize; + + // On aarch64, ask khugepaged to use THP for the slab region. On x86_64 + // preserve the historical NOHUGEPAGE hint (2 MiB THP can fragment slab + // release; documented original choice). + #[cfg(target_arch = "aarch64")] + let advice = syscall::MADV_HUGEPAGE; + #[cfg(not(target_arch = "aarch64"))] + let advice = syscall::MADV_NOHUGEPAGE; + unsafe { syscall::madvise(aligned_base as *mut u8, region_size, advice) }; + + // Eager pre-touch on aarch64: write one byte per 32 MiB hugepage across + // the first `pretouch_bytes` of every per-thread slab. Each write triggers + // a page fault that the kernel resolves into a 32 MiB THP given our + // MADV_HUGEPAGE hint and the 32 MiB-aligned base. Makes the THP win + // deterministic instead of khugepaged-async-dependent. 
+ // + // Adapt `pretouch_bytes` to MemTotal so total pre-touch stays under + // MemTotal / OVERCOMMIT_GUARD (= 1/3 of RAM): on a 16 GiB Asahi M2 box, + // a hard-coded 1 GiB × 14 slabs = 14 GiB pre-touch over-commits and gets + // OOM-killed. Formula gives ~390 MiB per slab at 16 GiB, ~1 GiB at 64 GiB. + // Floor at THP_SIZE so we still pre-touch at least one hugepage if + // `total_ram_bytes()` returns 0 (stub or syscall failure). + #[cfg(target_arch = "aarch64")] + { + const PRETOUCH_HARD_CAP: usize = 1 << 30; + const OVERCOMMIT_GUARD: usize = 3; + // SAFETY: total_ram_bytes is allocation-free on platforms with a real + // impl, and the libc-fallback stub returns 0 without allocating. + let mem_total = unsafe { syscall::total_ram_bytes() }; + let pretouch_bytes = if mem_total == 0 { + THP_SIZE + } else { + let budget = mem_total / max_threads / OVERCOMMIT_GUARD; + budget.clamp(THP_SIZE, PRETOUCH_HARD_CAP) + }; + for slab_idx in 0..max_threads { + let slab_base = aligned_base + slab_idx * slab_size; + let mut off = 0; + while off < pretouch_bytes { + // SAFETY: aligned_base..aligned_base+region_size is a valid + // anonymous mmap reservation; we only touch within slab. + unsafe { + std::ptr::write_volatile((slab_base + off) as *mut u8, 0); + } + off += THP_SIZE; + } + } + } + MAX_THREADS.store(max_threads, Ordering::Release); REGION_SIZE.store(region_size, Ordering::Release); - REGION_BASE.store(ptr as usize, Ordering::Release); + REGION_BASE.store(aligned_base, Ordering::Release); }); REGION_BASE.load(Ordering::Acquire) } diff --git a/src/syscall.rs b/src/syscall.rs index f676b2a..a873517 100644 --- a/src/syscall.rs +++ b/src/syscall.rs @@ -96,6 +96,7 @@ mod imp { mod imp { use std::ptr; + pub const MADV_HUGEPAGE: usize = 14; pub const MADV_NOHUGEPAGE: usize = 15; #[inline] @@ -117,6 +118,18 @@ mod imp { pub unsafe fn madvise(_ptr: *mut u8, _size: usize, _advice: usize) { // The advice values we pass are Linux-specific. 
} + + /// Conservative stub: returns 0 to signal "unknown". Real allocation-free + /// implementations (sysinfo syscall on aarch64-Linux, sysctl on macOS) live + /// behind their own raw-syscall imp blocks. With 0, the lib.rs adaptive + /// pre-touch falls back to a single hugepage per slab — safe (no OOM) + /// but loses the full THP-coverage win of iter 8. + #[inline] + pub unsafe fn total_ram_bytes() -> usize { + 0 + } } pub use imp::{madvise, mmap_anonymous, MADV_NOHUGEPAGE}; +#[cfg(target_arch = "aarch64")] +pub use imp::{total_ram_bytes, MADV_HUGEPAGE};