Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions lib/vm/src/trap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ mod traphandlers;

pub use trap::Trap;
pub use traphandlers::{
TrapHandlerFn, VMConfig, catch_traps, on_host_stack, raise_lib_trap, raise_user_trap,
set_stack_size, wasmer_call_trampoline,
MAX_STACK_SIZE, TrapHandlerFn, VMConfig, catch_traps, get_stack_size, get_thread_stack_size,
on_host_stack, raise_lib_trap, raise_user_trap, set_stack_size, set_thread_stack_size,
wasmer_call_trampoline,
};
pub use traphandlers::{init_traps, resume_panic};
pub use wasmer_types::TrapCode;
284 changes: 267 additions & 17 deletions lib/vm/src/trap/traphandlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,47 @@ struct ucontext_t {
#[cfg(all(unix, not(all(target_arch = "aarch64", target_os = "macos"))))]
use libc::ucontext_t;

/// Maximum allowed stack size for a Wasmer coroutine stack (100 MB).
///
/// Both the process-wide default (`set_stack_size`) and the per-thread
/// override (`set_thread_stack_size`) clamp their argument to this upper
/// bound, so no coroutine stack can exceed it.
pub const MAX_STACK_SIZE: usize = 100 * 1024 * 1024;

/// Sets the process-wide default stack size for new Wasmer coroutines.
///
/// The value is clamped to the range [8 KB, `MAX_STACK_SIZE`] so callers
/// can neither configure a stack too small to run on nor a pathologically
/// large one. Relaxed ordering suffices: this is a standalone configuration
/// value with no ordering relationship to other memory.
pub fn set_stack_size(size: usize) {
    DEFAULT_STACK_SIZE.store(size.clamp(8 * 1024, MAX_STACK_SIZE), Ordering::Relaxed);
}

/// Reports the current process-wide default coroutine stack size, in bytes.
///
/// This is the value most recently installed via `set_stack_size` (or the
/// built-in default if it was never changed).
pub fn get_stack_size() -> usize {
    DEFAULT_STACK_SIZE.load(Ordering::Relaxed)
}

thread_local! {
    /// Per-thread override for the coroutine stack size. When `Some`, this
    /// takes precedence over the process-wide `DEFAULT_STACK_SIZE` in
    /// `catch_traps`. This allows a single call site (e.g. the Stylus retry
    /// loop) to request a larger stack without affecting other threads.
    /// Written by `set_thread_stack_size`, read by `get_thread_stack_size`;
    /// a per-call `VMConfig.wasm_stack_size` still takes precedence over it.
    static STACK_SIZE_OVERRIDE: Cell<Option<usize>> = const { Cell::new(None) };
}

/// Installs (or clears) the thread-local stack size override.
///
/// While the override is `Some`, every Wasmer coroutine created on this
/// thread uses the given size instead of the process-wide default; pass
/// `None` to fall back to the default again. The requested size is clamped
/// to [8 KB, MAX_STACK_SIZE], mirroring the behavior of `set_stack_size`.
pub fn set_thread_stack_size(size: Option<usize>) {
    let clamped = size.map(|requested| requested.clamp(8 * 1024, MAX_STACK_SIZE));
    STACK_SIZE_OVERRIDE.with(|cell| cell.set(clamped));
}

/// Returns the current thread-local stack size override, if any.
pub fn get_thread_stack_size() -> Option<usize> {
STACK_SIZE_OVERRIDE.with(|cell| cell.get())
}

/// Pool of pre-allocated coroutine stacks to avoid repeated mmap syscalls.
/// Each entry is tagged with the size it was allocated at so that callers
/// requesting a larger stack can skip undersized entries instead of reusing them.
/// `SegQueue` is a lock-free MPMC queue, so pushes/pops may happen
/// concurrently from any thread without additional synchronization.
static STACK_POOL: LazyLock<crossbeam_queue::SegQueue<(DefaultStack, usize)>> =
    LazyLock::new(crossbeam_queue::SegQueue::new);

cfg_if::cfg_if! {
if #[cfg(unix)] {
/// Function which may handle custom signals while processing traps.
Expand Down Expand Up @@ -725,6 +761,15 @@ pub unsafe fn wasmer_call_trampoline(
}
}

/// Resolves the effective stack size from the three-tier priority chain:
/// per-call VMConfig > thread-local override > process-wide default.
fn resolve_stack_size(config: &VMConfig) -> usize {
config
.wasm_stack_size
.or_else(get_thread_stack_size)
.unwrap_or_else(|| DEFAULT_STACK_SIZE.load(Ordering::Relaxed))
}

/// Catches any wasm traps that happen within the execution of `closure`,
/// returning them as a `Result`.
///
Expand All @@ -741,9 +786,7 @@ where
{
// Ensure that per-thread initialization is done.
lazy_per_thread_init()?;
let stack_size = config
.wasm_stack_size
.unwrap_or_else(|| DEFAULT_STACK_SIZE.load(Ordering::Relaxed));
let stack_size = resolve_stack_size(config);
on_wasm_stack(stack_size, trap_handler, closure).map_err(UnwindReason::into_trap)
}

Expand Down Expand Up @@ -971,20 +1014,35 @@ fn on_wasm_stack<F: FnOnce() -> T + 'static, T: 'static>(
trap_handler: Option<*const TrapHandlerFn<'static>>,
f: F,
) -> Result<T, UnwindReason> {
// Allocating a new stack is pretty expensive since it involves several
// system calls. We therefore keep a cache of pre-allocated stacks which
// allows them to be reused multiple times.
// FIXME(Amanieu): We should refactor this to avoid the lock.
static STACK_POOL: LazyLock<crossbeam_queue::SegQueue<DefaultStack>> =
LazyLock::new(crossbeam_queue::SegQueue::new);

let stack = STACK_POOL
.pop()
.unwrap_or_else(|| DefaultStack::new(stack_size).unwrap());
let mut stack = scopeguard::guard(stack, |stack| STACK_POOL.push(stack));
// Try to reuse a pooled stack that is large enough. Undersized stacks are
// collected and returned to the pool after the search so we don't re-pop
// the same entry in a loop.
let stack = {
let mut found = None;
let mut skipped = Vec::new();
while let Some((s, sz)) = STACK_POOL.pop() {
if sz >= stack_size {
found = Some((s, sz));
break;
}
skipped.push((s, sz));
}
for entry in skipped {
STACK_POOL.push(entry);
}
match found {
Some(entry) => entry,
None => {
let s = DefaultStack::new(stack_size)
.map_err(|_| UnwindReason::LibTrap(Trap::oom()))?;
(s, stack_size)
}
}
};
let mut stack = scopeguard::guard(stack, |entry| STACK_POOL.push(entry));

// Create a coroutine with a new stack to run the function on.
let coro = ScopedCoroutine::with_stack(&mut *stack, move |yielder, ()| {
let coro = ScopedCoroutine::with_stack(&mut stack.0, move |yielder, ()| {
// Save the yielder to TLS so that it can be used later.
YIELDER.with(|cell| cell.set(Some(yielder.into())));

Expand Down Expand Up @@ -1174,3 +1232,195 @@ pub fn lazy_per_thread_init() -> Result<(), Trap> {
}
}
}

#[cfg(test)]
mod tests {
    //! Unit tests for the stack-size configuration API: thread-local
    //! isolation of the override, independence from the process-wide
    //! default, the size-tagged stack pool search, and the three-tier
    //! priority chain used by `catch_traps`.
    use super::*;
    use std::sync::{Arc, Barrier};

    #[test]
    fn thread_local_override_is_isolated() {
        // Each thread's override must be invisible to other threads.
        let original = get_stack_size();
        // 3 parties: t1, t2, and the main thread all rendezvous twice.
        let barrier = Arc::new(Barrier::new(3));

        let b1 = barrier.clone();
        let t1 = std::thread::spawn(move || {
            assert_eq!(get_thread_stack_size(), None);
            set_thread_stack_size(Some(2 * 1024 * 1024));
            assert_eq!(get_thread_stack_size(), Some(2 * 1024 * 1024));
            b1.wait(); // sync: all threads have set their overrides
            // Still our own value — not polluted by t2.
            assert_eq!(get_thread_stack_size(), Some(2 * 1024 * 1024));
            b1.wait(); // sync: all threads have verified
            set_thread_stack_size(None);
            assert_eq!(get_thread_stack_size(), None);
        });

        let b2 = barrier.clone();
        let t2 = std::thread::spawn(move || {
            assert_eq!(get_thread_stack_size(), None);
            set_thread_stack_size(Some(4 * 1024 * 1024));
            assert_eq!(get_thread_stack_size(), Some(4 * 1024 * 1024));
            b2.wait(); // sync
            // Still our own value — not polluted by t1.
            assert_eq!(get_thread_stack_size(), Some(4 * 1024 * 1024));
            b2.wait(); // sync
            set_thread_stack_size(None);
            assert_eq!(get_thread_stack_size(), None);
        });

        // Main thread: no override set, should see None throughout.
        barrier.wait(); // sync: t1 and t2 have set overrides
        assert_eq!(get_thread_stack_size(), None);
        barrier.wait(); // sync: let threads verify

        t1.join().unwrap();
        t2.join().unwrap();

        // Global default must be untouched.
        assert_eq!(get_stack_size(), original);
    }

    #[test]
    fn thread_local_override_does_not_affect_global() {
        let original = get_stack_size();

        set_thread_stack_size(Some(8 * 1024 * 1024));
        // Global is unchanged.
        assert_eq!(get_stack_size(), original);
        assert_eq!(get_thread_stack_size(), Some(8 * 1024 * 1024));

        set_thread_stack_size(None);
        assert_eq!(get_stack_size(), original);
        assert_eq!(get_thread_stack_size(), None);
    }

    #[test]
    fn concurrent_retries_do_not_interfere() {
        // Simulate the stylus_call retry pattern on multiple threads:
        // each thread bumps its thread-local, "retries", then clears it.
        // No thread should see another thread's override.
        let original = get_stack_size();
        let num_threads = 8;
        let barrier = Arc::new(Barrier::new(num_threads));

        let handles: Vec<_> = (0..num_threads)
            .map(|i| {
                let b = barrier.clone();
                std::thread::spawn(move || {
                    let my_size = (i + 1) * 1024 * 1024; // 1MB, 2MB, ..., 8MB

                    // Phase 1: all threads set different overrides simultaneously.
                    set_thread_stack_size(Some(my_size));
                    b.wait();

                    // Phase 2: verify each thread still sees its own value.
                    let seen = get_thread_stack_size();
                    assert_eq!(
                        seen,
                        Some(my_size),
                        "thread {i} expected {my_size}, got {seen:?}"
                    );
                    b.wait();

                    // Phase 3: simulate "retry succeeded" — double and verify.
                    // Max doubled value is 16 MB, well under MAX_STACK_SIZE,
                    // so clamping never changes what we read back.
                    let doubled = my_size * 2;
                    set_thread_stack_size(Some(doubled));
                    b.wait();

                    let seen = get_thread_stack_size();
                    assert_eq!(
                        seen,
                        Some(doubled),
                        "thread {i} after doubling: expected {doubled}, got {seen:?}"
                    );
                    b.wait();

                    // Phase 4: clear (like the drop guard in stylus_call).
                    set_thread_stack_size(None);
                    assert_eq!(get_thread_stack_size(), None);
                })
            })
            .collect();

        for h in handles {
            h.join().unwrap();
        }

        // Global must be untouched after all threads finish.
        assert_eq!(get_stack_size(), original);
    }

    #[test]
    fn pool_returns_correctly_sized_stacks() {
        // Push stacks of different sizes into the pool, then verify that
        // on_wasm_stack picks one that is large enough.
        // NOTE(review): this test mutates the process-global STACK_POOL;
        // it stays sound only while no other test concurrently uses the
        // pool (currently none do).
        let small = 64 * 1024;
        let large = 2 * 1024 * 1024;

        // Seed the pool with a small stack.
        let s = DefaultStack::new(small).unwrap();
        STACK_POOL.push((s, small));

        // Request a large stack — the small one should be skipped.
        // We can't directly call on_wasm_stack (it needs trap init), but we
        // can test the pool search logic by replicating it.
        let mut skipped = Vec::new();
        let mut found = None;
        while let Some((s, sz)) = STACK_POOL.pop() {
            if sz >= large {
                found = Some((s, sz));
                break;
            }
            skipped.push((s, sz));
        }
        // Return undersized entries so the pool is left as we found it.
        for entry in skipped {
            STACK_POOL.push(entry);
        }

        // Should not have found anything large enough.
        assert!(found.is_none(), "pool should not have a stack >= {large}");

        // The small stack should still be in the pool.
        let entry = STACK_POOL.pop();
        assert!(entry.is_some(), "small stack should still be pooled");
        let (_, sz) = entry.unwrap();
        assert_eq!(sz, small);
    }

    #[test]
    fn resolve_stack_size_priority_chain() {
        let global_default = get_stack_size();

        // Case 1: no VMConfig override, no thread-local → uses global default.
        set_thread_stack_size(None);
        let config = VMConfig {
            wasm_stack_size: None,
        };
        assert_eq!(resolve_stack_size(&config), global_default);

        // Case 2: thread-local set, VMConfig None → thread-local wins.
        set_thread_stack_size(Some(4 * 1024 * 1024));
        let config = VMConfig {
            wasm_stack_size: None,
        };
        assert_eq!(resolve_stack_size(&config), 4 * 1024 * 1024);

        // Case 3: both thread-local and VMConfig set → VMConfig wins.
        let config = VMConfig {
            wasm_stack_size: Some(2 * 1024 * 1024),
        };
        assert_eq!(resolve_stack_size(&config), 2 * 1024 * 1024);

        // Case 4: VMConfig set, no thread-local → VMConfig wins.
        set_thread_stack_size(None);
        let config = VMConfig {
            wasm_stack_size: Some(6 * 1024 * 1024),
        };
        assert_eq!(resolve_stack_size(&config), 6 * 1024 * 1024);

        // Cleanup.
        set_thread_stack_size(None);
    }
}
Loading