diff --git a/.github/workflows/Linux-libretro-common-samples.yml b/.github/workflows/Linux-libretro-common-samples.yml index 88e5989f8e4d..378e02809197 100644 --- a/.github/workflows/Linux-libretro-common-samples.yml +++ b/.github/workflows/Linux-libretro-common-samples.yml @@ -80,6 +80,7 @@ jobs: task_queue_title_error_test tpool_wait_test retro_atomic_test + retro_spsc_test ) # Per-binary run command (overrides ./ if present). @@ -289,6 +290,31 @@ jobs: TSAN_OPTIONS=halt_on_error=1 ./retro_atomic_test + - name: Run retro_spsc_test under Clang + ThreadSanitizer + shell: bash + working-directory: libretro-common/samples/queues/retro_spsc_test + run: | + # retro_spsc.c is a lock-free SPSC byte queue built on + # retro_atomic.h. Its correctness contract is acquire-load + # / release-store on the head and tail cursors, with the + # buffer reads/writes between them ordered by those barriers. + # Missing or weakened barriers produce torn data on the + # consumer side, observable as content mismatches in the + # stress harness AND as TSan-reported races. The default + # ASan/UBSan pass above catches the content mismatches but + # not the races; this lane catches both. + # + # halt_on_error=1 makes TSan exit non-zero on the first race + # rather than continuing -- which is what we want for CI: + # any race here means the SPSC contract is broken. + set -u + set -o pipefail + + make clean + CC=clang make all SANITIZER=thread + + TSAN_OPTIONS=halt_on_error=1 ./retro_spsc_test + # Cross-architecture validation lane for retro_atomic_test. 
# # The samples job above runs on x86_64, which is a strongly-ordered diff --git a/.github/workflows/Linux-samples-gfx.yml b/.github/workflows/Linux-samples-gfx.yml index c6ab6dbb5cbc..d2b28798f6f5 100644 --- a/.github/workflows/Linux-samples-gfx.yml +++ b/.github/workflows/Linux-samples-gfx.yml @@ -205,3 +205,86 @@ jobs: test -x vulkan_ctx_double_free_test timeout 60 ./vulkan_ctx_double_free_test echo "[pass] vulkan_ctx_double_free_test" + + - name: Build and run gfx_thumbnail_status_atomic_test (TSan) + shell: bash + working-directory: samples/gfx/gfx_thumbnail_status_atomic + run: | + set -eu + # Regression test for the cross-thread thumbnail status + # synchronisation in gfx/gfx_thumbnail.c. The .status + # field is read by the video thread (gfx_thumbnail_draw) + # and written by the upload-callback thread + # (gfx_thumbnail_handle_upload), with a release-store / + # acquire-load pairing that guards the texture / width / + # height fields published alongside the AVAILABLE + # status. Pre-port the file used a hand-rolled + # __atomic_* / _Interlocked / volatile shim. Post-port + # the .status field is retro_atomic_int_t and the + # accesses route through retro_atomic_*_int macros. + # + # ThreadSanitizer is the right validator for this test: + # it instruments every atomic load and store and would + # flag a missing-barrier regression as a data race, even + # on x86 TSO where the hardware otherwise hides the bug. + # ASan/UBSan would not catch barrier removal at all + # (no out-of-bounds access, no UB). The test also + # carries static-assertion checks that validate the + # struct layout invariant (.status size == sizeof(int)) + # so the field's atomic type does not change the + # gfx_thumbnail_t ABI. If gfx_thumbnail.c amends + # GFX_THUMB_STATUS_STORE / GFX_THUMB_STATUS_LOAD or the + # .status field type, the verbatim copies in + # gfx_thumbnail_status_atomic_test.c must follow. 
+ make clean all SANITIZER=thread + test -x gfx_thumbnail_status_atomic_test + TSAN_OPTIONS=halt_on_error=1 timeout 60 \ + ./gfx_thumbnail_status_atomic_test + echo "[pass] gfx_thumbnail_status_atomic_test" + + - name: Build and run gfx_widgets_msg_queue_race_test (TSan) + shell: bash + working-directory: samples/gfx/gfx_widgets_msg_queue_race + run: | + set -eu + # Regression test for the producer-producer race fix on + # dispgfx_widget_t::msg_queue in gfx/gfx_widgets.c. + # Pre-fix the FIFO had no dedicated lock: producers could + # be invoked from any thread (the threaded task system at + # libretro-common/queues/task_queue.c runs a worker + # thread; gfx/video_driver.c::video_driver_frame released + # RUNLOOP_MSG_QUEUE_LOCK before calling the producer, so + # main-thread producers ran unlocked) yet there was no + # lock dedicated to msg_queue itself. Post-fix a + # msg_queue_lock is acquired around every fifo_write (in + # gfx_widgets_msg_queue_push), every fifo_read (in + # gfx_widgets_iterate), and the avail-check that gates + # them. The outer fast-path FIFO_*_AVAIL checks that + # used to wrap the function bodies were dropped: reading + # the FIFO cursors outside the lock is itself a data + # race against the locked writes (TSan-detectable; + # benign on x86 TSO but real on weak-memory hardware). + # + # ThreadSanitizer is the right validator: it instruments + # every memory access on fifo_buffer_t::end and + # ::first and would flag a missing-lock regression as a + # data race on the producer side, the consumer side, or + # the avail check. ASan/UBSan would not catch the lock + # removal at all. halt_on_error=1 makes TSan exit + # non-zero on the first observed race, which is what we + # want for CI: any race here means the production + # locking discipline is broken. 
+ # + # The test mirrors the post-fix locking protocol of + # gfx_widgets_msg_queue_push and gfx_widgets_iterate's + # FIFO interaction; the widget allocation, font + # measurement, and animation push that wrap the actual + # FIFO calls in production are not part of the race and + # are stripped from the test. If gfx_widgets.c amends + # the locking around msg_queue, the verbatim copies in + # gfx_widgets_msg_queue_race_test.c must follow. + make clean all SANITIZER=thread + test -x gfx_widgets_msg_queue_race_test + TSAN_OPTIONS=halt_on_error=1 timeout 60 \ + ./gfx_widgets_msg_queue_race_test + echo "[pass] gfx_widgets_msg_queue_race_test" diff --git a/Makefile.common b/Makefile.common index a7cc46b1721b..291741557a16 100644 --- a/Makefile.common +++ b/Makefile.common @@ -364,6 +364,7 @@ OBJ += \ input/input_autodetect_builtin.o \ input/input_keymaps.o \ $(LIBRETRO_COMM_DIR)/queues/fifo_queue.o \ + $(LIBRETRO_COMM_DIR)/queues/retro_spsc.o \ $(LIBRETRO_COMM_DIR)/compat/compat_fnmatch.o \ $(LIBRETRO_COMM_DIR)/compat/compat_posix_string.o diff --git a/audio/drivers/asio.c b/audio/drivers/asio.c index 0b7fa9a39bcc..73652da99290 100644 --- a/audio/drivers/asio.c +++ b/audio/drivers/asio.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #ifdef HAVE_THREADS #include @@ -644,7 +644,26 @@ static void *asio_load_driver(const CLSID *clsid) typedef struct ra_asio { void *iasio; /* COM interface pointer */ - fifo_buffer_t *ring; /* Ring buffer between write() and callback */ + /* Lock-free SPSC ring buffer between asio_write (producer, main + * thread) and asio_cb_buffer_switch (consumer, ASIO callback + * thread). Pre-port this used fifo_buffer_t with no surrounding + * lock, which was a real cross-thread race on first/end -- see + * commit message. 
retro_spsc_t is the SPSC primitive designed + * for this exact pattern: lock-free (avoiding the priority- + * inversion concern of locking from a real-time audio callback), + * acquire/release ordering on the cursors, single-producer / + * single-consumer enforced by the type contract. + * + * Embedded by value (not via pointer) so the lifetime exactly + * tracks ra_asio_t. Initialised with retro_spsc_init in + * ra_asio_init / ra_asio_init_via_persistent and freed with + * retro_spsc_free in the corresponding teardown paths. The + * `ring_initialized` flag below distinguishes "never-initialised" + * from "init succeeded" so cleanup paths know whether to call + * retro_spsc_free. retro_spsc_t doesn't carry that bit + * internally because most callers know their own lifecycle. */ + retro_spsc_t ring; + bool ring_initialized; #ifdef HAVE_THREADS scond_t *cond; slock_t *cond_lock; @@ -712,7 +731,11 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, long i; void *buf_l = ad->buf_info[0].buffers[index]; void *buf_r = ad->buf_info[1].buffers[index]; - size_t avail = FIFO_READ_AVAIL(ad->ring); + /* Acquire-load on the producer's head cursor. Pairs with the + * release-store inside retro_spsc_write that asio_write + * issues on the main thread, so the bytes we're about to read + * out via retro_spsc_read are guaranteed visible. 
*/ + size_t avail = retro_spsc_read_avail(&ad->ring); long have = (long)(avail / (2 * sizeof(float))); if (have > frames) @@ -727,7 +750,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, float tmp[2]; for (i = 0; i < have; i++) { - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); dl[i] = tmp[0]; dr[i] = tmp[1]; } @@ -742,7 +765,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, float tmp[2]; for (i = 0; i < have; i++) { - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); dl[i] = (double)tmp[0]; dr[i] = (double)tmp[1]; } @@ -757,7 +780,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, float tmp[2]; for (i = 0; i < have; i++) { - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); dl[i] = (int32_t)((double)tmp[0] * 2147483647.0); dr[i] = (int32_t)((double)tmp[1] * 2147483647.0); } @@ -773,7 +796,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, for (i = 0; i < have; i++) { int32_t l, r; - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); l = (int32_t)(tmp[0] * 8388607.0f); r = (int32_t)(tmp[1] * 8388607.0f); l = l > 8388607 ? 8388607 : (l < -8388608 ? -8388608 : l); @@ -797,7 +820,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, for (i = 0; i < have; i++) { int32_t l, r; - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); l = (int32_t)(tmp[0] * 32767.0f); r = (int32_t)(tmp[1] * 32767.0f); dl[i] = (int16_t)(l > 32767 ? 32767 : (l < -32768 ? 
-32768 : l)); @@ -826,7 +849,7 @@ static void asio_cb_buffer_switch(long index, { ra_asio_t *ad = g_asio; - if (!ad || !ad->ring || ad->is_paused || ad->shutdown) + if (!ad || !ad->ring_initialized || ad->is_paused || ad->shutdown) { if (ad && ad->buf_info[0].buffers[index]) { @@ -924,8 +947,11 @@ static void asio_atexit_cleanup(void) ASIO_CALL_RELEASE(ad->iasio); } - if (ad->ring) - fifo_free(ad->ring); + if (ad->ring_initialized) + { + retro_spsc_free(&ad->ring); + ad->ring_initialized = false; + } #ifdef HAVE_THREADS if (ad->cond_lock) @@ -991,7 +1017,11 @@ static void *ra_asio_init(const char *device, unsigned rate, if (new_rate) *new_rate = ad->sample_rate; - fifo_clear(ad->ring); + /* Discard any stale audio left over from the previous + * session. Safe here because the ASIO callback isn't + * running yet (g_asio is still NULL until the next line), + * so the SPSC is single-threaded at this point. */ + retro_spsc_clear(&ad->ring); g_asio = ad; ad->running = true; @@ -1158,14 +1188,18 @@ static void *ra_asio_init(const char *device, unsigned rate, /* Create ring buffer BEFORE ASIO buffers — the driver may issue * a bufferSwitch callback during ASIOCreateBuffers, and the - * callback needs the ring buffer to exist (even if empty). */ + * callback needs the ring buffer to exist (even if empty). + * retro_spsc_init rounds capacity up to a power of 2; the + * over-allocation is small (factor of < 2) and irrelevant to + * the ASIO latency calculation, which uses ad->buffer_frames + * not the ring's actual byte capacity. 
*/ ad->ring_size = pref_sz * 2 * sizeof(float) * ASIO_RING_MULT; - ad->ring = fifo_new(ad->ring_size); - if (!ad->ring) + if (!retro_spsc_init(&ad->ring, ad->ring_size)) { RARCH_ERR("[ASIO] Failed to create ring buffer.\n"); goto error; } + ad->ring_initialized = true; #ifdef HAVE_THREADS ad->cond = scond_new(); @@ -1229,8 +1263,11 @@ static void *ra_asio_init(const char *device, unsigned rate, ASIO_CALL_DISPOSE_BUFFERS(ad->iasio); ASIO_CALL_RELEASE(ad->iasio); } - if (ad->ring) - fifo_free(ad->ring); + if (ad->ring_initialized) + { + retro_spsc_free(&ad->ring); + ad->ring_initialized = false; + } #ifdef HAVE_THREADS if (ad->cond_lock) slock_free(ad->cond_lock); @@ -1259,17 +1296,22 @@ static ssize_t ra_asio_write(void *data, const void *buf, size_t len) if (ad->shutdown) return -1; - avail = FIFO_WRITE_AVAIL(ad->ring); + avail = retro_spsc_write_avail(&ad->ring); to_write = (len < avail) ? len : avail; /* Align to frame boundary (stereo float = 8 bytes) */ to_write = (to_write / 8) * 8; if (to_write > 0) { - fifo_write(ad->ring, src, to_write); - src += to_write; - len -= to_write; - written += to_write; + /* retro_spsc_write returns bytes actually written. We've + * already capped to_write by retro_spsc_write_avail above, + * so the return value will equal to_write -- but use it + * defensively in case the contract ever changes. */ + size_t actually_written = + retro_spsc_write(&ad->ring, src, to_write); + src += actually_written; + len -= actually_written; + written += actually_written; } else if (!ad->nonblock) { @@ -1346,9 +1388,14 @@ static void ra_asio_free(void *data) /* Detach from the callback — silence output while parked */ g_asio = NULL; - /* Flush stale audio */ - if (ad->ring) - fifo_clear(ad->ring); + /* No retro_spsc_clear here. The pre-port fifo_clear at this + * site was racy with stray ASIO callbacks that may still be + * running after ASIO_CALL_STOP returns (some drivers, notably + * ASIO4ALL, don't synchronously join their audio thread). 
+ * The restart path in ra_asio_init_via_persistent calls + * retro_spsc_clear anyway, so any stale data will be flushed + * before the next run -- after the callback is provably + * stopped (g_asio == NULL gates it). */ /* Store for reuse */ g_asio_persistent = ad; @@ -1361,9 +1408,9 @@ static bool ra_asio_use_float(void *data) { return true; } static size_t ra_asio_write_avail(void *data) { ra_asio_t *ad = (ra_asio_t *)data; - if (!ad || !ad->ring) + if (!ad || !ad->ring_initialized) return 0; - return FIFO_WRITE_AVAIL(ad->ring); + return retro_spsc_write_avail(&ad->ring); } static size_t ra_asio_buffer_size(void *data) diff --git a/audio/drivers/coreaudio_mic_macos.m b/audio/drivers/coreaudio_mic_macos.m index 77270a9348a6..e7cc7dccb277 100644 --- a/audio/drivers/coreaudio_mic_macos.m +++ b/audio/drivers/coreaudio_mic_macos.m @@ -333,7 +333,6 @@ void coreaudio_macos_microphone_free(void *data) static struct string_list *coreaudio_macos_microphone_device_list_new(const void *data) { - RARCH_LOG("[CoreAudio macOS Mic] device_list_new called.\n"); struct string_list *list = string_list_new(); if (!list) { @@ -462,8 +461,6 @@ void coreaudio_macos_microphone_free(void *data) static void coreaudio_macos_microphone_device_list_free(const void *data, struct string_list *list) { - (void)data; /* Not used in this implementation */ - RARCH_LOG("[CoreAudio macOS Mic] device_list_free called.\n"); if (list) { for (size_t i = 0; i < list->size; i++) @@ -818,7 +815,6 @@ static bool coreaudio_macos_microphone_mic_alive(const void *data, const void *m static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) { - RARCH_LOG("[CoreAudio macOS Mic] start_mic called. 
Mic context: %p\n", mic_data); coreaudio_macos_microphone_t *mic = (coreaudio_macos_microphone_t *)mic_data; if (!mic || !mic->audio_unit || !atomic_load(&mic->is_initialized)) { @@ -827,10 +823,7 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) } if (atomic_load(&mic->is_running)) - { - RARCH_LOG("[CoreAudio macOS Mic] Already running.\n"); return true; - } /* Check microphone permission on macOS */ RARCH_LOG("[CoreAudio macOS Mic] Checking microphone permission...\n"); @@ -849,7 +842,7 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) if (perm_status != noErr || default_input_device == kAudioObjectUnknown) { RARCH_ERR("[CoreAudio macOS Mic] No default input device available or permission denied. Status: %d, Device ID: %u\n", - (int)perm_status, (unsigned)default_input_device); + (int)perm_status, (unsigned)default_input_device); } else { @@ -858,8 +851,8 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) if (!mic->fifo) { - RARCH_ERR("[CoreAudio macOS Mic] No FIFO buffer available\n"); - return false; + RARCH_ERR("[CoreAudio macOS Mic] No FIFO buffer available\n"); + return false; } slock_lock(mic->fifo_lock); fifo_clear(mic->fifo); @@ -872,26 +865,20 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) RARCH_LOG("[CoreAudio macOS Mic] Microphone started successfully\n"); return true; } - else - { - RARCH_ERR("[CoreAudio macOS Mic] Failed to start AudioUnit: %d (0x%x)\n", (int)status, (unsigned)status); - atomic_store(&mic->is_running, false); - return false; - } + + RARCH_ERR("[CoreAudio macOS Mic] Failed to start AudioUnit: %d (0x%x)\n", (int)status, (unsigned)status); + atomic_store(&mic->is_running, false); + return false; } static bool coreaudio_macos_microphone_stop_mic(void *data, void *mic_data) { coreaudio_macos_microphone_t *mic = (coreaudio_macos_microphone_t *)mic_data; if (!mic || !mic->audio_unit) - { return true; /* Considered 
stopped if not valid */ - } if (!atomic_load(&mic->is_running)) - { return true; - } OSStatus status = AudioOutputUnitStop(mic->audio_unit); if (status == noErr) @@ -905,21 +892,17 @@ static bool coreaudio_macos_microphone_stop_mic(void *data, void *mic_data) } return true; } - else - { - RARCH_ERR("[CoreAudio macOS Mic] Failed to stop AudioUnit: %d\n", (int)status); - /* Even if stop fails, we mark as not running from our perspective */ - atomic_store(&mic->is_running, false); - return false; - } + + RARCH_ERR("[CoreAudio macOS Mic] Failed to stop AudioUnit: %d\n", (int)status); + /* Even if stop fails, we mark as not running from our perspective */ + atomic_store(&mic->is_running, false); + return false; } static bool coreaudio_macos_microphone_mic_use_float(const void *data, const void *mic_data) { - (void)data; coreaudio_macos_microphone_t *mic = (coreaudio_macos_microphone_t *)mic_data; - bool result = mic && mic->use_float; - return result; + return mic && mic->use_float; } diff --git a/frontend/drivers/platform_darwin.m b/frontend/drivers/platform_darwin.m index 41b2c9608c90..0cb1dead5a4a 100644 --- a/frontend/drivers/platform_darwin.m +++ b/frontend/drivers/platform_darwin.m @@ -27,6 +27,7 @@ #ifdef HAVE_GCD #include #include +#include #endif #include @@ -144,7 +145,7 @@ * any already-dispatched event handler invocation keeps * running to completion on its target queue, which is the * global concurrent queue here. The event handler - * dereferences &watch_data->has_changes, so we cannot free + * dereferences &watch_data->event_count, so we cannot free * watch_data until we're sure no in-flight handler remains. 
* The cancel handler fires once all handler invocations * have drained, so waiting on this semaphore before free() @@ -159,7 +160,27 @@ dispatch_queue_t queue; /* Dispatch queue for event handlers */ darwin_watch_entry_t *watches; /* Array of watch entries */ size_t watch_count; /* Number of active watches */ - volatile int32_t has_changes; /* Atomic flag indicating changes occurred */ + /* Monotonic event counter. Setter (GCD event handler thread) + * increments via retro_atomic_fetch_add_int when the + * filesystem reports an event; reader (main thread, in + * frontend_darwin_check_for_path_changes) acquire-loads it + * and compares against last_seen below. + * + * The previous design used a binary flag set/cleared via + * OSAtomicCompareAndSwap32, but Apple deprecated OSAtomic.h + * in 10.12 (2016) in favour of . Replacing the + * flag with a counter is also strictly more flexible: the + * "did anything change?" semantic is preserved exactly + * (now != last_seen), and the count is available if a future + * caller wants to batch events. retro_atomic.h provides the + * portable fetch_add / load_acquire primitives needed; no + * compare-exchange operation is necessary, because the + * monotonic counter never needs to be read-and-cleared + * atomically (last_seen is main-thread-only state). */ + retro_atomic_int_t event_count; + /* Reader-side cursor. Touched only by the main thread in + * frontend_darwin_check_for_path_changes; not atomic. */ + int last_seen; int flags; /* Event flags to monitor */ } darwin_watch_data_t; #endif @@ -929,8 +950,8 @@ static void darwin_watch_data_free(darwin_watch_data_t *watch_data) * handler to fire. dispatch_source_cancel is * asynchronous - it only marks the source as * cancelled. Any already-dispatched event handler - * invocation (the one that reads - * &watch_data->has_changes) keeps running to + * invocation (the one that increments + * &watch_data->event_count) keeps running to * completion on the global concurrent queue. 
The * cancel handler is guaranteed to fire AFTER all * pending event handler invocations have drained, @@ -938,7 +959,7 @@ static void darwin_watch_data_free(darwin_watch_data_t *watch_data) * is the standard 'wait until source is fully * quiesced' pattern. Without this wait a racing * event handler would NULL-deref or, worse, write - into freed memory for has_changes. */ + into freed memory for event_count. */ dispatch_source_cancel(watch_data->watches[i].source); if (watch_data->watches[i].cancel_sem) { @@ -989,7 +1010,12 @@ static void frontend_darwin_watch_path_for_changes( watch_data->watches = (darwin_watch_entry_t*)calloc( list->size, sizeof(darwin_watch_entry_t)); watch_data->flags = flags; - watch_data->has_changes = 0; + /* watch_data was calloc'd, so event_count and last_seen are + * already all-bits-zero; but plain assignment to a + * retro_atomic_int_t is illegal under the C11 stdatomic + * backend, so use the init helper for the atomic field. */ + retro_atomic_int_init(&watch_data->event_count, 0); + watch_data->last_seen = 0; if (!watch_data->watches) { @@ -1052,12 +1078,25 @@ static void frontend_darwin_watch_path_for_changes( if (source) { - /* Set up event handler - sets atomic flag when changes - * occur. This block captures watch_data by pointer and - * the cancel-handler synchronisation below is what keeps - * the capture safe against the teardown path. */ + /* Set up event handler - bump the atomic event + * counter when a filesystem event fires. This + * block captures watch_data by pointer and the + * cancel-handler synchronisation below is what + * keeps the capture safe against the teardown + * path. + * + * fetch_add (rather than the previous CAS(0, 1)) + * always stores, but the rate is bounded by GCD + * vnode events (at most a few events per day in + * normal use), so the extra store cost is below + * measurement noise. 
In return we get a + * monotonic counter that the reader can compare + * against its last_seen cursor without losing + * notifications, and we drop the dependency on + * OSAtomic.h which Apple deprecated in 10.12. */ dispatch_source_set_event_handler(source, ^{ - OSAtomicCompareAndSwap32(0, 1, &watch_data->has_changes); + retro_atomic_fetch_add_int( + &watch_data->event_count, 1); }); /* Cancel handler signals cancel_sem. darwin_watch_ @@ -1122,8 +1161,30 @@ static bool frontend_darwin_check_for_path_changes( watch_data = (darwin_watch_data_t*)(change_data->data); - /* Atomically read and clear the flag */ - return OSAtomicCompareAndSwap32(1, 0, &watch_data->has_changes); + /* Acquire-load the producer's counter and compare against + * our reader-side cursor. If they differ, at least one + * event fired since the last call -- update the cursor and + * return true. + * + * No CAS is needed because last_seen is main-thread-only + * state. The acquire-load pairs with the producer's + * fetch_add (which has acq_rel semantics in retro_atomic.h), + * so any data the producer published before its increment + * is visible to us by the time we read here. + * + * The counter is monotonic and never reset, so over a long + * enough run it would wrap around -- but on a 32-bit signed + * int at filesystem-event rates this takes hundreds of + * years. The `now != last_seen` comparison remains correct + * across wraparound under modular int arithmetic; any non- + * zero (now - last_seen) means the producer advanced. 
*/ + { + int now = retro_atomic_load_acquire_int( + &watch_data->event_count); + bool changed = (now != watch_data->last_seen); + watch_data->last_seen = now; + return changed; + } } #endif diff --git a/gfx/gfx_thumbnail.c b/gfx/gfx_thumbnail.c index 4703b959393e..624a7238652c 100644 --- a/gfx/gfx_thumbnail.c +++ b/gfx/gfx_thumbnail.c @@ -43,41 +43,60 @@ #define DEFAULT_GFX_THUMBNAIL_STREAM_DELAY 16.66667f * 3 #define DEFAULT_GFX_THUMBNAIL_FADE_DURATION 166.66667f -/* Helpers for cross-thread thumbnail status synchronisation. +/* The thumbnail .status field is atomically-typed (see the + * gfx_thumbnail_t comment in gfx_thumbnail.h). Reads and writes + * therefore must go through the retro_atomic API rather than + * plain assignment / dereference; on the C11 stdatomic and C++11 + * std::atomic backends, plain access to such a field is illegal. * - * The main thread writes texture/width/height then publishes - * status = AVAILABLE. The video thread reads status; if it - * sees AVAILABLE, the data fields must be visible. + * The two cross-thread sites in this file are noted at the + * call sites: + * - The release-stores in gfx_thumbnail_handle_upload publish + * prior texture / width / height writes. + * - The acquire-load in gfx_thumbnail_draw pairs with those. + * All other accesses are single-threaded; they go through the + * retro_atomic API only because the field's atomic-typed + * storage requires it. The cost on weak-memory ARM/PowerPC is + * one extra ldar/stlr per cold-path access; on x86 TSO the + * barriers compile out entirely. * - * On GCC/Clang (covers ARM64, x86, PowerPC, MIPS): - * release-store / acquire-load give the required ordering. - * On MSVC (x86/x64 only in RetroArch's target matrix): - * x86 total-store-order makes plain volatile sufficient, - * but _InterlockedOr gives an explicit acquire load. - * Fallback: - * Plain volatile read/write — sufficient on single-core - * and x86. 
Platforms with weak ordering that lack GCC - * builtins do not run threaded video in practice. - */ -#if defined(__clang__) || (defined(__GNUC__) && \ - ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) -#define GFX_THUMB_STATUS_STORE(ptr, val) \ - __atomic_store_n((int*)(ptr), (int)(val), __ATOMIC_RELEASE) -#define GFX_THUMB_STATUS_LOAD(ptr) \ - ((enum gfx_thumbnail_status)__atomic_load_n((int*)(ptr), __ATOMIC_ACQUIRE)) -#elif defined(_MSC_VER) -#include -#define GFX_THUMB_STATUS_STORE(ptr, val) \ - _InterlockedExchange((volatile long*)(ptr), (long)(val)) -#define GFX_THUMB_STATUS_LOAD(ptr) \ - ((enum gfx_thumbnail_status)_InterlockedOr((volatile long*)(ptr), 0)) + * The wrappers are static-inline (rather than function-like + * macros) so callers have a clear function boundary. GCC 13+ + * at -O3 emits a spurious -Wstringop-overflow on the inlined + * __atomic_* primitive when called from gfx_thumbnail_request, + * because the optimiser cannot prove the @c thumbnail argument + * non-NULL across every `goto end:` flow-graph path; the + * suppression below is a targeted fix that does not affect + * codegen. No effect on non-GCC backends. 
*/ + +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12 +# define GFX_THUMB_STATUS_DIAG_PUSH \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wstringop-overflow\"") +# define GFX_THUMB_STATUS_DIAG_POP \ + _Pragma("GCC diagnostic pop") #else -#define GFX_THUMB_STATUS_STORE(ptr, val) \ - do { (*(volatile int*)(ptr)) = (int)(val); } while(0) -#define GFX_THUMB_STATUS_LOAD(ptr) \ - ((enum gfx_thumbnail_status)(*(volatile int*)(ptr))) +# define GFX_THUMB_STATUS_DIAG_PUSH +# define GFX_THUMB_STATUS_DIAG_POP #endif +GFX_THUMB_STATUS_DIAG_PUSH +static INLINE void gfx_thumb_status_store( + retro_atomic_int_t *ptr, enum gfx_thumbnail_status val) +{ + retro_atomic_store_release_int(ptr, (int)val); +} + +static INLINE enum gfx_thumbnail_status gfx_thumb_status_load( + retro_atomic_int_t *ptr) +{ + return (enum gfx_thumbnail_status)retro_atomic_load_acquire_int(ptr); +} +GFX_THUMB_STATUS_DIAG_POP + +#define GFX_THUMB_STATUS_STORE(ptr, val) gfx_thumb_status_store((ptr), (val)) +#define GFX_THUMB_STATUS_LOAD(ptr) gfx_thumb_status_load((ptr)) + /* Utility structure, sent as userdata when pushing * an image load */ typedef struct @@ -152,9 +171,9 @@ static void gfx_thumbnail_init_fade( /* A 'fade in' animation is triggered if: * - Thumbnail is available * - Thumbnail is missing and 'fade_missing' is enabled */ - if ( (thumbnail->status == GFX_THUMBNAIL_STATUS_AVAILABLE) + if ( (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_AVAILABLE) || (p_gfx_thumb->fade_missing - && (thumbnail->status == GFX_THUMBNAIL_STATUS_MISSING))) + && (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_MISSING))) { if (p_gfx_thumb->fade_duration > 0.0f) { @@ -198,7 +217,7 @@ static void gfx_thumbnail_handle_upload( goto end; /* Only process image if we are waiting for it */ - if (thumbnail_tag->thumbnail->status != GFX_THUMBNAIL_STATUS_PENDING) + if (GFX_THUMB_STATUS_LOAD(&thumbnail_tag->thumbnail->status) != 
GFX_THUMBNAIL_STATUS_PENDING) goto end; /* Sanity check: if thumbnail already has a texture, @@ -346,7 +365,7 @@ void gfx_thumbnail_request( /* Reset thumbnail, then set 'missing' status by default * (saves a number of checks later) */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); /* Update/extract thumbnail path */ if (gfx_thumbnail_is_enabled(path_data, thumbnail_id)) @@ -375,7 +394,7 @@ void gfx_thumbnail_request( thumbnail_path, (video_driver_get_disp_flags() & VIDEO_FLAG_USE_RGBA), gfx_thumbnail_upscale_threshold, gfx_thumbnail_handle_upload, thumbnail_tag)) - thumbnail->status = GFX_THUMBNAIL_STATUS_PENDING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_PENDING); } #ifdef HAVE_NETWORKING /* Handle on demand thumbnail downloads */ @@ -433,7 +452,7 @@ void gfx_thumbnail_request( end: /* Trigger 'fade in' animation, if required */ - if (thumbnail->status != GFX_THUMBNAIL_STATUS_PENDING) + if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) != GFX_THUMBNAIL_STATUS_PENDING) gfx_thumbnail_init_fade(p_gfx_thumb, thumbnail); } @@ -459,7 +478,7 @@ void gfx_thumbnail_request_file( /* Reset thumbnail, then set 'missing' status by default * (saves a number of checks later) */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); /* Check if file path is valid */ if ( (!file_path || !*file_path) @@ -480,7 +499,7 @@ void gfx_thumbnail_request_file( file_path, (video_driver_get_disp_flags() & VIDEO_FLAG_USE_RGBA), gfx_thumbnail_upscale_threshold, gfx_thumbnail_handle_upload, thumbnail_tag)) - thumbnail->status = GFX_THUMBNAIL_STATUS_PENDING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_PENDING); } /* Resets (and free()s the current texture of) the @@ -502,7 +521,7 @@ void gfx_thumbnail_reset(gfx_thumbnail_t *thumbnail) } /* 
Reset all parameters */ - thumbnail->status = GFX_THUMBNAIL_STATUS_UNKNOWN; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_UNKNOWN); thumbnail->texture = 0; thumbnail->width = 0; thumbnail->height = 0; @@ -549,7 +568,7 @@ void gfx_thumbnail_request_stream( /* Only process request if current status * is GFX_THUMBNAIL_STATUS_UNKNOWN */ if ( !thumbnail - || (thumbnail->status != GFX_THUMBNAIL_STATUS_UNKNOWN)) + || (GFX_THUMB_STATUS_LOAD(&thumbnail->status) != GFX_THUMBNAIL_STATUS_UNKNOWN)) return; /* Check if stream delay timer has elapsed */ @@ -564,7 +583,7 @@ void gfx_thumbnail_request_stream( * > Reset thumbnail and set missing status * to prevent repeated load attempts */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); thumbnail->alpha = 1.0f; return; } @@ -615,8 +634,8 @@ void gfx_thumbnail_request_streams( /* Only process request if current status * is GFX_THUMBNAIL_STATUS_UNKNOWN */ - process_r = (right_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); - process_l = (left_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); + process_r = (GFX_THUMB_STATUS_LOAD(&right_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); + process_l = (GFX_THUMB_STATUS_LOAD(&left_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); if (process_r || process_l) { @@ -652,14 +671,14 @@ void gfx_thumbnail_request_streams( if (request_r) { gfx_thumbnail_reset(right_thumbnail); - right_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&right_thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); right_thumbnail->alpha = 1.0f; } if (request_l) { gfx_thumbnail_reset(left_thumbnail); - left_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&left_thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); left_thumbnail->alpha = 1.0f; } @@ -712,7 +731,7 @@ void gfx_thumbnail_process_stream( /* Entry is on-screen * > Only 
process if current status is * GFX_THUMBNAIL_STATUS_UNKNOWN */ - if (thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) { gfx_thumbnail_state_t *p_gfx_thumb = &gfx_thumb_st; @@ -729,7 +748,7 @@ void gfx_thumbnail_process_stream( /* Content is invalid * > Reset thumbnail and set missing status */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); thumbnail->alpha = 1.0f; return; } @@ -748,7 +767,7 @@ void gfx_thumbnail_process_stream( * > If status is GFX_THUMBNAIL_STATUS_UNKNOWN, * thumbnail is already in a blank state - but we * must ensure that delay timer is set to zero */ - if (thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) thumbnail->delay_timer = 0.0f; /* In all other cases, reset thumbnail */ else @@ -786,8 +805,8 @@ void gfx_thumbnail_process_streams( /* Entry is on-screen * > Only process if current status is * GFX_THUMBNAIL_STATUS_UNKNOWN */ - bool process_r = (right_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); - bool process_l = (left_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); + bool process_r = (GFX_THUMB_STATUS_LOAD(&right_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); + bool process_l = (GFX_THUMB_STATUS_LOAD(&left_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); if (process_r || process_l) { @@ -824,14 +843,14 @@ void gfx_thumbnail_process_streams( if (request_r) { gfx_thumbnail_reset(right_thumbnail); - right_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&right_thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); right_thumbnail->alpha = 1.0f; } if (request_l) { gfx_thumbnail_reset(left_thumbnail); - left_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&left_thumbnail->status, 
GFX_THUMBNAIL_STATUS_MISSING); left_thumbnail->alpha = 1.0f; } @@ -860,12 +879,12 @@ void gfx_thumbnail_process_streams( * thumbnail is already in a blank state - but we * must ensure that delay timer is set to zero * > In all other cases, reset thumbnail */ - if (right_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&right_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) right_thumbnail->delay_timer = 0.0f; else gfx_thumbnail_reset(right_thumbnail); - if (left_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&left_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) left_thumbnail->delay_timer = 0.0f; else gfx_thumbnail_reset(left_thumbnail); @@ -1038,7 +1057,12 @@ void gfx_thumbnail_draw( thumb_snapshot.height = thumb_height; thumb_snapshot.alpha = thumb_alpha; thumb_snapshot.flags = thumb_flags; - thumb_snapshot.status = GFX_THUMBNAIL_STATUS_AVAILABLE; + /* Local snapshot: initialise the atomic-typed status + * field via _init since this is its first write -- + * direct assignment to the C11/C++11 atomic forms is + * illegal. This local is never observed by another + * thread, so no ordering is required. */ + retro_atomic_int_init(&thumb_snapshot.status, GFX_THUMBNAIL_STATUS_AVAILABLE); thumb_snapshot.delay_timer = 0.0f; gfx_thumbnail_get_draw_dimensions( &thumb_snapshot, width, height, scale_factor, diff --git a/gfx/gfx_thumbnail.h b/gfx/gfx_thumbnail.h index 686407e0f919..f38669b2df26 100644 --- a/gfx/gfx_thumbnail.h +++ b/gfx/gfx_thumbnail.h @@ -28,6 +28,7 @@ #include #include +#include #include "gfx_animation.h" @@ -185,7 +186,20 @@ enum gfx_thumbnail_flags }; /* Holds all runtime parameters associated with - * an entry thumbnail */ + * an entry thumbnail. 
+ * + * @c status is read by the video thread (via + * gfx_thumbnail_draw) and written by both the upload-callback + * thread (release-store after publishing texture/width/height) + * and the main thread (a number of plain transitions during menu + * processing). retro_atomic_int_t backs the field with an + * atomic-typed integer that is safe to read/write through the + * retro_atomic_*_int API on every supported backend. + * + * Same size and alignment as plain int on every backend; struct + * layout is unchanged. The cost of the acquire/release barriers + * on weak-memory ARM/PowerPC is negligible at this field's + * access rates (menu and frame draw, never per-sample). */ typedef struct { uintptr_t texture; @@ -193,10 +207,37 @@ typedef struct unsigned height; float alpha; float delay_timer; - enum gfx_thumbnail_status status; + retro_atomic_int_t status; uint8_t flags; } gfx_thumbnail_t; +/* Field-by-field initializer for non-trivial gfx_thumbnail_t. + * + * Now that .status is atomically-typed, a wholesale + * memset(t, 0, sizeof(*t)) of a struct containing this type + * warns under CXX_BUILD's C++ compile (the struct is no longer + * trivially-copyable per C++11), even though the resulting + * bytes are identical. RetroArch's CXX_BUILD mode + * compiles every .c file as C++, so this helper is required + * for clean builds, not just style. + * + * gfx_thumbnail_init_blank zero-inits the small struct field + * by field, using retro_atomic_int_init() for the atomic field + * so the first write is well-defined under C11 stdatomic and + * C++11 std::atomic. Equivalent in effect to the pre-port + * memset on every real backend (status field is also zero -> + * UNKNOWN). 
*/ +static INLINE void gfx_thumbnail_init_blank(gfx_thumbnail_t *t) +{ + t->texture = 0; + t->width = 0; + t->height = 0; + t->alpha = 0.0f; + t->delay_timer = 0.0f; + retro_atomic_int_init(&t->status, 0 /* GFX_THUMBNAIL_STATUS_UNKNOWN */); + t->flags = 0; +} + /* Holds all configuration parameters associated * with a thumbnail shadow effect */ typedef struct diff --git a/gfx/gfx_widgets.c b/gfx/gfx_widgets.c index b633fc5f007b..2b7ff979e0f0 100644 --- a/gfx/gfx_widgets.c +++ b/gfx/gfx_widgets.c @@ -189,6 +189,13 @@ static void msg_widget_msg_transition_animation_done(void *userdata) msg->msg_transition_animation = 0.0f; } +/* Forward declaration: msg_queue_free is defined further below + * (around line 550) but is needed by msg_queue_push for the + * race-loss rollback path added in the producer-locking fix. */ +static void gfx_widgets_msg_queue_free( + dispgfx_widget_t *p_dispwidget, + disp_widget_msg_t *msg); + void gfx_widgets_msg_queue_push( retro_task_t *task, const char *msg, @@ -203,7 +210,18 @@ void gfx_widgets_msg_queue_push( disp_widget_msg_t *msg_widget = NULL; dispgfx_widget_t *p_dispwidget = &dispwidget_st; - if (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue) > 0) + /* The outer FIFO_WRITE_AVAIL fast-path check that used to wrap + * this function body has been removed: reading the FIFO cursors + * outside msg_queue_lock is a data race against the producer + * lock-protected fifo_write below (TSan-detectable; benign on + * x86 TSO but real on weak-memory hardware). The locked + * avail re-check at the fifo_write site is the correctness gate. + * + * Removing the outer gate also fixes a latent behaviour bug: + * the update-existing branch (else clause below) does not write + * to the FIFO -- it only mutates an already-tracked widget -- + * so suppressing it on FIFO-full was wrong. Existing widgets + * now get updated regardless of FIFO state. 
*/ { /* Get current msg if it exists */ if (task && task->frontend_userdata) @@ -376,8 +394,45 @@ void gfx_widgets_msg_queue_push( msg_widget->text_height *= 2; } - fifo_write(&p_dispwidget->msg_queue, - &msg_widget, sizeof(msg_widget)); + /* Lock the FIFO across the avail re-check + fifo_write so + * concurrent producers can't both pass the check and + * overwrite each other's data. fifo_write itself does no + * bounds checking -- it silently wraps -- so the inner + * avail check is the actual gate. + * + * There is no longer an unlocked outer FIFO_WRITE_AVAIL_NONPTR + * fast-path check at the top of this function (it was removed; + * see the comment there): the locked check below is the sole + * availability gate and the one whose accuracy matters for + * correctness. */ + { +#ifdef HAVE_THREADS + bool fifo_full; + slock_lock(p_dispwidget->msg_queue_lock); + fifo_full = (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue) + < sizeof(msg_widget)); + if (!fifo_full) + fifo_write(&p_dispwidget->msg_queue, + &msg_widget, sizeof(msg_widget)); + slock_unlock(p_dispwidget->msg_queue_lock); + + if (fifo_full) + { + /* Lost the race against another producer. Roll back + * the widget we just allocated -- nobody else has a + * reference to it yet (we only got here from the + * spawn-new branch, where msg_widget is freshly + * allocated above), so this is safe. */ + gfx_widgets_msg_queue_free(p_dispwidget, msg_widget); + free(msg_widget); + return; + } +#else + fifo_write(&p_dispwidget->msg_queue, + &msg_widget, sizeof(msg_widget)); +#endif + } } /* Update task info */ else @@ -962,9 +1017,16 @@ void gfx_widgets_iterate( /* Messages queue */ - /* Consume one message if available */ - if ((FIFO_READ_AVAIL_NONPTR(p_dispwidget->msg_queue) > 0) - && !(p_dispwidget->flags & DISPGFX_WIDGET_FLAG_MOVING) + /* Consume one message if available.
The outer condition no + * longer reads FIFO_READ_AVAIL_NONPTR -- doing so outside + * msg_queue_lock would race with concurrent producer fifo_writes + * (TSan-detectable; benign on x86 TSO but real on weak-memory + * hardware). The locked re-check at the fifo_read site below + * is the correctness gate. current_msgs_size and the MOVING + * flag remain in the outer guard -- they are unrelated to the + * msg_queue race; their own synchronisation discipline is + * handled by current_msgs_lock and is unchanged here. */ + if ( !(p_dispwidget->flags & DISPGFX_WIDGET_FLAG_MOVING) && (p_dispwidget->current_msgs_size < ARRAY_SIZE(p_dispwidget->current_msgs))) { disp_widget_msg_t *msg_widget = NULL; @@ -975,9 +1037,21 @@ void gfx_widgets_iterate( if (p_dispwidget->current_msgs_size < ARRAY_SIZE(p_dispwidget->current_msgs)) { + /* Lock around the FIFO read to serialise against + * concurrent producer fifo_writes. Held strictly inside + * current_msgs_lock (which protects current_msgs[]) -- + * lock order is consistent with all other call sites: + * msg_queue_lock is always the inner lock when both + * are held. 
*/ +#ifdef HAVE_THREADS + slock_lock(p_dispwidget->msg_queue_lock); +#endif if (FIFO_READ_AVAIL_NONPTR(p_dispwidget->msg_queue) > 0) fifo_read(&p_dispwidget->msg_queue, &msg_widget, sizeof(msg_widget)); +#ifdef HAVE_THREADS + slock_unlock(p_dispwidget->msg_queue_lock); +#endif if (msg_widget) { @@ -1999,6 +2073,8 @@ static void gfx_widgets_free(dispgfx_widget_t *p_dispwidget) slock_free(p_dispwidget->current_msgs_lock); p_dispwidget->current_msgs_lock = NULL; + slock_free(p_dispwidget->msg_queue_lock); + p_dispwidget->msg_queue_lock = NULL; #endif p_dispwidget->msg_queue_tasks_count = 0; @@ -2150,6 +2226,7 @@ bool gfx_widgets_init( #ifdef HAVE_THREADS p_dispwidget->current_msgs_lock = slock_new(); + p_dispwidget->msg_queue_lock = slock_new(); #endif fill_pathname_join_special( diff --git a/gfx/gfx_widgets.h b/gfx/gfx_widgets.h index d73a0a95ba39..9fd02e90d0fd 100644 --- a/gfx/gfx_widgets.h +++ b/gfx/gfx_widgets.h @@ -191,6 +191,22 @@ typedef struct dispgfx_widget #ifdef HAVE_THREADS slock_t* current_msgs_lock; + /* Serialises producer and consumer access to msg_queue. + * Producers (gfx_widgets_msg_queue_push) can be called from + * any thread -- the threaded task system at libretro-common/ + * queues/task_queue.c runs a worker thread, and several call + * paths reach the producer without holding any other lock + * (notably gfx/video_driver.c::video_driver_frame, which + * releases RUNLOOP_MSG_QUEUE_LOCK before the call). The + * consumer (gfx_widgets_iterate) runs on the main thread + * but did not previously serialise its fifo_read against + * concurrent producer fifo_writes. This separates concerns: + * current_msgs_lock continues to guard the displayed-message + * deque (current_msgs[]); msg_queue_lock guards the FIFO + * staging buffer. Acquired by every fifo_* call site and + * by FIFO_*_AVAIL checks where the result is used to gate a + * subsequent FIFO operation. 
*/ + slock_t* msg_queue_lock; #endif fifo_buffer_t msg_queue; disp_widget_msg_t* current_msgs[MSG_QUEUE_ONSCREEN_MAX]; diff --git a/griffin/griffin.c b/griffin/griffin.c index 0da5730a18d8..18d8dccc3d7e 100644 --- a/griffin/griffin.c +++ b/griffin/griffin.c @@ -815,6 +815,7 @@ INPUT (HID) FIFO BUFFER ============================================================ */ #include "../libretro-common/queues/fifo_queue.c" +#include "../libretro-common/queues/retro_spsc.c" /*============================================================ AUDIO RESAMPLER diff --git a/libretro-common/features/features_cpu.c b/libretro-common/features/features_cpu.c index d4b1ca13fcdc..236b18561afd 100644 --- a/libretro-common/features/features_cpu.c +++ b/libretro-common/features/features_cpu.c @@ -614,56 +614,78 @@ uint64_t cpu_features_get(void) const int avx_flags = (1 << 27) | (1 << 28); #endif #if defined(__MACH__) - size_t _len = sizeof(size_t); - if (sysctlbyname("hw.optional.floatingpoint", NULL, &_len, NULL, 0) == 0) + /* sysctlbyname() returns 0 (success) whenever the key exists, regardless + * of its value. On Intel Macs the hw.optional.* keys are always present + * but report 0 when the feature is unsupported (e.g. avx512f on pre-Skylake + * Xeon CPUs). We therefore have to read the value, not just the success + * code. 
*/ + int _val = 0; + size_t _len = sizeof(_val); + if ( sysctlbyname("hw.optional.floatingpoint", &_val, &_len, NULL, 0) == 0 + && _val) cpu |= RETRO_SIMD_CMOV; #if defined(CPU_X86) - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.mmx", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.mmx", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_MMX | RETRO_SIMD_MMXEXT; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse2", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse2", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE2; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse3", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse3", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE3; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.supplementalsse3", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.supplementalsse3", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSSE3; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse4_1", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse4_1", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE4; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse4_2", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse4_2", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE42; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.aes", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.aes", &_val, &_len, 
NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AES; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.avx1_0", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.avx1_0", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AVX; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.avx2_0", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.avx2_0", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AVX2; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.avx512f", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.avx512f", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AVX512; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.altivec", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.altivec", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_VMX; #else - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.neon", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.neon", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_NEON; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.neon_fp16", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.neon_fp16", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_VFPV3; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.neon_hpfp", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.neon_hpfp", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_VFPV4; #endif #elif defined(_XBOX1) diff --git a/libretro-common/include/retro_spsc.h b/libretro-common/include/retro_spsc.h new file mode 100644 index 000000000000..9191d1f0e656 --- /dev/null +++ b/libretro-common/include/retro_spsc.h @@ -0,0 +1,270 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * 
--------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_spsc.h). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __LIBRETRO_SDK_SPSC_H +#define __LIBRETRO_SDK_SPSC_H + +/* + * retro_spsc.h - portable single-producer / single-consumer byte queue + * + * Lock-free byte-stream queue with one writer thread and one reader + * thread. Uses release-store / acquire-load on two cursors (the + * Lamport / Disruptor design) rather than a single shared count, so + * neither thread issues atomic RMW operations on the hot path. + * + * Constraints: + * - Exactly ONE producer and ONE consumer. No locking is performed; + * two producers or two consumers will corrupt the queue. + * - Capacity must be power-of-2. Init rounds up. 
+ * - Maximum capacity is SIZE_MAX/2 (so head - tail never overflows + * the well-defined unsigned modular range). + * - The buffer is byte-addressed. Callers pushing structured records + * are responsible for framing. + * + * Memory model: + * - Producer writes data into the buffer (non-atomic stores), then + * publishes the new head via release-store. + * - Consumer acquire-loads head (sees data writes that preceded the + * release), reads data (non-atomic loads), then publishes the new + * tail via release-store. + * - Producer acquire-loads tail to know how much space is free. + * - This pairing gives the SPSC invariant on every backend + * retro_atomic.h supports lock-freely. On the volatile fallback + * (no real backend), correctness reduces to single-core or x86 TSO, + * same as every other retro_atomic.h caller. + * + * Cache behaviour: + * - head and tail are placed on separate cache lines via explicit + * padding to RETRO_SPSC_CACHE_LINE. Without this, the producer's + * publish would invalidate the consumer's tail line and vice versa, + * halving throughput on contended SMP. The padding is a + * performance hint; correctness does not depend on it. + * + * Lifetime: + * - retro_spsc_init allocates an internal buffer; retro_spsc_free + * releases it. The caller owns the retro_spsc_t struct itself. + * - The struct is NOT itself thread-safe to construct or destroy + * while in use. Build it on one thread, hand the producer pointer + * to one thread and the consumer pointer to another, then tear + * down on one thread after both producer and consumer have stopped. 
+ * + * Example: + * + * retro_spsc_t q; + * retro_spsc_init(&q, 4096); + * + * // Producer thread: + * uint8_t msg[64] = ...; + * if (retro_spsc_write_avail(&q) >= sizeof(msg)) + * retro_spsc_write(&q, msg, sizeof(msg)); + * + * // Consumer thread: + * uint8_t msg[64]; + * if (retro_spsc_read_avail(&q) >= sizeof(msg)) + * retro_spsc_read(&q, msg, sizeof(msg)); + * + * retro_spsc_free(&q); + * + * Comparison with libretro-common's existing fifo_queue_t: + * - fifo_queue uses a slock_t internally; safe with multiple producers + * and multiple consumers, but every push/pop takes a mutex. + * - retro_spsc is lock-free but limited to one producer and one + * consumer. Use this when (a) the producer/consumer count is fixed + * at one each (audio driver -> backend, video -> task system, etc.) + * and (b) the lock contention in fifo_queue measurably matters. + * - For most code paths, fifo_queue is the better default. This + * primitive is for hot paths where lock-free is a measured win. + */ + +#include +#include +#include + +#include +#include + +RETRO_BEGIN_DECLS + +/* Cache-line padding size. 64 covers x86-64, AArch64, most modern + * ARMs, and modern PowerPC. Apple Silicon's effective coherency line + * is 128 due to its M-series cluster topology; over-padding to 64 + * still avoids false sharing, just slightly less efficiently. Older + * 32-bit ARMs (ARMv6, Cortex-A8) used 32-byte lines but tolerate the + * larger pad without correctness impact. */ +#ifndef RETRO_SPSC_CACHE_LINE +#define RETRO_SPSC_CACHE_LINE 64 +#endif + +/* Padding helper. C89 has no _Static_assert; the array-size trick is + * portable. We pad to the next multiple of cache-line size after the + * pointer-and-size_t prefix and after each cursor. */ + +/* Effective cache line for padding. We pad each cursor to a full + * cache line. 
If the pre-cursor fields exceed RETRO_SPSC_CACHE_LINE + * on some hypothetical small-cache target, the underflow on the + * subtraction would produce a giant array; guard with a max() so + * the pad is always at least 1 byte and never underflows. */ +#define RETRO_SPSC_PAD0_BYTES \ + ((RETRO_SPSC_CACHE_LINE > (sizeof(uint8_t*) + sizeof(size_t))) \ + ? (RETRO_SPSC_CACHE_LINE - (sizeof(uint8_t*) + sizeof(size_t))) \ + : 1) +#define RETRO_SPSC_PAD1_BYTES \ + ((RETRO_SPSC_CACHE_LINE > sizeof(retro_atomic_size_t)) \ + ? (RETRO_SPSC_CACHE_LINE - sizeof(retro_atomic_size_t)) \ + : 1) + +typedef struct retro_spsc +{ + uint8_t *buffer; + size_t capacity; /* power of 2; mask = capacity - 1 */ + /* Pad so head sits on a fresh cache line, isolating it from + * the buffer/capacity fields that init may touch. */ + uint8_t _pad0[RETRO_SPSC_PAD0_BYTES]; + retro_atomic_size_t head; /* producer publishes; consumer reads */ + /* Pad so tail sits on its own cache line, isolating it from head. */ + uint8_t _pad1[RETRO_SPSC_PAD1_BYTES]; + retro_atomic_size_t tail; /* consumer publishes; producer reads */ +} retro_spsc_t; + +/** + * retro_spsc_init: + * @q : The queue. + * @min_capacity : Requested capacity in bytes. Rounded up to the + * next power of 2. Must be > 0 and <= SIZE_MAX/2. + * + * Allocates the internal buffer and zero-initialises both cursors. + * Both cursors begin at 0; the queue is empty. + * + * Returns: true on success, false on allocation failure or invalid + * @min_capacity. + * + * After this returns true, the producer thread can call + * retro_spsc_write / retro_spsc_write_avail and the consumer thread can + * call retro_spsc_read / retro_spsc_read_avail. + */ +bool retro_spsc_init(retro_spsc_t *q, size_t min_capacity); + +/** + * retro_spsc_free: + * @q : The queue. + * + * Releases the internal buffer. Caller must ensure both producer + * and consumer have stopped using @q before calling. Safe to call + * on a queue that retro_spsc_init failed on (no-op). 
+ */ +void retro_spsc_free(retro_spsc_t *q); + +/** + * retro_spsc_clear: + * @q : The queue. + * + * Resets head and tail to 0, discarding any unread data. The + * underlying buffer is preserved (no reallocation). + * + * SAFETY: callable only when both the producer and consumer are + * quiesced -- e.g. before either has started, or after both have + * stopped. Concurrent calls with a live producer or consumer are + * a data race. This is the same lifetime constraint as + * retro_spsc_init / retro_spsc_free. + * + * Typical use is when a stream is stopped and restarted (e.g. an + * audio driver pausing and resuming, or switching device formats), + * and stale buffered data should be discarded before the new + * stream begins. + */ +void retro_spsc_clear(retro_spsc_t *q); + +/** + * retro_spsc_write_avail: + * @q : The queue. + * + * Producer-side query: how many bytes can be written before the + * queue is full. + * + * Returns: byte count, in [0, capacity]. + * + * SAFETY: callable only from the producer thread. + */ +size_t retro_spsc_write_avail(const retro_spsc_t *q); + +/** + * retro_spsc_read_avail: + * @q : The queue. + * + * Consumer-side query: how many bytes are available to read. + * + * Returns: byte count, in [0, capacity]. + * + * SAFETY: callable only from the consumer thread. + */ +size_t retro_spsc_read_avail(const retro_spsc_t *q); + +/** + * retro_spsc_write: + * @q : The queue. + * @data : Source bytes. + * @bytes : Number of bytes to attempt to write. + * + * Writes up to @bytes from @data into the queue. If the queue has + * less than @bytes free, writes only what fits. + * + * Returns: number of bytes actually written. + * + * SAFETY: callable only from the producer thread. Concurrent calls + * with another producer corrupt the queue. + */ +size_t retro_spsc_write(retro_spsc_t *q, const void *data, size_t bytes); + +/** + * retro_spsc_read: + * @q : The queue. + * @data : Destination bytes. + * @bytes : Number of bytes to attempt to read. 
+ * + * Reads up to @bytes from the queue into @data. If the queue has + * less than @bytes available, reads only what is present. + * + * Returns: number of bytes actually read. + * + * SAFETY: callable only from the consumer thread. Concurrent calls + * with another consumer corrupt the queue. + */ +size_t retro_spsc_read(retro_spsc_t *q, void *data, size_t bytes); + +/** + * retro_spsc_peek: + * @q : The queue. + * @data : Destination bytes. + * @bytes : Number of bytes to peek. + * + * Like retro_spsc_read but does not advance the read cursor. The + * peeked bytes remain available for the next read. + * + * Returns: number of bytes peeked. + * + * SAFETY: callable only from the consumer thread. + */ +size_t retro_spsc_peek(const retro_spsc_t *q, void *data, size_t bytes); + +RETRO_END_DECLS + +#endif /* __LIBRETRO_SDK_SPSC_H */ diff --git a/libretro-common/queues/retro_spsc.c b/libretro-common/queues/retro_spsc.c new file mode 100644 index 000000000000..c2fd7e0831c9 --- /dev/null +++ b/libretro-common/queues/retro_spsc.c @@ -0,0 +1,208 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_spsc.c). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +/* Round @v up to the next power of 2. Returns 0 if @v == 0 or if + * the next power of 2 would overflow (caller checks @v <= SIZE_MAX/2). */ +static size_t spsc_round_up_pow2(size_t v) +{ + size_t r; + if (v == 0) + return 0; + /* If already a power of 2, return as-is. */ + if ((v & (v - 1)) == 0) + return v; + /* Round up. Loop terminates because v <= SIZE_MAX/2 means r + * cannot overflow. */ + r = 1; + while (r < v) + r <<= 1; + return r; +} + +bool retro_spsc_init(retro_spsc_t *q, size_t min_capacity) +{ + size_t cap; + + if (!q || min_capacity == 0 || min_capacity > (SIZE_MAX / 2)) + return false; + + cap = spsc_round_up_pow2(min_capacity); + if (cap == 0 || cap > (SIZE_MAX / 2)) + return false; + + q->buffer = (uint8_t*)malloc(cap); + if (!q->buffer) + return false; + + q->capacity = cap; + retro_atomic_size_init(&q->head, 0); + retro_atomic_size_init(&q->tail, 0); + return true; +} + +void retro_spsc_free(retro_spsc_t *q) +{ + if (!q) + return; + if (q->buffer) + { + free(q->buffer); + q->buffer = NULL; + } + q->capacity = 0; +} + +void retro_spsc_clear(retro_spsc_t *q) +{ + if (!q) + return; + /* Quiescence is the caller's responsibility (documented). + * Under that assumption, no other thread is touching head or + * tail, so plain init is correct here -- and necessary, because + * plain assignment to a retro_atomic_size_t is illegal under + * the C11 stdatomic backend. 
*/ + retro_atomic_size_init(&q->head, 0); + retro_atomic_size_init(&q->tail, 0); +} + +size_t retro_spsc_write_avail(const retro_spsc_t *q) +{ + /* Producer query. We read our own head (which we wrote) and the + * consumer's tail (which they wrote with a release-store). + * acquire-load on tail so any reads we do based on the freed + * space are ordered after the consumer's tail publication. + * + * We can read our own head without an acquire because we are the + * only writer to it; but retro_atomic.h does not expose relaxed + * loads, so use acquire. Cost is one extra fence on weak-memory + * targets, which is negligible compared to the actual work + * surrounding this query. */ + size_t head = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->head); + size_t tail = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->tail); + /* head - tail is well-defined modular arithmetic on size_t. + * Invariant: 0 <= head - tail <= capacity. */ + return q->capacity - (head - tail); +} + +size_t retro_spsc_read_avail(const retro_spsc_t *q) +{ + /* Consumer query. acquire-load on head pairs with the producer's + * release-store on head, so the data writes that preceded the + * release are visible by the time we read them. 
*/ + size_t head = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->head); + size_t tail = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->tail); + return head - tail; +} + +size_t retro_spsc_write(retro_spsc_t *q, const void *data, size_t bytes) +{ + size_t mask, head_idx, first; + const uint8_t *src = (const uint8_t*)data; + /* read tail first to know how much room there is */ + size_t head = retro_atomic_load_acquire_size(&q->head); + size_t tail = retro_atomic_load_acquire_size(&q->tail); + size_t avail = q->capacity - (head - tail); + if (bytes > avail) + bytes = avail; + if (bytes == 0) + return 0; + + mask = q->capacity - 1; + head_idx = head & mask; + + /* first = bytes from head_idx to end-of-buffer */ + first = q->capacity - head_idx; + if (first > bytes) + first = bytes; + + memcpy(q->buffer + head_idx, src, first); + memcpy(q->buffer, src + first, bytes - first); + + /* Publish: release-store ensures the memcpys above are globally + * visible before the consumer observes the new head. */ + retro_atomic_store_release_size(&q->head, head + bytes); + return bytes; +} + +size_t retro_spsc_read(retro_spsc_t *q, void *data, size_t bytes) +{ + size_t mask, tail_idx, first; + uint8_t *dst = (uint8_t*)data; + /* acquire on head pairs with producer's release-store; this is + * what makes the subsequent memcpys safe to read. */ + size_t head = retro_atomic_load_acquire_size(&q->head); + size_t tail = retro_atomic_load_acquire_size(&q->tail); + size_t avail = head - tail; + if (bytes > avail) + bytes = avail; + if (bytes == 0) + return 0; + + mask = q->capacity - 1; + tail_idx = tail & mask; + + first = q->capacity - tail_idx; + if (first > bytes) + first = bytes; + + memcpy(dst, q->buffer + tail_idx, first); + memcpy(dst + first, q->buffer, bytes - first); + + /* Publish: release-store so the producer can re-use this space. 
*/ + retro_atomic_store_release_size(&q->tail, tail + bytes); + return bytes; +} + +size_t retro_spsc_peek(const retro_spsc_t *q, void *data, size_t bytes) +{ + size_t mask, tail_idx, first; + uint8_t *dst = (uint8_t*)data; + size_t head = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->head); + size_t tail = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->tail); + size_t avail = head - tail; + if (bytes > avail) + bytes = avail; + if (bytes == 0) + return 0; + + mask = q->capacity - 1; + tail_idx = tail & mask; + + first = q->capacity - tail_idx; + if (first > bytes) + first = bytes; + + memcpy(dst, q->buffer + tail_idx, first); + memcpy(dst + first, q->buffer, bytes - first); + /* No tail update: peek does not consume. */ + return bytes; +} diff --git a/libretro-common/samples/queues/retro_spsc_test/Makefile b/libretro-common/samples/queues/retro_spsc_test/Makefile new file mode 100644 index 000000000000..fffcba4c9a12 --- /dev/null +++ b/libretro-common/samples/queues/retro_spsc_test/Makefile @@ -0,0 +1,44 @@ +TARGET := retro_spsc_test + +LIBRETRO_COMM_DIR := ../../.. + +# retro_spsc.c is the lock-free SPSC byte queue under test; the test +# harness drives a producer and consumer thread through rthreads.c +# (HAVE_THREADS). retro_atomic.h is header-only and pulled in via +# the queue's transitive includes. +# +# Build under SANITIZER=thread (TSan) for the regression-catching +# CI run; SANITIZER=address,undefined as the default for the +# auto-discovery workflow gives a coarser smoke test. Both pass on +# correct code. +SOURCES := \ + retro_spsc_test.c \ + $(LIBRETRO_COMM_DIR)/queues/retro_spsc.c \ + $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + +OBJS := $(SOURCES:.c=.o) + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 \ + -DHAVE_THREADS -I$(LIBRETRO_COMM_DIR)/include +LDFLAGS += -lpthread +# rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on +# older glibc those live in -lrt. Harmless on newer glibc. 
+LDFLAGS += -lrt + +ifneq ($(SANITIZER),) + CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) + LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) +endif + +all: $(TARGET) + +%.o: %.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(TARGET): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) $(OBJS) + +.PHONY: clean diff --git a/libretro-common/samples/queues/retro_spsc_test/retro_spsc_test.c b/libretro-common/samples/queues/retro_spsc_test/retro_spsc_test.c new file mode 100644 index 000000000000..408858fa2064 --- /dev/null +++ b/libretro-common/samples/queues/retro_spsc_test/retro_spsc_test.c @@ -0,0 +1,265 @@ +/* SPSC stress test: producer pushes N bytes total; consumer reads N + * bytes; verify the byte stream is exactly an expected sequence. + * Validates ordering, no torn reads, no duplicates, no drops. + * + * Design notes: + * - Producer writes incrementing 32-bit "tokens" (i = 1, 2, 3, ...). + * - Consumer reads the byte stream and reassembles tokens. + * - Each token is checked against expected sequence; mismatch => + * reordering or torn write. + * - Producer/consumer race in tight loops with no per-iteration + * handshake; a real lock-free SPSC must handle this concurrency + * without external synchronisation. + * - Run under TSan for race detection. Run without sanitizer for + * throughput sanity. */ + +#include +#include +#include +#include + +#include +#include + +/* Total tokens to push. ~10M gives a reproducible test of <2s on + * x86_64; under qemu-aarch64 and TSan it's ~30s. */ +#define TOTAL_TOKENS 10000000 + +/* Buffer capacity in bytes. Smaller -> more producer/consumer + * interleaving (better race coverage); larger -> better throughput + * but less torture. */ +#define BUF_BYTES 4096 + +typedef struct +{ + retro_spsc_t q; + /* Captured by the producer's loop sentinel and read by main after + * join, so it doesn't need to be atomic. 
*/ + unsigned long mismatches; + unsigned long produced_tokens; + unsigned long consumed_tokens; +} test_state_t; + +static void producer_thread(void *arg) +{ + test_state_t *s = (test_state_t*)arg; + uint32_t token; + + for (token = 1; token <= TOTAL_TOKENS; token++) + { + /* Spin until there is room for a full token. */ + while (retro_spsc_write_avail(&s->q) < sizeof(token)) + ; /* spin */ + retro_spsc_write(&s->q, &token, sizeof(token)); + s->produced_tokens++; + } +} + +static void consumer_thread(void *arg) +{ + test_state_t *s = (test_state_t*)arg; + uint32_t expected_token = 1; + + while (expected_token <= TOTAL_TOKENS) + { + uint32_t got; + while (retro_spsc_read_avail(&s->q) < sizeof(got)) + ; /* spin */ + retro_spsc_read(&s->q, &got, sizeof(got)); + if (got != expected_token) + s->mismatches++; + expected_token++; + s->consumed_tokens++; + } +} + +static int run_stress(void) +{ + test_state_t s; + sthread_t *prod; + sthread_t *cons; + + memset(&s, 0, sizeof(s)); + if (!retro_spsc_init(&s.q, BUF_BYTES)) + { + fprintf(stderr, "FAIL: retro_spsc_init\n"); + return 1; + } + + prod = sthread_create(producer_thread, &s); + if (!prod) + { + fprintf(stderr, "FAIL: sthread_create(producer)\n"); + retro_spsc_free(&s.q); + return 1; + } + cons = sthread_create(consumer_thread, &s); + if (!cons) + { + fprintf(stderr, "FAIL: sthread_create(consumer)\n"); + sthread_join(prod); + retro_spsc_free(&s.q); + return 1; + } + + sthread_join(prod); + sthread_join(cons); + retro_spsc_free(&s.q); + + if (s.mismatches != 0) + { + fprintf(stderr, + "FAIL: %lu mismatched tokens out of %lu\n", + s.mismatches, s.consumed_tokens); + return 1; + } + if (s.produced_tokens != TOTAL_TOKENS + || s.consumed_tokens != TOTAL_TOKENS) + { + fprintf(stderr, + "FAIL: produced=%lu consumed=%lu (expected %d each)\n", + s.produced_tokens, s.consumed_tokens, TOTAL_TOKENS); + return 1; + } + + printf("[pass] stress: %d tokens through %d-byte buffer, " + "0 mismatches\n", + TOTAL_TOKENS, BUF_BYTES); + 
 return 0; +} + +/* Single-threaded property checks: verify the API contracts that + * don't require concurrency. */ +static int run_property_checks(void) +{ + retro_spsc_t q; + uint8_t buf[64]; + uint8_t readback[64]; + size_t i, n; + + /* init with non-power-of-2 should round up */ + if (!retro_spsc_init(&q, 100)) + { + fprintf(stderr, "FAIL: init(100)\n"); + return 1; + } + if (q.capacity != 128) + { + fprintf(stderr, "FAIL: capacity %zu != 128 (round up)\n", + q.capacity); + retro_spsc_free(&q); + return 1; + } + + /* fresh queue: read_avail = 0, write_avail = capacity */ + if (retro_spsc_read_avail(&q) != 0 + || retro_spsc_write_avail(&q) != 128) + { + fprintf(stderr, "FAIL: fresh-queue avails\n"); + retro_spsc_free(&q); + return 1; + } + + /* write 64, read 64, verify */ + for (i = 0; i < sizeof(buf); i++) + buf[i] = (uint8_t)(i + 1); + n = retro_spsc_write(&q, buf, sizeof(buf)); + if (n != sizeof(buf)) + { + fprintf(stderr, "FAIL: write returned %zu\n", n); + retro_spsc_free(&q); + return 1; + } + if (retro_spsc_read_avail(&q) != sizeof(buf)) + { + fprintf(stderr, "FAIL: read_avail after write\n"); + retro_spsc_free(&q); + return 1; + } + memset(readback, 0, sizeof(readback)); + n = retro_spsc_read(&q, readback, sizeof(readback)); + if (n != sizeof(buf) || memcmp(buf, readback, sizeof(buf)) != 0) + { + fprintf(stderr, "FAIL: read content mismatch\n"); + retro_spsc_free(&q); + return 1; + } + + /* peek does not advance */ + retro_spsc_write(&q, buf, 32); + if (retro_spsc_peek(&q, readback, 32) != 32) + { + fprintf(stderr, "FAIL: peek returned wrong size\n"); + retro_spsc_free(&q); + return 1; + } + if (retro_spsc_read_avail(&q) != 32) + { + fprintf(stderr, "FAIL: peek advanced read cursor\n"); + retro_spsc_free(&q); + return 1; + } + /* drain so wraparound test starts from a known state */ + retro_spsc_read(&q, readback, 32); + + /* wraparound: write 64 (forces wrap: the 64-byte and 32-byte + * write/read pairs above leave both cursors at 96 of the + * 128-byte capacity, so this write splits 32 + 32 around the + * end of the buffer) */ + for (i = 0; i < sizeof(buf); i++) + 
buf[i] = (uint8_t)(0x80 + i); + n = retro_spsc_write(&q, buf, sizeof(buf)); + if (n != sizeof(buf)) + { + fprintf(stderr, "FAIL: wraparound write\n"); + retro_spsc_free(&q); + return 1; + } + memset(readback, 0, sizeof(readback)); + n = retro_spsc_read(&q, readback, sizeof(buf)); + if (n != sizeof(buf) || memcmp(buf, readback, sizeof(buf)) != 0) + { + fprintf(stderr, "FAIL: wraparound read content mismatch\n"); + retro_spsc_free(&q); + return 1; + } + + /* clear discards unread data without reallocating buffer */ + retro_spsc_write(&q, buf, 50); + if (retro_spsc_read_avail(&q) != 50) + { + fprintf(stderr, "FAIL: pre-clear read_avail\n"); + retro_spsc_free(&q); + return 1; + } + retro_spsc_clear(&q); + if (retro_spsc_read_avail(&q) != 0 + || retro_spsc_write_avail(&q) != 128) + { + fprintf(stderr, "FAIL: clear did not reset cursors\n"); + retro_spsc_free(&q); + return 1; + } + /* queue is reusable after clear */ + retro_spsc_write(&q, buf, 16); + memset(readback, 0, sizeof(readback)); + n = retro_spsc_read(&q, readback, 16); + if (n != 16 || memcmp(buf, readback, 16) != 0) + { + fprintf(stderr, "FAIL: post-clear read mismatch\n"); + retro_spsc_free(&q); + return 1; + } + + retro_spsc_free(&q); + printf("[pass] property checks\n"); + return 0; +} + +int main(void) +{ + if (run_property_checks() != 0) + return 1; + if (run_stress() != 0) + return 1; + puts("ALL OK"); + return 0; +} diff --git a/menu/drivers/xmb.c b/menu/drivers/xmb.c index db650c2a6ef3..ff34f03b4873 100644 --- a/menu/drivers/xmb.c +++ b/menu/drivers/xmb.c @@ -653,7 +653,12 @@ const char* xmb_theme_ident(void) * uninit `flags & FADE_ACTIVE` would call * gfx_animation_kill_by_tag on stale state. * - The lazy thumbnail path resolution in xmb_render reads - * `icon_path[0]` to decide whether resolution is needed. */ + * `icon_path[0]` to decide whether resolution is needed. 
+ * + * gfx_thumbnail_init_blank() (rather than memset) is needed + * because gfx_thumbnail_t.status is now atomically-typed; a + * memset of a struct containing std::atomic warns under + * CXX_BUILD's C++ compile of this file. */ static xmb_node_t *xmb_alloc_node(void) { xmb_node_t *node = (xmb_node_t*)malloc(sizeof(*node)); @@ -664,8 +669,7 @@ static xmb_node_t *xmb_alloc_node(void) node->alpha = node->label_alpha = 0; node->zoom = node->x = node->y = 0; node->icon = node->content_icon = 0; - memset(&node->thumbnail_icon.icon, 0, - sizeof(node->thumbnail_icon.icon)); + gfx_thumbnail_init_blank(&node->thumbnail_icon.icon); node->thumbnail_icon.thumbnail_path_data.icon_path[0] = '\0'; node->fullpath = NULL; node->console_name = NULL; diff --git a/samples/gfx/gfx_thumbnail_status_atomic/Makefile b/samples/gfx/gfx_thumbnail_status_atomic/Makefile new file mode 100644 index 000000000000..f4e5edba934e --- /dev/null +++ b/samples/gfx/gfx_thumbnail_status_atomic/Makefile @@ -0,0 +1,50 @@ +TARGET := gfx_thumbnail_status_atomic_test + +# Path back to the repo root from this sample dir. The test references +# headers from libretro-common/include and (when HAVE_THREADS=1) compiles +# rthreads.c from libretro-common/rthreads. +REPO_ROOT := ../../.. +LIBRETRO_COMM_DIR := $(REPO_ROOT)/libretro-common + +# This sample mirrors gfx_thumbnail.c's atomic-status synchronisation +# pattern in isolation; the real gfx_thumbnail.c is not linked. The +# stress test exercises the producer/consumer shape through rthreads +# when HAVE_THREADS is enabled. Without HAVE_THREADS the test still +# runs the static-assertion and round-trip property checks, which +# validate the type/layout invariants that the C11 stdatomic and C++11 +# std::atomic backends rely on. 
+HAVE_THREADS ?= 1 + +SOURCES := gfx_thumbnail_status_atomic_test.c + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 \ + -I$(LIBRETRO_COMM_DIR)/include + +ifeq ($(HAVE_THREADS),1) + CFLAGS += -DHAVE_THREADS + SOURCES += $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + LDFLAGS += -lpthread + # rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on + # older glibc those live in -lrt. Harmless on newer glibc. + LDFLAGS += -lrt +endif + +OBJS := $(SOURCES:.c=.o) + +ifneq ($(SANITIZER),) + CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) + LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) +endif + +all: $(TARGET) + +%.o: %.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(TARGET): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) $(OBJS) + +.PHONY: clean diff --git a/samples/gfx/gfx_thumbnail_status_atomic/gfx_thumbnail_status_atomic_test.c b/samples/gfx/gfx_thumbnail_status_atomic/gfx_thumbnail_status_atomic_test.c new file mode 100644 index 000000000000..6d48d922115a --- /dev/null +++ b/samples/gfx/gfx_thumbnail_status_atomic/gfx_thumbnail_status_atomic_test.c @@ -0,0 +1,453 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (gfx_thumbnail_status_atomic_test.c). 
+ * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Regression test for the cross-thread thumbnail status + * synchronisation in gfx/gfx_thumbnail.c. + * + * The .status field of gfx_thumbnail_t is read by the video + * thread (in gfx_thumbnail_draw) and written by the upload- + * callback thread (in gfx_thumbnail_handle_upload): + * + * --- upload-callback thread --- + * thumbnail->texture = ...; + * thumbnail->width = ...; + * thumbnail->height = ...; + * GFX_THUMB_STATUS_STORE(&thumbnail->status, + * GFX_THUMBNAIL_STATUS_AVAILABLE); + * + * --- video thread --- + * if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) + * == GFX_THUMBNAIL_STATUS_AVAILABLE) { + * uintptr_t tex = thumbnail->texture; + * unsigned w = thumbnail->width; + * unsigned h = thumbnail->height; + * ... 
+ * } + * + * On weakly-ordered SMP hardware (ARM, AArch64, PowerPC), this + * pattern requires release-store / acquire-load barriers: + * without them the video thread can observe the AVAILABLE + * status before the texture/width/height writes are visible, + * leading to garbage thumbnail rendering or div-by-zero in the + * aspect-ratio code. Pre-fix gfx_thumbnail.c had its own + * hand-rolled atomic shim covering GCC __atomic_*, MSVC + * Interlocked, and a volatile fallback; the port to + * retro_atomic.h replaces the shim with the portable API and + * stores the field as retro_atomic_int_t so the type system + * enforces the discipline. + * + * This test exercises two failure modes: + * + * 1. Compile-time: the .status field MUST be retro_atomic_int_t + * (not a plain enum), so that on the C11 stdatomic and + * C++11 std::atomic backends a future contributor cannot + * accidentally bypass the API with a plain `t->status = X` + * assignment -- those backends would refuse to compile + * such an assignment. We assert this with a static + * check that verifies sizeof(.status) == sizeof(int) + * (preserving struct layout) and that the macros + * GFX_THUMB_STATUS_STORE / GFX_THUMB_STATUS_LOAD route + * through retro_atomic_*_int (verified at preprocessing + * by token-pasting their expansion). + * + * 2. Runtime: a 1M-iteration producer/consumer stress test + * that mirrors the upload/draw shape: producer writes a + * monotonic (texture, width, height) triple before + * publishing AVAILABLE; consumer waits for AVAILABLE and + * verifies the triple is internally consistent (all three + * fields belong to the same generation). Intended to be + * run under ThreadSanitizer, which instruments every + * atomic load/store and would flag the missing-barrier + * case as a data race even on x86 TSO (where the + * hardware otherwise hides the bug). 
+ * + * If gfx_thumbnail.c amends GFX_THUMB_STATUS_STORE / + * GFX_THUMB_STATUS_LOAD or the .status field type, the + * verbatim copies in this test must follow. + */ + +#include +#include +#include + +#include + +/* Needs HAVE_THREADS from the build for the SPSC stress; fall + * through to the static-assert checks otherwise. */ +#ifdef HAVE_THREADS +#include +#endif + +/* === Verbatim mirror of gfx_thumbnail.h's relevant pieces === */ + +enum gfx_thumbnail_status +{ + GFX_THUMBNAIL_STATUS_UNKNOWN = 0, + GFX_THUMBNAIL_STATUS_PENDING, + GFX_THUMBNAIL_STATUS_AVAILABLE, + GFX_THUMBNAIL_STATUS_MISSING +}; + +/* Mirror of gfx_thumbnail_t with the same field order/types. + * Layout must match gfx_thumbnail.h's definition; size/offset + * checks below validate this. */ +typedef struct +{ + uintptr_t texture; + unsigned width; + unsigned height; + float alpha; + float delay_timer; + retro_atomic_int_t status; + uint8_t flags; +} gfx_thumbnail_t; + +/* Verbatim mirror of the macros from gfx/gfx_thumbnail.c. + * If the originals change, this block must follow. */ +#define GFX_THUMB_STATUS_STORE(ptr, val) \ + retro_atomic_store_release_int((retro_atomic_int_t*)(void*)(ptr), (int)(val)) +#define GFX_THUMB_STATUS_LOAD(ptr) \ + ((enum gfx_thumbnail_status)retro_atomic_load_acquire_int((retro_atomic_int_t*)(void*)(ptr))) + +/* Local mirror of gfx_thumbnail_init_blank. The test cannot + * include gfx_thumbnail.h directly because it deliberately + * shadows that header's type definitions to verify layout + * invariants -- so we mirror the helper as well. Field-by- + * field zero-init is required (rather than memset) when the + * test is compiled as C++ via CXX_BUILD: the .status field is + * std::atomic, making gfx_thumbnail_t non-trivially- + * copyable, and memset of such a struct warns + * -Wclass-memaccess. 
*/ +static void gfx_thumbnail_test_init_blank(gfx_thumbnail_t *t) +{ + t->texture = 0; + t->width = 0; + t->height = 0; + t->alpha = 0.0f; + t->delay_timer = 0.0f; + retro_atomic_int_init(&t->status, GFX_THUMBNAIL_STATUS_UNKNOWN); + t->flags = 0; +} + +/* === Compile-time invariants === */ + +/* Size: the atomic-typed status field must be the same size as + * a plain int on every supported backend, so swapping it in for + * `enum gfx_thumbnail_status` doesn't change the gfx_thumbnail_t + * layout. */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +_Static_assert(sizeof(((gfx_thumbnail_t*)0)->status) == sizeof(int), + "gfx_thumbnail_t.status must be int-sized for ABI compatibility"); +#elif defined(__cplusplus) && __cplusplus >= 201103L +static_assert(sizeof(((gfx_thumbnail_t*)0)->status) == sizeof(int), + "gfx_thumbnail_t.status must be int-sized for ABI compatibility"); +#else +/* Pre-C11 fallback: array-size negative-on-failure trick. */ +typedef char _gfx_thumb_status_size_check[ + (sizeof(((gfx_thumbnail_t*)0)->status) == sizeof(int)) ? 1 : -1]; +#endif + +/* Backend sanity: retro_atomic.h must be selected and lock-free. + * The volatile fallback is correct only on x86 TSO / single-core + * targets and would not provide barriers on weak-memory hardware, + * which is exactly what the upload/draw pairing relies on. */ +#if !defined(HAVE_RETRO_ATOMIC) +# error "retro_atomic.h: HAVE_RETRO_ATOMIC not defined" +#endif + +/* === Runtime SPSC stress (HAVE_THREADS only) === */ + +#ifdef HAVE_THREADS + +/* Number of producer/consumer iterations. 1M is enough to flush + * out a missing-barrier bug under TSan within a few seconds; on + * real weak-memory hardware (qemu-aarch64) it surfaces faster. + * Adjust upward if the bug becomes harder to reproduce. */ +#define STRESS_ITERS 1000000 + +/* SPSC handshake state. 
+ * + * The real gfx_thumbnail upload/draw pairing is single-publication: + * the upload-callback thread writes the texture once and publishes + * AVAILABLE; the video thread observes AVAILABLE and reads the + * triple every frame thereafter. There is no high-frequency + * back-and-forth. + * + * To mirror that shape under stress we need a per-generation + * handshake so the texture / width / height fields are NOT being + * rewritten while the consumer is reading them. Without that, + * a tight loop where the producer rewrites the data while the + * consumer reads it would generate torn reads even with correct + * barriers, because the data is mutating concurrently -- a + * property of the test design, not of the synchronisation + * primitives. + * + * We pair two atomic counters for the handshake: + * - `produced`: producer increments it after staging a generation, + * via release-store so the staged data is visible to any thread + * that observes the new value. Consumer waits for it to exceed + * the last-seen value via acquire-load. This is the actual + * publish edge the consumer keys on (rather than a status + * transition, which would require the consumer to observe a + * transient UNKNOWN that the producer might overwrite faster + * than the consumer can poll). + * - `consumed`: consumer increments it after reading; producer + * waits for `consumed == produced` before staging the next + * generation. + * + * The .status field (the actual subject of the test) is updated + * on the same pattern as in gfx_thumbnail.c -- each generation + * stores AVAILABLE through GFX_THUMB_STATUS_STORE before the + * counter bump, and the consumer reads it through + * GFX_THUMB_STATUS_LOAD as part of the per-generation read. This + * exercises the macros under test even though the counter is what + * gates the handshake. */ +typedef struct +{ + gfx_thumbnail_t thumb; + /* Producer's generation counter. 
Bumped via release-store + * after staging the data; consumer reads via acquire-load. */ + retro_atomic_int_t produced; + /* Consumer's acknowledgement counter. Bumped via release- + * store after reading; producer reads via acquire-load. */ + retro_atomic_int_t consumed; + /* Counter of mismatches the consumer observed. Read by main + * thread after the join, so it doesn't need to be atomic. */ + unsigned long mismatches; +} stress_state_t; + +static void producer_thread(void *arg) +{ + stress_state_t *s = (stress_state_t *)arg; + int i; + + for (i = 1; i <= STRESS_ITERS; i++) + { + /* Wait for the consumer to acknowledge the previous + * generation. On the first iteration this is true + * immediately (consumed starts at 0). */ + while (retro_atomic_load_acquire_int(&s->consumed) != (i - 1)) + ; /* spin */ + + /* Stage the generation's data (no ordering required + * between these). */ + s->thumb.texture = (uintptr_t)i; + s->thumb.width = (unsigned)i; + s->thumb.height = (unsigned)i; + + /* Update status via the macros under test. Same shape as + * the production gfx_thumbnail.c upload path: + * STATUS_STORE(AVAILABLE) after staging the texture, with + * release-store semantics that fence the earlier writes. */ + GFX_THUMB_STATUS_STORE(&s->thumb.status, + GFX_THUMBNAIL_STATUS_AVAILABLE); + + /* Publish the new generation. Release-store so the staged + * data and the AVAILABLE status are visible before the + * consumer observes the new generation. */ + retro_atomic_store_release_int(&s->produced, i); + } + + /* Wait for the final ack, then signal shutdown via a + * sentinel value `produced = STRESS_ITERS + 1`, paired with + * a final STATUS_STORE so the consumer exercises the macro + * one last time. 
*/ + while (retro_atomic_load_acquire_int(&s->consumed) != STRESS_ITERS) + ; /* drain */ + GFX_THUMB_STATUS_STORE(&s->thumb.status, + GFX_THUMBNAIL_STATUS_MISSING); + retro_atomic_store_release_int(&s->produced, STRESS_ITERS + 1); +} + +static void consumer_thread(void *arg) +{ + stress_state_t *s = (stress_state_t *)arg; + unsigned long mismatches = 0; + int expected = 1; + + /* Per-generation loop. Wait for produced >= expected, then + * read the staged data and the status. The status read + * goes through GFX_THUMB_STATUS_LOAD (the macro under test); + * it should always observe AVAILABLE for the current + * generation. Both reads are acquire-loads; the producer's + * release-stores ensure ordering between staging and + * publication. */ + for (;;) + { + int prod; + uintptr_t tex; + unsigned w, h; + enum gfx_thumbnail_status st; + + /* Wait for the producer to publish the next generation + * (or the shutdown sentinel STRESS_ITERS + 1). */ + do { + prod = retro_atomic_load_acquire_int(&s->produced); + } while (prod < expected); + + if (prod == STRESS_ITERS + 1) + break; + + /* Read the staged data and status. The acquire-load on + * `produced` fences these reads; if any field disagrees + * with the expected generation, the fence failed. We + * additionally exercise GFX_THUMB_STATUS_LOAD here -- it + * should always observe AVAILABLE for the current + * generation. */ + tex = s->thumb.texture; + w = s->thumb.width; + h = s->thumb.height; + st = GFX_THUMB_STATUS_LOAD(&s->thumb.status); + + if (tex != (uintptr_t)expected + || w != (unsigned)expected + || h != (unsigned)expected + || st != GFX_THUMBNAIL_STATUS_AVAILABLE) + mismatches++; + + /* Ack: producer waits on this before staging the next + * generation. 
*/ + retro_atomic_store_release_int(&s->consumed, expected); + expected++; + } + + s->mismatches = mismatches; +} + +static int run_stress(void) +{ + stress_state_t s; + sthread_t *prod; + sthread_t *cons; + + /* Field-by-field init, not memset, to silence + * -Wclass-memaccess under CXX_BUILD where .status is + * std::atomic and the struct is non-trivially-copyable. */ + gfx_thumbnail_test_init_blank(&s.thumb); + retro_atomic_int_init(&s.produced, 0); + retro_atomic_int_init(&s.consumed, 0); + s.mismatches = 0; + + prod = sthread_create(producer_thread, &s); + if (!prod) + { + fprintf(stderr, "FAIL: sthread_create(producer)\n"); + return 1; + } + cons = sthread_create(consumer_thread, &s); + if (!cons) + { + fprintf(stderr, "FAIL: sthread_create(consumer)\n"); + sthread_join(prod); + return 1; + } + + sthread_join(prod); + sthread_join(cons); + + if (s.mismatches != 0) + { + fprintf(stderr, + "FAIL: SPSC stress observed %lu mismatched reads " + "out of %d iterations\n", s.mismatches, STRESS_ITERS); + return 1; + } + + printf("[pass] SPSC stress: %d iterations, 0 mismatches\n", + STRESS_ITERS); + return 0; +} + +#else /* HAVE_THREADS */ + +static int run_stress(void) +{ + /* Compile-time-only mode: skip stress, but the static + * assertions above still validated the type/layout invariants. */ + printf("[skip] HAVE_THREADS not defined; static checks only\n"); + return 0; +} + +#endif /* HAVE_THREADS */ + +/* === Single-threaded property checks === */ + +static int run_property_checks(void) +{ + gfx_thumbnail_t t; + /* Field-by-field init, not memset, to silence + * -Wclass-memaccess under CXX_BUILD. */ + gfx_thumbnail_test_init_blank(&t); + + /* Round-trip every enum value through STORE/LOAD. 
*/ + { + const enum gfx_thumbnail_status vals[] = { + GFX_THUMBNAIL_STATUS_UNKNOWN, + GFX_THUMBNAIL_STATUS_PENDING, + GFX_THUMBNAIL_STATUS_AVAILABLE, + GFX_THUMBNAIL_STATUS_MISSING, + }; + size_t i; + for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) + { + GFX_THUMB_STATUS_STORE(&t.status, vals[i]); + if (GFX_THUMB_STATUS_LOAD(&t.status) != vals[i]) + { + fprintf(stderr, + "FAIL: STORE/LOAD round-trip on value %d\n", + (int)vals[i]); + return 1; + } + } + } + + /* The acquire-load must read what the release-store wrote + * (single-threaded, so no ordering question, just contract + * verification). */ + GFX_THUMB_STATUS_STORE(&t.status, GFX_THUMBNAIL_STATUS_AVAILABLE); + if (GFX_THUMB_STATUS_LOAD(&t.status) != GFX_THUMBNAIL_STATUS_AVAILABLE) + { + fprintf(stderr, "FAIL: AVAILABLE not observable after store\n"); + return 1; + } + + printf("[pass] STORE/LOAD round-trip on all four status values\n"); + return 0; +} + +int main(void) +{ + printf("retro_atomic backend: %s\n", RETRO_ATOMIC_BACKEND_NAME); +#ifdef RETRO_ATOMIC_LOCK_FREE + printf("retro_atomic lock-free: yes\n"); +#else + printf("retro_atomic lock-free: NO (volatile fallback)\n"); +#endif + + if (run_property_checks() != 0) + return 1; + if (run_stress() != 0) + return 1; + + puts("ALL OK"); + return 0; +} diff --git a/samples/gfx/gfx_widgets_msg_queue_race/Makefile b/samples/gfx/gfx_widgets_msg_queue_race/Makefile new file mode 100644 index 000000000000..b8f62bad58b3 --- /dev/null +++ b/samples/gfx/gfx_widgets_msg_queue_race/Makefile @@ -0,0 +1,52 @@ +TARGET := gfx_widgets_msg_queue_race_test + +# Path back to the repo root from this sample dir. The test references +# headers from libretro-common/include and links against +# libretro-common/queues/fifo_queue.c (the FIFO this regression test +# exercises) and libretro-common/rthreads/rthreads.c (when HAVE_THREADS +# is enabled). +REPO_ROOT := ../../.. 
+LIBRETRO_COMM_DIR := $(REPO_ROOT)/libretro-common + +# This sample mirrors the post-fix locking discipline around +# dispgfx_widget_t::msg_queue in gfx/gfx_widgets.c. The test uses +# fifo_buffer_t directly (compiled from libretro-common) but does NOT +# pull in gfx_widgets.c itself -- the fix is a locking protocol, +# not a code change to the FIFO primitive. +HAVE_THREADS ?= 1 + +SOURCES := \ + gfx_widgets_msg_queue_race_test.c \ + $(LIBRETRO_COMM_DIR)/queues/fifo_queue.c + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 \ + -I$(LIBRETRO_COMM_DIR)/include + +ifeq ($(HAVE_THREADS),1) + CFLAGS += -DHAVE_THREADS + SOURCES += $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + LDFLAGS += -lpthread + # rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on + # older glibc those live in -lrt. Harmless on newer glibc. + LDFLAGS += -lrt +endif + +OBJS := $(SOURCES:.c=.o) + +ifneq ($(SANITIZER),) + CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) + LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) +endif + +all: $(TARGET) + +%.o: %.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(TARGET): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) $(OBJS) + +.PHONY: clean diff --git a/samples/gfx/gfx_widgets_msg_queue_race/gfx_widgets_msg_queue_race_test.c b/samples/gfx/gfx_widgets_msg_queue_race/gfx_widgets_msg_queue_race_test.c new file mode 100644 index 000000000000..30a307d15be1 --- /dev/null +++ b/samples/gfx/gfx_widgets_msg_queue_race/gfx_widgets_msg_queue_race_test.c @@ -0,0 +1,497 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (gfx_widgets_msg_queue_race_test.c). 
+ * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Regression test for the producer-producer race fix on + * dispgfx_widget_t::msg_queue in gfx/gfx_widgets.c. + * + * Pre-fix: gfx_widgets_msg_queue_push called fifo_write on + * p_dispwidget->msg_queue without holding any lock. The + * function is reachable from three callers: + * + * 1. runloop.c:runloop_msg_queue_push (40+ files transitively + * reach this; many run on threaded task workers via + * libretro-common/queues/task_queue.c). Holds + * RUNLOOP_MSG_QUEUE_LOCK across the call. + * 2. runloop.c:runloop_task_msg_queue_push. Same lock. + * 3. gfx/video_driver.c:video_driver_frame. Acquires + * RUNLOOP_MSG_QUEUE_LOCK to drain runloop_st->msg_queue, + * RELEASES it, then calls gfx_widgets_msg_queue_push + * WITHOUT the lock. + * + * Path 3 runs on the main thread; path 1 can run on any thread. 
+ * RUNLOOP_MSG_QUEUE_LOCK protects runloop_st->msg_queue, NOT + * p_dispwidget->msg_queue -- the lock name reflects the runloop + * struct it was named after. So path 3's fifo_write (no lock) + * can race with path 1's fifo_write (lock held but irrelevant + * to path 3). + * + * The consumer (gfx_widgets_iterate -> fifo_read) ran inside + * RUNLOOP_MSG_QUEUE_LOCK at runloop.c:6048 and inside + * current_msgs_lock at gfx_widgets.c:973 -- but neither lock + * is taken on path 3, so the consumer's fifo_read could observe + * a half-updated `end` cursor from path 3's fifo_write. + * + * Outcome of a race: torn `end` cursor publishes a bad pointer + * in the disp_widget_msg_t* slots. fifo_read returns garbage, + * the consumer dereferences it as a disp_widget_msg_t* in + * gfx_widgets_iterate, and we get a use-after-free / segfault. + * On x86 TSO the bug is masked most of the time by hardware + * ordering; on Win-on-ARM / Apple Silicon (AArch64) it surfaces + * more often. + * + * Fix: a dedicated msg_queue_lock on dispgfx_widget_t, acquired + * inside gfx_widgets_msg_queue_push around the fifo_write + * (with an avail re-check, since fifo_write does no bounds + * checking) and inside gfx_widgets_iterate around the fifo_read. + * msg_queue_lock is the inner lock; current_msgs_lock is the + * outer lock when both are held. + * + * This test exercises the post-fix shape: + * + * - Two producer threads each call a stripped-down version of + * gfx_widgets_msg_queue_push that mirrors the post-fix + * locking discipline (msg_queue_lock around fifo_write + + * avail re-check + rollback on race-loss). + * + * - One consumer thread calls a stripped-down version of the + * gfx_widgets_iterate FIFO-handling block: take + * msg_queue_lock, fifo_read, release. Then validate the + * pointer it got is one we actually pushed. + * + * - Run for STRESS_ITERS iterations under ThreadSanitizer. 
+ * halt_on_error=1 means the first race aborts the run
+ * non-zero, which the workflow gates on. If the locking
+ * in the production code is removed or weakened (e.g.
+ * someone drops the lock around fifo_write thinking the
+ * outer FIFO_WRITE_AVAIL check is enough), TSan will see
+ * concurrent writes to fifo_buffer_t::end and flag the race.
+ *
+ * The test does NOT mirror the full gfx_widgets_msg_queue_push
+ * function -- the message-widget allocation, font measurement,
+ * and animation push are irrelevant to the race. What matters
+ * is the lock protocol around the fifo_buffer_t access, which
+ * is what we exercise.
+ *
+ * If gfx_widgets.c amends the locking around msg_queue, the
+ * verbatim copies in this test must follow. Convention used by
+ * gfx_thumbnail_status_atomic_test, vulkan_extension_count_test,
+ * and others under samples/.
+ */
+
+#include <stdio.h>   /* fprintf, printf */
+#include <stdlib.h>  /* malloc, free */
+#include <string.h>
+#include <stdint.h>  /* uint32_t */
+
+#include <queues/fifo_queue.h>  /* fifo_buffer_t, fifo_read/write, FIFO_*_AVAIL_NONPTR */
+#include <retro_atomic.h>       /* retro_atomic_int_t -- NOTE(review): reconstructed include name; verify against repo */
+
+#ifdef HAVE_THREADS
+#include <rthreads/rthreads.h>  /* slock_t, sthread_t */
+#endif
+
+/* MSG_QUEUE_PENDING_MAX in gfx_widgets.c. Mirrored here for
+ * realism, but the test works at any reasonable size. */
+#define MSG_QUEUE_PENDING_MAX 32
+
+/* Stand-in for disp_widget_msg_t. The real struct has 30+
+ * fields; the FIFO only stores the POINTER, not the contents.
+ * For race-checking we just need allocations whose addresses
+ * we can validate. */
+typedef struct test_msg
+{
+   uint32_t generation;
+   uint32_t producer_id;
+} test_msg_t;
+
+/* Mirror of dispgfx_widget_t's msg_queue + lock pieces. The
+ * real struct has many more fields; the test only models the
+ * synchronisation invariant. */
+typedef struct
+{
+   fifo_buffer_t msg_queue;
+#ifdef HAVE_THREADS
+   slock_t *msg_queue_lock;
+#endif
+} dispgfx_widget_test_t;
+
+/* === Verbatim mirror of gfx_widgets_msg_queue_push's FIFO
+ * interaction (post-fix). Strips out the widget-allocation,
+ * font-measurement, and animation-push work that is not
+ * part of the race; what remains is the lock protocol.
+ * If gfx_widgets.c amends the locking, this block must
+ * follow. === */
+static int test_msg_queue_push(
+      dispgfx_widget_test_t *p_dispwidget,
+      test_msg_t *msg_widget)
+{
+   /* No outer FIFO_WRITE_AVAIL fast-path: reading the FIFO
+    * cursors outside msg_queue_lock would race with concurrent
+    * producer fifo_writes (TSan-detectable). The locked
+    * avail re-check below is the correctness gate. */
+#ifdef HAVE_THREADS
+   bool fifo_full;
+   slock_lock(p_dispwidget->msg_queue_lock);
+   fifo_full = (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue)
+         < sizeof(msg_widget)); /* sizeof(msg_widget): pointer size -- the FIFO stores pointers, not structs */
+   if (!fifo_full)
+      fifo_write(&p_dispwidget->msg_queue,
+            &msg_widget, sizeof(msg_widget));
+   slock_unlock(p_dispwidget->msg_queue_lock);
+
+   if (fifo_full)
+      return 0; /* race-loss or full: caller's responsibility to free */
+   return 1;
+#else
+   if (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue) < sizeof(msg_widget))
+      return 0;
+   fifo_write(&p_dispwidget->msg_queue,
+         &msg_widget, sizeof(msg_widget));
+   return 1;
+#endif
+}
+
+/* === Verbatim mirror of gfx_widgets_iterate's FIFO read
+ * block (post-fix). Strips out the current_msgs[] insertion
+ * logic; what remains is the lock protocol. If gfx_widgets.c
+ * amends the locking, this block must follow. === */
+static test_msg_t *test_msg_queue_consume(
+      dispgfx_widget_test_t *p_dispwidget)
+{
+   test_msg_t *msg_widget = NULL;
+
+   /* No outer FIFO_READ_AVAIL fast-path: reading the FIFO cursors
+    * outside msg_queue_lock would race with concurrent producer
+    * fifo_writes. The locked re-check below is the gate. */
+#ifdef HAVE_THREADS
+   slock_lock(p_dispwidget->msg_queue_lock);
+#endif
+   if (FIFO_READ_AVAIL_NONPTR(p_dispwidget->msg_queue) >= sizeof(msg_widget))
+      fifo_read(&p_dispwidget->msg_queue,
+            &msg_widget, sizeof(msg_widget));
+#ifdef HAVE_THREADS
+   slock_unlock(p_dispwidget->msg_queue_lock);
+#endif
+   return msg_widget; /* NULL when the FIFO was empty */
+}
+/* === end verbatim copies === */
+
+#ifdef HAVE_THREADS
+
+/* Number of pushes per producer thread. 500K * 2 producers gives
+ * 1M total push attempts. Enough to flush out a missing-lock
+ * regression under TSan within a few seconds. TSan halt_on_error=1
+ * means the first observed race aborts the run, so the iteration
+ * count primarily affects how reliably we trigger the bug if it
+ * regresses, not the cost of the success case. */
+#define STRESS_ITERS 500000
+
+typedef struct
+{
+   dispgfx_widget_test_t *widget;
+   uint32_t producer_id;
+   uint32_t pushed_ok;   /* pushes accepted by test_msg_queue_push */
+   uint32_t pushed_full; /* pushes rejected (FIFO full at the locked re-check) */
+} producer_ctx_t;
+
+typedef struct
+{
+   dispgfx_widget_test_t *widget;
+   uint32_t consumed;
+   int failed; /* set non-zero on a bogus pointer */
+   retro_atomic_int_t *stop; /* producers-done flag, written by run_stress */
+} consumer_ctx_t;
+
+static void producer_thread(void *arg)
+{
+   producer_ctx_t *ctx = (producer_ctx_t*)arg;
+   uint32_t i;
+   for (i = 0; i < STRESS_ITERS; i++)
+   {
+      test_msg_t *m = (test_msg_t*)malloc(sizeof(*m));
+      if (!m)
+         continue; /* OOM: skip; cosmetic-error path */
+      m->generation = i;
+      m->producer_id = ctx->producer_id;
+
+      if (test_msg_queue_push(ctx->widget, m))
+         ctx->pushed_ok++;
+      else
+      {
+         /* Lost the race or FIFO full. Free our allocation. */
+         free(m);
+         ctx->pushed_full++;
+      }
+   }
+}
+
+static void consumer_thread(void *arg)
+{
+   consumer_ctx_t *ctx = (consumer_ctx_t*)arg;
+   while (!retro_atomic_load_acquire_int(ctx->stop))
+   {
+      test_msg_t *m = test_msg_queue_consume(ctx->widget);
+      if (m)
+      {
+         /* Validate that what we got back is plausibly one of
+          * the things a producer pushed. generation < STRESS_ITERS,
+          * producer_id is one of the small set we spawned. A
+          * torn pointer from a missing-lock race would typically
+          * fail this check (garbage memory), but the primary
+          * regression signal here is TSan flagging the
+          * concurrent unsynchronised access on the FIFO cursors,
+          * not this content check. This is belt-and-suspenders.
+          */
+         if (m->generation >= STRESS_ITERS || m->producer_id >= 16) /* 16: loose bound -- only 2 producers are spawned */
+         {
+            ctx->failed = 1;
+            free(m);
+            return; /* bail without draining; run_stress checks .failed before counts */
+         }
+         ctx->consumed++;
+         free(m);
+      }
+   }
+   /* Drain anything left behind after producers stopped. */
+   for (;;)
+   {
+      test_msg_t *m = test_msg_queue_consume(ctx->widget);
+      if (!m)
+         break;
+      ctx->consumed++;
+      free(m);
+   }
+}
+
+static int run_stress(void)
+{
+   dispgfx_widget_test_t widget;
+   producer_ctx_t prod_ctx[2];
+   consumer_ctx_t cons_ctx;
+   sthread_t *prod_threads[2];
+   sthread_t *cons_thread;
+   retro_atomic_int_t stop;
+   int i;
+   uint32_t total_pushed = 0;
+
+   retro_atomic_int_init(&stop, 0);
+
+   if (!fifo_initialize(&widget.msg_queue,
+         MSG_QUEUE_PENDING_MAX * sizeof(test_msg_t*)))
+   {
+      fprintf(stderr, "fifo_initialize failed\n");
+      return 1;
+   }
+   widget.msg_queue_lock = slock_new();
+   if (!widget.msg_queue_lock)
+   {
+      fprintf(stderr, "slock_new failed\n");
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+
+   for (i = 0; i < 2; i++)
+   {
+      prod_ctx[i].widget = &widget;
+      prod_ctx[i].producer_id = (uint32_t)i;
+      prod_ctx[i].pushed_ok = 0;
+      prod_ctx[i].pushed_full = 0;
+   }
+   cons_ctx.widget = &widget;
+   cons_ctx.consumed = 0;
+   cons_ctx.failed = 0;
+   cons_ctx.stop = &stop;
+
+   cons_thread = sthread_create(consumer_thread, &cons_ctx);
+   prod_threads[0] = sthread_create(producer_thread, &prod_ctx[0]);
+   prod_threads[1] = sthread_create(producer_thread, &prod_ctx[1]);
+
+   if (!cons_thread || !prod_threads[0] || !prod_threads[1])
+   {
+      fprintf(stderr, "sthread_create failed\n");
+      return 1; /* NOTE(review): leaks lock/FIFO and abandons any threads already started -- acceptable for a failing test run, but consider cleanup */
+   }
+
+   sthread_join(prod_threads[0]);
+   sthread_join(prod_threads[1]);
+   retro_atomic_store_release_int(&stop, 1); /* only set after BOTH producers joined, so the consumer's post-stop drain sees every push */
+   sthread_join(cons_thread);
+
+   /* Drain anything still in the FIFO (consumer-side, single-
+    * threaded by this point so no lock needed -- but using the
+    * same helper for consistency).
+    */
+   for (;;)
+   {
+      test_msg_t *m = test_msg_queue_consume(&widget);
+      if (!m)
+         break;
+      cons_ctx.consumed++;
+      free(m);
+   }
+
+   total_pushed = prod_ctx[0].pushed_ok + prod_ctx[1].pushed_ok;
+
+   if (cons_ctx.failed)
+   {
+      fprintf(stderr, "FAIL: consumer saw torn / bogus pointer\n");
+      slock_free(widget.msg_queue_lock);
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+
+   if (cons_ctx.consumed != total_pushed)
+   {
+      fprintf(stderr,
+            "FAIL: consumed %u != pushed %u (P0=%u/%u, P1=%u/%u)\n",
+            cons_ctx.consumed, total_pushed,
+            prod_ctx[0].pushed_ok, prod_ctx[0].pushed_full,
+            prod_ctx[1].pushed_ok, prod_ctx[1].pushed_full);
+      slock_free(widget.msg_queue_lock);
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+
+   printf("[pass] stress: %u pushed (%u+%u), %u rejected on full, "
+         "%u consumed, no torn pointers\n",
+         total_pushed, prod_ctx[0].pushed_ok, prod_ctx[1].pushed_ok,
+         prod_ctx[0].pushed_full + prod_ctx[1].pushed_full,
+         cons_ctx.consumed);
+
+   slock_free(widget.msg_queue_lock);
+   fifo_deinitialize(&widget.msg_queue);
+   return 0;
+}
+
+#endif /* HAVE_THREADS */
+
+/* Property-style smoke test for the single-threaded path:
+ * push fills the FIFO, push past full returns 0, consume drains
+ * everything in order. */
+static int run_smoke(void)
+{
+   dispgfx_widget_test_t widget;
+   int i;
+   int pushed = 0;
+   int consumed = 0;
+   test_msg_t *messages[MSG_QUEUE_PENDING_MAX + 4];
+
+   if (!fifo_initialize(&widget.msg_queue,
+         MSG_QUEUE_PENDING_MAX * sizeof(test_msg_t*)))
+   {
+      fprintf(stderr, "fifo_initialize failed\n");
+      return 1;
+   }
+#ifdef HAVE_THREADS
+   widget.msg_queue_lock = slock_new();
+   if (!widget.msg_queue_lock)
+   {
+      fprintf(stderr, "slock_new failed\n");
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+#endif
+
+   /* Push more than the FIFO can hold; later pushes should
+    * return 0 (rejected on full).
+    */
+   for (i = 0; i < MSG_QUEUE_PENDING_MAX + 4; i++)
+   {
+      messages[i] = (test_msg_t*)malloc(sizeof(test_msg_t));
+      if (!messages[i])
+      {
+         fprintf(stderr, "malloc failed\n");
+         return 1; /* NOTE(review): earlier allocations + lock/FIFO leak here -- acceptable in a failing test */
+      }
+      messages[i]->generation = i;
+      messages[i]->producer_id = 0;
+      if (test_msg_queue_push(&widget, messages[i]))
+         pushed++;
+      else
+      {
+         /* push refused: free the allocation we tried to enqueue */
+         free(messages[i]);
+         messages[i] = NULL;
+      }
+   }
+
+   if (pushed == 0 || pushed > MSG_QUEUE_PENDING_MAX) /* range check, not exact: FIFO usable capacity may be size-1 bytes depending on fifo_queue internals -- TODO confirm */
+   {
+      fprintf(stderr, "FAIL: smoke pushed=%d; expected 1..%d\n",
+            pushed, MSG_QUEUE_PENDING_MAX);
+      return 1;
+   }
+
+   /* Drain. Order should be FIFO. */
+   for (i = 0; i < pushed; i++)
+   {
+      test_msg_t *m = test_msg_queue_consume(&widget);
+      if (!m)
+      {
+         fprintf(stderr, "FAIL: smoke consume returned NULL at i=%d\n", i);
+         return 1;
+      }
+      if (m->generation != (uint32_t)i)
+      {
+         fprintf(stderr,
+               "FAIL: smoke order: expected generation %d, got %u\n",
+               i, m->generation);
+         free(m);
+         return 1;
+      }
+      free(m);
+      consumed++;
+   }
+
+   if (test_msg_queue_consume(&widget))
+   {
+      fprintf(stderr, "FAIL: smoke FIFO non-empty after drain\n");
+      return 1;
+   }
+
+   /* Free any allocations whose push was refused. */
+   for (i = pushed; i < MSG_QUEUE_PENDING_MAX + 4; i++)
+   {
+      if (messages[i])
+         free(messages[i]);
+   }
+
+   printf("[pass] smoke: pushed=%d consumed=%d (FIFO order verified)\n",
+         pushed, consumed);
+
+#ifdef HAVE_THREADS
+   slock_free(widget.msg_queue_lock);
+#endif
+   fifo_deinitialize(&widget.msg_queue);
+   return 0;
+}
+
+int main(void) /* smoke first (deterministic), then the TSan-targeted stress */
+{
+   if (run_smoke())
+      return 1;
+#ifdef HAVE_THREADS
+   if (run_stress())
+      return 1;
+#else
+   printf("[skip] stress: HAVE_THREADS not enabled\n");
+#endif
+   printf("ALL OK\n");
+   return 0;
+}