From 677454f53957cf393baea676849f3b49b4975111 Mon Sep 17 00:00:00 2001 From: "U-DESKTOP-SPFP6AQ\\twistedtechre" Date: Wed, 29 Apr 2026 06:17:51 +0200 Subject: [PATCH 1/2] Revert "libretro-common/queues: bound fifo_write/read + reject SIZE_MAX init" This reverts commit 1a1396546e8595886f5afb01f66f57cde7a5f454. --- .../Linux-libretro-common-samples.yml | 1 - libretro-common/include/queues/fifo_queue.h | 12 - libretro-common/queues/fifo_queue.c | 57 +--- .../queues/fifo_queue_bounds_test/Makefile | 29 -- .../fifo_queue_bounds_test.c | 278 ------------------ 5 files changed, 3 insertions(+), 374 deletions(-) delete mode 100644 libretro-common/samples/queues/fifo_queue_bounds_test/Makefile delete mode 100644 libretro-common/samples/queues/fifo_queue_bounds_test/fifo_queue_bounds_test.c diff --git a/.github/workflows/Linux-libretro-common-samples.yml b/.github/workflows/Linux-libretro-common-samples.yml index 9046f1438e43..07e4fcddd06a 100644 --- a/.github/workflows/Linux-libretro-common-samples.yml +++ b/.github/workflows/Linux-libretro-common-samples.yml @@ -79,7 +79,6 @@ jobs: word_wrap_overflow_test task_queue_title_error_test tpool_wait_test - fifo_queue_bounds_test ) # Per-binary run command (overrides ./ if present). diff --git a/libretro-common/include/queues/fifo_queue.h b/libretro-common/include/queues/fifo_queue.h index bdd397420aac..4879c3e96e1d 100644 --- a/libretro-common/include/queues/fifo_queue.h +++ b/libretro-common/include/queues/fifo_queue.h @@ -126,12 +126,6 @@ static INLINE void fifo_clear(fifo_buffer_t *buffer) /** * Writes \c size bytes to the given queue. * - * \c size is silently capped at \c FIFO_WRITE_AVAIL(buffer) -- - * the call writes at most that many bytes and discards any - * excess. Callers that need to be sure all bytes are queued - * must gate on \c FIFO_WRITE_AVAIL beforehand. Behaviour is - * undefined if \c buffer is \c NULL. - * * @param buffer The FIFO queue to write to. * @param in_buf The buffer to read bytes from. * @param size The length of \c in_buf, in bytes. @@ -141,12 +135,6 @@ void fifo_write(fifo_buffer_t *buffer, const void *in_buf, size_t len); /** * Reads \c size bytes from the given queue. * - * \c size is silently capped at \c FIFO_READ_AVAIL(buffer) -- - * the call returns at most that many bytes and leaves the - * trailing portion of \c in_buf untouched. Callers that need - * exactly \c size bytes must gate on \c FIFO_READ_AVAIL - * beforehand. Behaviour is undefined if \c buffer is \c NULL. - * * @param buffer The FIFO queue to read from. * @param in_buf The buffer to store the read bytes in. * @param size The length of \c in_buf, in bytes. diff --git a/libretro-common/queues/fifo_queue.c b/libretro-common/queues/fifo_queue.c index 0810c4222650..b05435addd87 100644 --- a/libretro-common/queues/fifo_queue.c +++ b/libretro-common/queues/fifo_queue.c @@ -31,21 +31,7 @@ static bool fifo_initialize_internal(fifo_buffer_t *buf, size_t len) { - uint8_t *buffer; - - /* The ring reserves one slot to distinguish empty from full, - * so the actual allocation is (len + 1) bytes. Reject @len - * values that would wrap that addition: SIZE_MAX would - * compute (size_t)0, which calloc(1, 0) is allowed to satisfy - * with a non-NULL pointer to a zero-byte allocation. Letting - * that succeed would leave buf->size == 0 and the next - * fifo_write would divide by zero at the `% buffer->size` - * step. No current caller asks for SIZE_MAX, so the rejection - * is purely defensive. 
*/ - if (len >= SIZE_MAX) - return false; - - buffer = (uint8_t*)calloc(1, len + 1); + uint8_t *buffer = (uint8_t*)calloc(1, len + 1); if (!buffer) return false; @@ -105,31 +91,8 @@ fifo_buffer_t *fifo_new(size_t len) void fifo_write(fifo_buffer_t *buffer, const void *in_buf, size_t len) { - size_t first_write; + size_t first_write = len; size_t rest_write = 0; - size_t avail; - - /* Cap @len at the available space. Existing callers all - * gate on FIFO_WRITE_AVAIL before invoking us, so this is - * a no-op for them; for any caller that doesn't, the - * unbounded branch below would walk off the end of - * @buffer->buffer (the wrap-around copy at line `memcpy( - * buffer->buffer, ..., rest_write)` would write up to - * len - first_write bytes into a buffer of @buffer->size - * total, overrunning by len - size). Worse, the original - * `buffer->end + len > buffer->size` test wraps in size_t - * for huge @len and silently misclassifies the request as - * "fits in one chunk", taking the corrupting first memcpy - * down a path with no wrap-around bound at all. Capping - * here closes both windows. */ - avail = FIFO_WRITE_AVAIL(buffer); - if (len > avail) - len = avail; - - if (!len) - return; - - first_write = len; if (buffer->end + len > buffer->size) { @@ -146,22 +109,8 @@ void fifo_write(fifo_buffer_t *buffer, const void *in_buf, size_t len) void fifo_read(fifo_buffer_t *buffer, void *in_buf, size_t len) { - size_t first_read; + size_t first_read = len; size_t rest_read = 0; - size_t avail; - - /* Same rationale as fifo_write: cap @len at what's actually - * available to avoid out-of-buffer copies on a caller that - * forgot to gate on FIFO_READ_AVAIL. Existing callers all - * gate first; this is defensive. */ - avail = FIFO_READ_AVAIL(buffer); - if (len > avail) - len = avail; - - if (!len) - return; - - first_read = len; if (buffer->first + len > buffer->size) { diff --git a/libretro-common/samples/queues/fifo_queue_bounds_test/Makefile b/libretro-common/samples/queues/fifo_queue_bounds_test/Makefile deleted file mode 100644 index ee8146745f64..000000000000 --- a/libretro-common/samples/queues/fifo_queue_bounds_test/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -TARGET := fifo_queue_bounds_test - -LIBRETRO_COMM_DIR := ../../.. - -SOURCES := \ - fifo_queue_bounds_test.c \ - $(LIBRETRO_COMM_DIR)/queues/fifo_queue.c - -OBJS := $(SOURCES:.c=.o) - -CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 -I$(LIBRETRO_COMM_DIR)/include - -ifneq ($(SANITIZER),) - CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) - LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) -endif - -all: $(TARGET) - -%.o: %.c - $(CC) -c -o $@ $< $(CFLAGS) - -$(TARGET): $(OBJS) - $(CC) -o $@ $^ $(LDFLAGS) - -clean: - rm -f $(TARGET) $(OBJS) - -.PHONY: clean diff --git a/libretro-common/samples/queues/fifo_queue_bounds_test/fifo_queue_bounds_test.c b/libretro-common/samples/queues/fifo_queue_bounds_test/fifo_queue_bounds_test.c deleted file mode 100644 index 37c76b959b84..000000000000 --- a/libretro-common/samples/queues/fifo_queue_bounds_test/fifo_queue_bounds_test.c +++ /dev/null @@ -1,278 +0,0 @@ -/* Regression test for the fifo_queue bounds checks added in - * libretro-common/queues/fifo_queue.c. 
- *
- * Background
- * ----------
- * fifo_write / fifo_read previously trusted @len blindly: passing
- * len > FIFO_WRITE_AVAIL would walk off the end of the ring's
- * backing buffer (the wrap-around copy `memcpy(buffer->buffer,
- * src + first_write, rest_write)` writes rest_write bytes into a
- * size-byte buffer, overrunning by len - size). Worse, the
- * `buffer->end + len > buffer->size` check itself wraps in size_t
- * for huge @len, mis-routing the caller down a single-memcpy
- * branch with no wrap-around bound at all. fifo_initialize
- * accepted len == SIZE_MAX, which made `len + 1` wrap to 0, so
- * calloc(1, 0) might return a non-NULL zero-byte buffer and
- * subsequent fifo_write would `% 0` (division by zero) on the
- * end-pointer update.
- *
- * What this test asserts
- * ----------------------
- * 1. fifo_initialize rejects SIZE_MAX (no wrap to zero-byte buf).
- * 2. fifo_write caps @len at FIFO_WRITE_AVAIL: writing more than
- *    available drops the excess silently rather than overrunning
- *    the backing buffer. ASan/LSan-clean.
- * 3. fifo_read caps @len at FIFO_READ_AVAIL: reading more than
- *    available leaves the trailing portion of @in_buf untouched.
- * 4. The cap survives integer-overflow attempts on @len (very
- *    large @len that would wrap (end + len) to a small value
- *    in size_t arithmetic, which the original code mis-routed).
- * 5. Wrap-around writes/reads still work correctly when the cap
- *    isn't engaged.
- *
- * Build under -fsanitize=address,undefined to catch any future
- * regression that re-introduces the OOB write or the SIZE_MAX
- * wrap.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <queues/fifo_queue.h>
-
-static int failures = 0;
-
-#define EXPECT(cond, fmt, ...) do { \
-   if (!(cond)) { \
-      fprintf(stderr, "[FAIL] %s:%d: " fmt "\n", \
-            __func__, __LINE__, ##__VA_ARGS__); \
-      failures++; \
-   } \
-} while (0)
-
-/* Test 1: SIZE_MAX is rejected. Without the guard, len + 1
- * would wrap to 0 and the buffer would be unusable. */
-static void test_initialize_size_max(void)
-{
-   fifo_buffer_t buf;
-
-   EXPECT(!fifo_initialize(&buf, SIZE_MAX),
-         "SIZE_MAX should be rejected (would wrap len + 1)");
-   /* If it incorrectly succeeded we'd leak; we asserted failure
-    * so no buffer was allocated. */
-   printf("[PASS] initialize_size_max\n");
-}
-
-/* Test 2: Normal init still works. */
-static void test_initialize_normal(void)
-{
-   fifo_buffer_t buf;
-
-   EXPECT(fifo_initialize(&buf, 256), "normal init should succeed");
-   /* size is len + 1 (one slot reserved for empty/full) */
-   /* Available bytes for writing == len */
-   EXPECT(FIFO_WRITE_AVAIL(&buf) == 256,
-         "fresh buffer should have 256 bytes available, got %zu",
-         FIFO_WRITE_AVAIL(&buf));
-   EXPECT(FIFO_READ_AVAIL(&buf) == 0,
-         "fresh buffer should have nothing to read, got %zu",
-         FIFO_READ_AVAIL(&buf));
-   fifo_deinitialize(&buf);
-   printf("[PASS] initialize_normal\n");
-}
-
-/* Test 3: write cap. Pass more than available; the overrun
- * should be silently truncated rather than corrupting memory.
- * If ASan is enabled, an OOB write would trip it. */
-static void test_write_capped(void)
-{
-   fifo_buffer_t buf;
-   uint8_t payload[2048];
-   size_t i;
-
-   for (i = 0; i < sizeof(payload); i++)
-      payload[i] = (uint8_t)(i & 0xff);
-
-   EXPECT(fifo_initialize(&buf, 100), "init");
-   EXPECT(FIFO_WRITE_AVAIL(&buf) == 100, "100 avail");
-
-   /* Try to write 2048 into a 100-byte ring.
*/ - fifo_write(&buf, payload, sizeof(payload)); - - EXPECT(FIFO_READ_AVAIL(&buf) == 100, - "after over-write, read avail should be 100, got %zu", - FIFO_READ_AVAIL(&buf)); - EXPECT(FIFO_WRITE_AVAIL(&buf) == 0, - "after over-write, write avail should be 0, got %zu", - FIFO_WRITE_AVAIL(&buf)); - - fifo_deinitialize(&buf); - printf("[PASS] write_capped\n"); -} - -/* Test 4: read cap. Try to read more than available; the - * over-read should be capped at FIFO_READ_AVAIL and the trailing - * portion of the destination buffer should remain untouched. */ -static void test_read_capped(void) -{ - fifo_buffer_t buf; - const char *msg = "hello"; - uint8_t out[64]; - - EXPECT(fifo_initialize(&buf, 256), "init"); - fifo_write(&buf, msg, 5); - EXPECT(FIFO_READ_AVAIL(&buf) == 5, "5 readable"); - - memset(out, 0xaa, sizeof(out)); - /* Ask for more than available. */ - fifo_read(&buf, out, 20); - - EXPECT(memcmp(out, msg, 5) == 0, - "first 5 bytes should be 'hello'"); - /* The cap means only 5 were actually written into out; the - * rest stays at the 0xaa sentinel. This is the documented - * post-cap behaviour. */ - EXPECT(out[5] == 0xaa, - "byte after capped read should be untouched (was 0x%02x)", - out[5]); - EXPECT(out[19] == 0xaa, - "trailing bytes should be untouched (was 0x%02x)", - out[19]); - - EXPECT(FIFO_READ_AVAIL(&buf) == 0, - "after capped read, read avail should be 0, got %zu", - FIFO_READ_AVAIL(&buf)); - - fifo_deinitialize(&buf); - printf("[PASS] read_capped\n"); -} - -/* Test 5: huge @len that would have wrapped (end + len) in - * size_t. The original code's `buffer->end + len > buffer->size` - * misclassifies this as fitting in one chunk, taking the - * single-memcpy path with len bytes of OOB write into the ring. - * The cap reduces len to FIFO_WRITE_AVAIL before any memcpy. - * - * Note: fifo_write reads exactly @len bytes from @in_buf -- the - * cap only protects the destination ring, not the source buffer. - * Callers must always supply a source buffer of at least @len - * bytes (or now, after the cap, at least FIFO_WRITE_AVAIL bytes). - * For this test we therefore use a source buffer big enough to - * cover the post-cap copy (which will be 99 bytes here). */ -static void test_write_size_max_len(void) -{ - fifo_buffer_t buf; - uint8_t byte = 0x42; - uint8_t big_src[256]; - - memset(big_src, 0xcd, sizeof(big_src)); - - EXPECT(fifo_initialize(&buf, 100), "init"); - /* Set end != 0 so end + SIZE_MAX would wrap to a small value: */ - fifo_write(&buf, &byte, 1); - /* Now end == 1. Pass SIZE_MAX as len; without the cap, the - * (end + len) addition wraps to 0 (for end=1, len=SIZE_MAX), - * the comparison "> size" is false, and the function would - * memcpy SIZE_MAX bytes from big_src into buffer->buffer + 1 - * -- a destination overrun of essentially the entire address - * space. With the cap, len becomes FIFO_WRITE_AVAIL = 99 - * and the write completes safely. */ - fifo_write(&buf, big_src, SIZE_MAX); - - /* Buffer should be full now. */ - EXPECT(FIFO_WRITE_AVAIL(&buf) == 0, - "after SIZE_MAX write, write avail should be 0, got %zu", - FIFO_WRITE_AVAIL(&buf)); - EXPECT(FIFO_READ_AVAIL(&buf) == 100, - "after SIZE_MAX write, read avail should be 100, got %zu", - FIFO_READ_AVAIL(&buf)); - - fifo_deinitialize(&buf); - printf("[PASS] write_size_max_len\n"); -} - -/* Test 6: wrap-around writes still work when not engaging the - * cap. Write to fill, read half, write half: the wrap-around - * branch in fifo_write should produce the right contents. 
*/ -static void test_wrap_around(void) -{ - fifo_buffer_t buf; - uint8_t in[10] = {0,1,2,3,4,5,6,7,8,9}; - uint8_t out[10]; - - EXPECT(fifo_initialize(&buf, 10), "init"); /* 10 usable */ - - /* Fill it. */ - fifo_write(&buf, in, 10); - EXPECT(FIFO_WRITE_AVAIL(&buf) == 0, "full"); - - /* Drain half. */ - fifo_read(&buf, out, 5); - EXPECT(memcmp(out, in, 5) == 0, "first 5 bytes"); - EXPECT(FIFO_WRITE_AVAIL(&buf) == 5, "5 free"); - EXPECT(FIFO_READ_AVAIL(&buf) == 5, "5 used"); - - /* Write 5 more — engages the wrap-around branch. */ - { - uint8_t more[5] = {10,11,12,13,14}; - fifo_write(&buf, more, 5); - } - EXPECT(FIFO_WRITE_AVAIL(&buf) == 0, "full again"); - - /* Read all 10 — also engages wrap-around branch. */ - memset(out, 0, sizeof(out)); - fifo_read(&buf, out, 10); - EXPECT(out[0] == 5, "out[0]=5, got %u", out[0]); - EXPECT(out[4] == 9, "out[4]=9, got %u", out[4]); - EXPECT(out[5] == 10, "out[5]=10, got %u", out[5]); - EXPECT(out[9] == 14, "out[9]=14, got %u", out[9]); - - fifo_deinitialize(&buf); - printf("[PASS] wrap_around\n"); -} - -/* Test 7: zero-len write/read should be a defined no-op. */ -static void test_zero_len(void) -{ - fifo_buffer_t buf; - uint8_t byte; - - EXPECT(fifo_initialize(&buf, 32), "init"); - - /* Zero-len write on empty buffer. */ - fifo_write(&buf, &byte, 0); - EXPECT(FIFO_READ_AVAIL(&buf) == 0, "still empty"); - - /* Zero-len read on empty buffer. */ - fifo_read(&buf, &byte, 0); - EXPECT(FIFO_READ_AVAIL(&buf) == 0, "still empty"); - - /* Zero-len read on non-empty buffer. */ - fifo_write(&buf, "x", 1); - fifo_read(&buf, &byte, 0); - EXPECT(FIFO_READ_AVAIL(&buf) == 1, "still 1 byte"); - - fifo_deinitialize(&buf); - printf("[PASS] zero_len\n"); -} - -int main(void) -{ - test_initialize_size_max(); - test_initialize_normal(); - test_write_capped(); - test_read_capped(); - test_write_size_max_len(); - test_wrap_around(); - test_zero_len(); - - if (failures) - { - fprintf(stderr, "\n%d fifo_queue test(s) failed\n", failures); - return 1; - } - printf("\nAll fifo_queue bounds tests passed.\n"); - return 0; -} From 75ea1e457c1a5201e939f1eb9b06ed8bf2e69b8b Mon Sep 17 00:00:00 2001 From: "U-DESKTOP-SPFP6AQ\\twistedtechre" Date: Wed, 29 Apr 2026 07:53:34 +0200 Subject: [PATCH 2/2] libretro-common: add retro_atomic.h portable atomics primitive A header-only API exposing acquire/release atomic loads, stores and acq_rel fetch_add/fetch_sub for int and size_t, with a backend cascade that picks the best primitive each toolchain offers: 1. C11 - modern GCC/Clang/MSVC at -std=c11 2. C++11 - C++ TUs at -std=c++11 or _MSVC_LANG >= 201103L 3. GCC __atomic_* - GCC 4.7+ / Clang 3.1+ (Clang impersonates GCC 4.2 in __GNUC__, so the gate uses defined(__clang__) || version check to avoid falling through to __sync) 4. MSVC Win32 Interlocked* - VS2003+, OG Xbox, Xbox 360 XDK; on ARM/ARM64 the plain forms lack barriers (PostgreSQL hit this on Win11/ARM64 in 2025), so RMWs are bracketed with __dmb on those targets 5. Apple OSAtomic* - PPC / pre-10.7 fallback 6. GCC __sync_* - GCC 4.1-4.6 7. 
volatile fallback - last resort, single-core / x86 TSO only; emits a #warning unless suppressed Capability flags exposed to callers: HAVE_RETRO_ATOMIC always defined after include RETRO_ATOMIC_LOCK_FREE defined iff a real backend selected (NOT for the volatile fallback) RETRO_ATOMIC_BACKEND_NAME string literal, for diagnostics RETRO_ATOMIC_REQUIRE_LOCK_FREE caller-side opt-in: setting this before include turns the volatile fallback into a #error No active TU includes the header yet; it is the foundation for a future SPSC fifo primitive and consolidates the hand-rolled atomic shims currently scattered across coreaudio*.c/m, xaudio.c, mmdevice_common.c, opensl.c, and gfx_thumbnail.c. Sample: libretro-common/samples/atomic/retro_atomic_test/ Single-threaded property checks of every macro plus a 1M-iteration SPSC stress test (when HAVE_THREADS) using rthreads sthread_create. Compile-time #error checks assert that every named real backend implies RETRO_ATOMIC_LOCK_FREE and that the volatile fallback never sets it. CI: Linux-libretro-common-samples.yml - retro_atomic_test added to the native run allowlist (gcc, with the workflow's default ASan/UBSan) - new step: C++ smoke test compiled with both g++ and clang++ at -std=c++11/14/17 against the in-tree header - new step: retro_atomic_test built with clang -fsanitize=thread and run with TSAN_OPTIONS=halt_on_error=1; TSan instruments every atomic load/store and would flag a missing barrier in the SPSC stress that x86 TSO would otherwise hide - new job: retro-atomic-cross, matrix [aarch64, armv7], cross-compiles with gcc-aarch64-linux-gnu / gcc-arm-linux-gnueabihf, runs the binary under qemu-user-static, and grep-inspects the emitted asm for ldar/stlr/ldadd*_acq_rel (aarch64) or dmb/ldrex/strex (armv7); the inspect step exits 1 if no barrier mnemonics are found, which catches a silent regression to the volatile fallback Verified locally: - x86_64 native (gcc, clang) + ASan/UBSan + TSan - AArch64 cross-compile + qemu, asm shows ldar/stlr/ldadd*_acq_rel - ARMv7 cross-compile + qemu, asm shows dmb/ldrex/strex - MIPSel cross-compile + qemu, asm shows ll/sc/sync - C++11/14/17/20 native (g++ and clang++) - C++98 (g++ and clang++) correctly falls through to GCC __atomic_* - All 9 forced-backend shape tests (C11, GCC __atomic_*, __sync_*, volatile, MSVC x86/x64/ARM64 mocked, Apple 32/64 mocked) plus forced C++11 Not verified on real hardware: - MSVC ARM64 (correct by construction from MS docs and PostgreSQL precedent; awaits Windows-on-ARM CI) - Real PowerPC SMP (Wii U, Xbox 360); reasoned from devkitPPC GCC and Microsoft's Xbox 360 lockless guide - __sync_* and Apple OSAtomic backends (dead code on every current target; selection requires GCC < 4.7 or pre-10.7 Apple) --- .../Linux-libretro-common-samples.yml | 194 +++++- libretro-common/include/retro_atomic.h | 646 ++++++++++++++++++ .../samples/atomic/retro_atomic_test/Makefile | 43 ++ .../retro_atomic_test/retro_atomic_test.c | 418 ++++++++++++ 4 files changed, 1300 insertions(+), 1 deletion(-) create mode 100644 libretro-common/include/retro_atomic.h create mode 100644 libretro-common/samples/atomic/retro_atomic_test/Makefile create mode 100644 libretro-common/samples/atomic/retro_atomic_test/retro_atomic_test.c diff --git a/.github/workflows/Linux-libretro-common-samples.yml b/.github/workflows/Linux-libretro-common-samples.yml index 07e4fcddd06a..88e5989f8e4d 100644 --- a/.github/workflows/Linux-libretro-common-samples.yml +++ b/.github/workflows/Linux-libretro-common-samples.yml @@ -25,7 +25,7 @@ 
jobs:
       - name: Install dependencies
         run: |
           sudo apt-get update -y
-          sudo apt-get install -y build-essential zlib1g-dev
+          sudo apt-get install -y build-essential zlib1g-dev clang
 
       - name: Checkout
         uses: actions/checkout@v3
@@ -79,6 +79,7 @@ jobs:
             word_wrap_overflow_test
             task_queue_title_error_test
             tpool_wait_test
+            retro_atomic_test
           )
 
           # Per-binary run command (overrides ./ if present).
@@ -196,3 +197,194 @@ jobs:
           if [[ $fails -gt 0 ]]; then
             exit 1
           fi
+
+      - name: Compile-test retro_atomic.h from a C++11 TU
+        shell: bash
+        working-directory: libretro-common
+        run: |
+          # The C++11 backend in retro_atomic.h is a fresh code path that
+          # none of the C samples above exercise. Compile a tiny inline
+          # C++11 TU against the in-tree header to catch regressions like
+          # accidentally re-introducing an extern "C" wrapper around the
+          # std::atomic include, or breaking the __cplusplus / _MSVC_LANG
+          # gate. This step is build-and-run, single-threaded only -- the
+          # behavioural SPSC stress is already covered by the C test
+          # binary above on this same host, and the C++11 backend bottoms
+          # out through the same libstdc++ __atomic_* builtins.
+          set -u
+          set -o pipefail
+
+          tmpdir=$(mktemp -d)
+          cat > "$tmpdir/cxx_smoke.cpp" <<'EOF'
+          #include <cstdio>
+          #include <cstddef>
+          #include <retro_atomic.h>
+
+          #if !defined(HAVE_RETRO_ATOMIC) || !defined(RETRO_ATOMIC_LOCK_FREE)
+          # error "retro_atomic.h: capability flags not set on a C++11 host"
+          #endif
+
+          int main(void) {
+             retro_atomic_int_t  ai; retro_atomic_int_init(&ai, 0);
+             retro_atomic_size_t as; retro_atomic_size_init(&as, 0);
+
+             retro_atomic_store_release_int(&ai, 42);
+             retro_atomic_store_release_size(&as, (std::size_t)42);
+
+             int li = retro_atomic_load_acquire_int(&ai);
+             int ls = (int)retro_atomic_load_acquire_size(&as);
+
+             int pi = retro_atomic_fetch_add_int(&ai, 1);
+             int ps = (int)retro_atomic_fetch_add_size(&as, 1);
+
+             retro_atomic_inc_int(&ai);
+             retro_atomic_dec_size(&as);
+
+             int qi = retro_atomic_load_acquire_int(&ai);
+             int qs = (int)retro_atomic_load_acquire_size(&as);
+
+             std::printf("backend: %s\n", RETRO_ATOMIC_BACKEND_NAME);
+
+             bool ok = (li == 42) && (ls == 42)
+                    && (pi == 42) && (ps == 42)
+                    && (qi == 44) && (qs == 42);
+             std::puts(ok ? "ALL OK" : "FAIL");
+             return ok ? 0 : 1;
+          }
+          EOF
+
+          for cxx in g++ clang++; do
+            for std in c++11 c++14 c++17; do
+              echo "==> compile-test with $cxx -std=$std"
+              $cxx -std=$std -Wall -Wextra -pedantic -O2 \
+                -I include \
+                "$tmpdir/cxx_smoke.cpp" \
+                -o "$tmpdir/cxx_smoke" \
+                || { echo "::error title=C++ compile failed::$cxx -std=$std"; exit 1; }
+              "$tmpdir/cxx_smoke" \
+                || { echo "::error title=C++ smoke failed::$cxx -std=$std"; exit 1; }
+            done
+          done
+
+          rm -rf "$tmpdir"
+
+      - name: Run retro_atomic_test under Clang + ThreadSanitizer
+        shell: bash
+        working-directory: libretro-common/samples/atomic/retro_atomic_test
+        run: |
+          # The native samples job above runs with GCC and ASan/UBSan.
+          # Clang is the toolchain on every Apple platform, Android NDK
+          # (since r18), Emscripten, and PS4-ORBIS, so a Clang lane is
+          # not optional coverage. ThreadSanitizer is the strict
+          # validator for this test in particular: it instruments every
+          # atomic load and store and would flag a missing acquire /
+          # release barrier as a race in the 1M-iteration SPSC stress
+          # (a class of bug that x86 TSO would otherwise hide on the
+          # native runner).
+          set -u
+          set -o pipefail
+
+          make clean
+          CC=clang make all SANITIZER=thread
+
+          TSAN_OPTIONS=halt_on_error=1 ./retro_atomic_test
+
+# Cross-architecture validation lane for retro_atomic_test.
+ # + # The samples job above runs on x86_64, which is a strongly-ordered + # (TSO) architecture. retro_atomic.h's contract is that acquire-load + # / release-store / acq_rel-RMW emit real barriers on weakly-ordered + # SMP targets (ARM, AArch64, PowerPC, MIPS). An x86_64 host run + # cannot exercise that property, because TSO masks reordering bugs + # at the hardware level even when the macros emit no barriers at all. + # + # This job cross-compiles retro_atomic_test for AArch64 and ARMv7 and + # runs the binary under qemu-user-static. qemu-user emulates the + # weak memory model faithfully enough to expose missing-barrier bugs + # in the SPSC stress test, and is cheap enough to run on every push. + # + # We deliberately do NOT run the full samples sweep here -- the rest + # of the samples don't have architecture-dependent codegen that + # warrants the extra CI time. retro_atomic_test is the one that + # benefits from cross-arch coverage. + # + # Real ARM hardware still beats qemu (see e.g. PostgreSQL's 2025 + # Win11/ARM64 atomic ordering bug, found only on real silicon), + # but qemu catches most categorical errors and is much cheaper than + # provisioning ARM runners. + retro-atomic-cross: + name: Cross-arch retro_atomic_test (${{ matrix.arch }}) + runs-on: ubuntu-latest + timeout-minutes: 10 + strategy: + fail-fast: false + matrix: + include: + - arch: aarch64 + cc: aarch64-linux-gnu-gcc + apt_pkgs: gcc-aarch64-linux-gnu + qemu: qemu-aarch64-static + sysroot: /usr/aarch64-linux-gnu + - arch: armv7 + cc: arm-linux-gnueabihf-gcc + apt_pkgs: gcc-arm-linux-gnueabihf + qemu: qemu-arm-static + sysroot: /usr/arm-linux-gnueabihf + + steps: + - name: Install dependencies + run: | + sudo apt-get update -y + sudo apt-get install -y build-essential ${{ matrix.apt_pkgs }} qemu-user-static + + - name: Checkout + uses: actions/checkout@v3 + + - name: Build retro_atomic_test for ${{ matrix.arch }} + working-directory: libretro-common/samples/atomic/retro_atomic_test + run: | + set -u + set -o pipefail + make clean + CC=${{ matrix.cc }} make all + + - name: Run retro_atomic_test under qemu-user + working-directory: libretro-common/samples/atomic/retro_atomic_test + run: | + set -u + set -o pipefail + ${{ matrix.qemu }} -L ${{ matrix.sysroot }} ./retro_atomic_test + + - name: Inspect emitted atomic instructions + working-directory: libretro-common/samples/atomic/retro_atomic_test + run: | + set -u + set -o pipefail + # Spot-check the codegen. If retro_atomic.h were silently + # falling through to a no-barrier backend on this arch, the + # asm would be conspicuously missing acquire/release + # instructions. This is a cheap sanity check on top of the + # behavioural SPSC test above. + ${{ matrix.cc }} -O2 -S \ + -I../../../include -DHAVE_THREADS \ + retro_atomic_test.c -o /tmp/retro_atomic_test.s + echo + echo '== Unique barrier-emitting mnemonics ==' + case "${{ matrix.arch }}" in + aarch64) + # Expect: ldar, stlr, and __aarch64_ldadd*_acq_rel libcalls + # (or inline ldaddal LSE on +lse builds). + pattern='\b(ldar|stlr|ldax|stlx|dmb|ldadd[a-z0-9_]*|swp[a-z0-9_]*|__aarch64_(ldadd|swp)[a-z0-9_]*acq_rel)\b' + ;; + armv7) + # Expect: dmb (data memory barrier) and ldrex/strex pairs. 
+ pattern='\b(dmb|ldrex|strex|ldrexb|strexb|ldrexh|strexh)\b' + ;; + esac + mnemonics=$(grep -oE "$pattern" /tmp/retro_atomic_test.s | sort -u) + echo "$mnemonics" + if [[ -z "$mnemonics" ]]; then + echo + echo '::error title=No barrier instructions emitted::retro_atomic_test.s contains no acquire/release/barrier mnemonics for ${{ matrix.arch }}; retro_atomic.h may have fallen through to a no-barrier backend.' + exit 1 + fi diff --git a/libretro-common/include/retro_atomic.h b/libretro-common/include/retro_atomic.h new file mode 100644 index 000000000000..f8d4be78f485 --- /dev/null +++ b/libretro-common/include/retro_atomic.h @@ -0,0 +1,646 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_atomic.h). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __LIBRETRO_SDK_ATOMIC_H +#define __LIBRETRO_SDK_ATOMIC_H + +/* Minimal portable atomic operations for SPSC patterns. + * + * This header consolidates the ad-hoc atomic shims previously duplicated + * in audio/drivers/{coreaudio,coreaudio3,xaudio,opensl}.c, audio/common/ + * mmdevice_common.c and gfx/gfx_thumbnail.c. The surface is intentionally + * narrow: load, store, fetch_add, fetch_sub, plus inc/dec convenience + * wrappers. Everything is on plain machine words (int and size_t); no + * compare-exchange, no double-word ops, no thread-fences. Add only when + * a real caller needs it. + * + * Memory ordering is fixed per-operation rather than parameterised, to + * keep the call sites readable and to avoid having to invent ordering + * tags for every backend: + * + * retro_atomic_load_acquire - acquire load (pairs with release store) + * retro_atomic_store_release - release store (pairs with acquire load) + * retro_atomic_fetch_add - acq_rel RMW + * retro_atomic_fetch_sub - acq_rel RMW + * retro_atomic_inc / dec - acq_rel RMW, return void + * + * Backend selection (in order): + * 1. C11 (modern GCC/Clang/MSVC with /std:c11) + * 2. C++11 (any C++ TU with __cplusplus >= 201103L + * or _MSVC_LANG >= 201103L; the natural + * peer of C11 stdatomic for C++ callers) + * 3. GCC __atomic_* (GCC 4.7+ / Clang 3.1+) + * 4. MSVC Interlocked* (Win32) (every MSVC since VS2003, OG Xbox, + * Xbox 360 XDK, modern Win32/x64) + * 5. Mach OSAtomic* (Apple PPC / pre-10.7) + * 6. 
GCC __sync_*            (very old GCC, GCC 4.1-4.6)
 *    7. volatile fallback        (single-core, x86 TSO, or last resort)
 *
 * Capability macros (defined after backend selection):
 *
 *    HAVE_RETRO_ATOMIC       -> always 1 if the header included
 *                               successfully. Use for compile-time gating
 *                               of any code that uses the API at all.
 *    RETRO_ATOMIC_LOCK_FREE  -> 1 if a real lock-free backend was
 *                               selected (1, 2, 3, 4, 5, or 6). NOT
 *                               defined if the volatile fallback (7)
 *                               was selected. SPSC fifos and other
 *                               lock-free data structures should gate
 *                               on this.
 *
 *                               Strictly speaking, the C and C++
 *                               standards do not guarantee that
 *                               atomic_int / std::atomic<int> are
 *                               lock-free on all conforming
 *                               implementations. In practice, on
 *                               every architecture and toolchain
 *                               RetroArch supports (x86/x64, ARM,
 *                               AArch64, PowerPC, MIPS, all with
 *                               32-bit aligned word atomics), int and
 *                               size_t are always lock-free, so this
 *                               macro is defined unconditionally for
 *                               every named backend. If a future
 *                               port lands on a target where this is
 *                               not the case, this comment is the
 *                               right place to add an
 *                               ATOMIC_INT_LOCK_FREE /
 *                               std::atomic<T>::is_always_lock_free gate.
 *    RETRO_ATOMIC_BACKEND_NAME -> string literal naming the active backend
 *                               (e.g. "C11 stdatomic", "volatile fallback").
 *                               Useful for diagnostics and CI logs.
 *
 * Caller-side opt-in for stricter selection:
 *
 *    RETRO_ATOMIC_REQUIRE_LOCK_FREE
 *       If defined by the caller before including this header, an
 *       #error is raised when only the volatile fallback would be
 *       available. Use in code paths whose correctness depends on
 *       real hardware barriers (e.g. SPSC ring buffers used across
 *       SMP threads on weakly-ordered targets).
 *
 * Caller patterns
 * ---------------
 * There are three idiomatic ways to consume this header, picked
 * according to how the caller copes with no-atomics:
 *
 * Pattern 1 -- "lock-free fast path with a portable fallback"
 *    Use when you have a working alternative (mutex-based, locked,
 *    slock_t / scond_t, fifo_queue with a lock around it) that you
 *    would happily fall back to on a target without real atomics.
 *    This is the same shape as audio/drivers/coreaudio.c's
 *    RARCH_COREAUDIO_LEGACY split.
 *
 *       #include <retro_atomic.h>
 *       #if defined(RETRO_ATOMIC_LOCK_FREE)
 *       // SPSC fast path -- producer/consumer split with
 *       // load_acquire / store_release / fetch_add.
 *       static retro_atomic_size_t fill;
 *       ...
 *       #else
 *       // Locked fallback -- regular fifo_queue + slock_t.
 *       static fifo_buffer_t *fifo;
 *       static slock_t *lock;
 *       ...
 *       #endif
 *
 * Pattern 2 -- "atomics required, refuse to compile otherwise"
 *    Use when the calling code has no sensible non-atomic
 *    implementation -- the fast path *is* the only path, and a
 *    silent volatile fallback would be worse than a build break.
 *    Define RETRO_ATOMIC_REQUIRE_LOCK_FREE before the include and
 *    the header will #error out if no real backend is available.
 *
 *       #define RETRO_ATOMIC_REQUIRE_LOCK_FREE
 *       #include <retro_atomic.h>
 *
 *       // From here on, RETRO_ATOMIC_LOCK_FREE is guaranteed.
 *
 * Pattern 3 -- "atomics if useful, harmless if not"
 *    Use when the calling code is correct without atomics (e.g.
 *    relaxed counters used only for diagnostics or rate-limiting
 *    that can tolerate a torn read). Just use the macros
 *    unconditionally; the volatile fallback gives you the loosest
 *    semantics that still compiles, and that's enough.
 *
 *       #include <retro_atomic.h>
 *       // No #if needed -- counters work either way.
+ * retro_atomic_int_t debug_counter; + * retro_atomic_inc_int(&debug_counter); + * + * The choice between Pattern 1 and Pattern 2 is mostly about how + * forgiving the calling code can be: a reusable library primitive + * (fifo_spsc_t for instance) is better off with Pattern 1, because + * a dependent that doesn't care about SMP correctness on the rare + * volatile-fallback target shouldn't be forced to provide an + * alternative. Application code that hard-relies on real barriers + * to be correct is better off with Pattern 2 -- it makes the + * portability requirement loud at build time on the platform that + * needs to fix it, instead of silently miscompiling. + * + * The fallback is intentionally weak. It is correct on: + * - true single-core hardware (PSP, original NDS-class) + * - x86/x64 (TSO masks the missing release/acquire fences for naturally + * aligned word-sized loads/stores; the missing piece is a compiler + * barrier, supplied by `volatile`) + * It is NOT correct on weakly-ordered SMP without barriers (ARMv7+ SMP, + * PowerPC SMP, MIPS SMP). No RetroArch target lands in that gap today + * without also having one of the higher-priority backends available, but + * compiling there raises a #warning so it's loud. + * + * PowerPC coverage: + * - Xbox 360 XDK (MSVC + Xenon PPC) -> MSVC backend, *Acquire variants + * emit lwsync. Correct on the 3-core console. + * - libxenon Xbox 360 (xenon-gcc) -> GCC __atomic_* backend. + * - GameCube (single-core Gekko) -> GCC __atomic_* backend; SMP + * concerns moot anyway. + * - Wii (single-core Broadway) -> GCC __atomic_* backend; SMP + * concerns moot anyway. + * - Wii U (3-core Espresso) -> GCC __atomic_* backend. + * - PS3 (Cell PPU, plus SPEs the + * host code does not run on) -> GCC __atomic_* backend. + * - Apple PPC G3/G4 (single-core) -> Apple OSAtomic backend. + * - Apple PPC G5 (SMP) -> Apple OSAtomic backend. + * + * ARM / AArch64 coverage: + * - Switch / libnx (Cortex-A57 SMP) -> GCC __atomic_* backend; emits + * real ldar/stlr/ldadd*_acq_rel. Verified by aarch64-linux-gnu + * cross-compile + qemu user-mode. + * - PSVita (Cortex-A9 SMP, ARMv7) -> GCC __atomic_* backend; emits + * dmb ish around exclusive monitor pairs. Verified by qemu-arm. + * - 3DS (ARM11 ARMv6, single-core + * OldOld 3DS, dual-core New 3DS) -> GCC __atomic_* backend. + * - webOS / Miyoo / OpenPandora -> GCC __atomic_* backend. + * - Raspberry Pi / generic Linux -> GCC __atomic_* or C11 stdatomic. + * - Android (NDK Clang) -> C11 stdatomic. + * - Apple iOS / tvOS / Apple Silicon + * Mac (ARM64, multi-core SMP) -> C11 stdatomic. + * - Windows on ARM64 (MSVC) -> MSVC backend. *Acquire variants + * for load and store emit dmb per MSVC docs; fetch_add/fetch_sub + * are bracketed with explicit __dmb(_ARM64_BARRIER_ISH) since plain + * Interlocked* RMW lacks barriers on ARM64 (PostgreSQL hit this on + * Win11/ARM64 in 2025). + * + * Clang notes: + * Clang impersonates GCC 4.2 in its __GNUC__ / __GNUC_MINOR__ + * defines (a long-standing legacy compatibility setting), so a naive + * "GCC >= 4.7" gate would fall through to __sync_* on Clang even + * though Clang has supported __atomic_* since 3.1. The GCC backend + * gate above keys on `defined(__clang__) || (GCC version check)` to + * short-circuit this trap. 
+ * + * Selection on Clang in practice: + * -std=c89/c99/gnu99 -> GCC __atomic_* + * -std=c11/c17/gnu17 -> C11 stdatomic + * -std=c++98 -> GCC __atomic_* + * -std=c++11 and later -> C++11 std::atomic + * + * On AArch64, Clang and GCC emit the same family of instructions + * (ldar / stlr / ldadd*_acq_rel), so the hardware contract is + * honoured identically. Clang on Apple platforms (macOS, iOS, + * tvOS), Android NDK r18+, Emscripten, and PS4-ORBIS all flow + * through one of the gcc / C11 / C++11 paths above; the CI lane + * exercises Clang with ThreadSanitizer, which would flag any + * missing-barrier regression in the SPSC stress. + */ + +/* No external libretro-common includes are needed: the header is all + * macros and integer typedefs. Each backend block pulls in the + * platform headers it needs (, , , + * ) inside its own #if guard. */ + +/* ---- Backend detection ------------------------------------------------- */ + +/* Build-time overrides. Define one of: + * RETRO_ATOMIC_FORCE_C11 + * RETRO_ATOMIC_FORCE_CXX11 + * RETRO_ATOMIC_FORCE_GCC_NEW + * RETRO_ATOMIC_FORCE_MSVC + * RETRO_ATOMIC_FORCE_APPLE + * RETRO_ATOMIC_FORCE_SYNC + * RETRO_ATOMIC_FORCE_VOLATILE + * to bypass auto-detection. Useful for porting and for testing. */ +#if defined(RETRO_ATOMIC_FORCE_C11) +#define RETRO_ATOMIC_BACKEND_C11 1 +#elif defined(RETRO_ATOMIC_FORCE_CXX11) +#define RETRO_ATOMIC_BACKEND_CXX11 1 +#elif defined(RETRO_ATOMIC_FORCE_GCC_NEW) +#define RETRO_ATOMIC_BACKEND_GCC_NEW 1 +#elif defined(RETRO_ATOMIC_FORCE_MSVC) +#define RETRO_ATOMIC_BACKEND_MSVC 1 +#elif defined(RETRO_ATOMIC_FORCE_APPLE) +#define RETRO_ATOMIC_BACKEND_APPLE 1 +#elif defined(RETRO_ATOMIC_FORCE_SYNC) +#define RETRO_ATOMIC_BACKEND_SYNC 1 +#elif defined(RETRO_ATOMIC_FORCE_VOLATILE) +#define RETRO_ATOMIC_BACKEND_VOLATILE 1 +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && \ + !defined(__STDC_NO_ATOMICS__) +#define RETRO_ATOMIC_BACKEND_C11 1 +/* C++11 is the natural peer of C11 for any + * C++ TU that includes this header. Note: MSVC keeps __cplusplus + * pinned at 199711L unless /Zc:__cplusplus is passed; _MSVC_LANG + * carries the actual language level, so we test both. RetroArch + * builds Makefile.win and a few legacy paths with -std=c++98, so + * the gate must be exact -- defined(__cplusplus) alone is not + * enough. */ +#elif (defined(__cplusplus) && __cplusplus >= 201103L) || \ + (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) +#define RETRO_ATOMIC_BACKEND_CXX11 1 +#elif defined(__clang__) || (defined(__GNUC__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +#define RETRO_ATOMIC_BACKEND_GCC_NEW 1 +#elif defined(_MSC_VER) +#define RETRO_ATOMIC_BACKEND_MSVC 1 +#elif defined(__APPLE__) && defined(__MACH__) +/* Old Apple toolchains (PPC / pre-10.7) without modern GCC builtins. + * OSAtomic is deprecated but functional through 10.x. */ +#define RETRO_ATOMIC_BACKEND_APPLE 1 +#elif defined(__GNUC__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +#define RETRO_ATOMIC_BACKEND_SYNC 1 +#else +#define RETRO_ATOMIC_BACKEND_VOLATILE 1 +#if !defined(RETRO_ATOMIC_SUPPRESS_WARNING) +#warning "retro_atomic.h: no atomic backend matched, falling back to volatile. Safe only on single-core or x86 TSO." +#endif +#endif + +/* ---- Capability flags -------------------------------------------------- */ + +/* The header is always usable in the sense that the macros expand to + * working C; HAVE_RETRO_ATOMIC just signals that the API surface exists. 
+ * Callers that want to know whether the backend is actually lock-free + * on SMP must additionally test RETRO_ATOMIC_LOCK_FREE. */ +#define HAVE_RETRO_ATOMIC 1 + +#if defined(RETRO_ATOMIC_BACKEND_C11) +#define RETRO_ATOMIC_LOCK_FREE 1 +#define RETRO_ATOMIC_BACKEND_NAME "C11 stdatomic" +#elif defined(RETRO_ATOMIC_BACKEND_CXX11) +#define RETRO_ATOMIC_LOCK_FREE 1 +#define RETRO_ATOMIC_BACKEND_NAME "C++11 std::atomic" +#elif defined(RETRO_ATOMIC_BACKEND_GCC_NEW) +#define RETRO_ATOMIC_LOCK_FREE 1 +#define RETRO_ATOMIC_BACKEND_NAME "GCC __atomic_*" +#elif defined(RETRO_ATOMIC_BACKEND_MSVC) +#define RETRO_ATOMIC_LOCK_FREE 1 +#define RETRO_ATOMIC_BACKEND_NAME "MSVC Interlocked*" +#elif defined(RETRO_ATOMIC_BACKEND_APPLE) +#define RETRO_ATOMIC_LOCK_FREE 1 +#define RETRO_ATOMIC_BACKEND_NAME "Apple OSAtomic*" +#elif defined(RETRO_ATOMIC_BACKEND_SYNC) +#define RETRO_ATOMIC_LOCK_FREE 1 +#define RETRO_ATOMIC_BACKEND_NAME "GCC __sync_*" +#else /* RETRO_ATOMIC_BACKEND_VOLATILE */ +/* RETRO_ATOMIC_LOCK_FREE intentionally NOT defined for the volatile + * fallback; callers that gate on it will compile without the + * lock-free fast path on this target. */ +#define RETRO_ATOMIC_BACKEND_NAME "volatile fallback (best-effort)" +#if defined(RETRO_ATOMIC_REQUIRE_LOCK_FREE) +#error "retro_atomic.h: RETRO_ATOMIC_REQUIRE_LOCK_FREE was set, but only the volatile fallback is available on this target. The caller's correctness depends on hardware barriers that this backend does not provide. Either provide a real atomic backend (C11 stdatomic, C++11 std::atomic, GCC __atomic_*, MSVC Interlocked*, Apple OSAtomic*, or GCC __sync_*) or fall back to a locked implementation in the calling code." +#endif +#endif + +/* The header contains only macros and integer typedefs; there are no + * function declarations and therefore no need for RETRO_BEGIN_DECLS / + * extern "C" wrapping. In particular the C++11 backend below + * #includes , whose templates cannot be declared with C + * linkage, so the wrapper would actively break that path. */ + +/* ---- C11 ------------------------------------------------- */ +#if defined(RETRO_ATOMIC_BACKEND_C11) + +#include +#include + +typedef atomic_int retro_atomic_int_t; +typedef atomic_size_t retro_atomic_size_t; + +#define retro_atomic_int_init(p, v) atomic_init((p), (v)) +#define retro_atomic_size_init(p, v) atomic_init((p), (v)) + +#define retro_atomic_load_acquire_int(p) \ + atomic_load_explicit((p), memory_order_acquire) +#define retro_atomic_store_release_int(p, v) \ + atomic_store_explicit((p), (v), memory_order_release) +#define retro_atomic_fetch_add_int(p, v) \ + atomic_fetch_add_explicit((p), (v), memory_order_acq_rel) +#define retro_atomic_fetch_sub_int(p, v) \ + atomic_fetch_sub_explicit((p), (v), memory_order_acq_rel) + +#define retro_atomic_load_acquire_size(p) \ + atomic_load_explicit((p), memory_order_acquire) +#define retro_atomic_store_release_size(p, v) \ + atomic_store_explicit((p), (v), memory_order_release) +#define retro_atomic_fetch_add_size(p, v) \ + atomic_fetch_add_explicit((p), (v), memory_order_acq_rel) +#define retro_atomic_fetch_sub_size(p, v) \ + atomic_fetch_sub_explicit((p), (v), memory_order_acq_rel) + +/* ---- C++11 --------------------------------------------------- */ +#elif defined(RETRO_ATOMIC_BACKEND_CXX11) + +#include +#include +/* This header is included by C++ TUs in C++11+ mode (gated on + * __cplusplus >= 201103L or _MSVC_LANG >= 201103L). 
 * std::atomic_* free-function forms rather than the member-function
 * forms because they are syntactically closest to the C11 macros
 * above and keep the macro expansions identical in shape across
 * the two languages.
 *
 * The std::atomic<T> types are required by the standard to be
 * standard-layout for our integer instantiations and lock-free on
 * every RetroArch-supported target (every architecture has a
 * lock-free 32-bit and pointer-width atomic). Size equality with
 * the underlying T is not promised by the standard but holds in
 * practice on every libstdc++/libc++/MSVC STL implementation we
 * care about; we do not rely on it. */

typedef std::atomic<int>         retro_atomic_int_t;
typedef std::atomic<std::size_t> retro_atomic_size_t;

#define retro_atomic_int_init(p, v)  std::atomic_init((p), (v))
#define retro_atomic_size_init(p, v) std::atomic_init((p), (std::size_t)(v))

#define retro_atomic_load_acquire_int(p) \
   std::atomic_load_explicit((p), std::memory_order_acquire)
#define retro_atomic_store_release_int(p, v) \
   std::atomic_store_explicit((p), (v), std::memory_order_release)
#define retro_atomic_fetch_add_int(p, v) \
   std::atomic_fetch_add_explicit((p), (v), std::memory_order_acq_rel)
#define retro_atomic_fetch_sub_int(p, v) \
   std::atomic_fetch_sub_explicit((p), (v), std::memory_order_acq_rel)

#define retro_atomic_load_acquire_size(p) \
   std::atomic_load_explicit((p), std::memory_order_acquire)
#define retro_atomic_store_release_size(p, v) \
   std::atomic_store_explicit((p), (std::size_t)(v), std::memory_order_release)
#define retro_atomic_fetch_add_size(p, v) \
   std::atomic_fetch_add_explicit((p), (std::size_t)(v), std::memory_order_acq_rel)
#define retro_atomic_fetch_sub_size(p, v) \
   std::atomic_fetch_sub_explicit((p), (std::size_t)(v), std::memory_order_acq_rel)

/* ---- GCC __atomic_* (4.7+) / Clang ------------------------------------ */
#elif defined(RETRO_ATOMIC_BACKEND_GCC_NEW)

#include <stddef.h>

typedef int    retro_atomic_int_t;
typedef size_t retro_atomic_size_t;

#define retro_atomic_int_init(p, v)  (*(p) = (v))
#define retro_atomic_size_init(p, v) (*(p) = (v))

#define retro_atomic_load_acquire_int(p) \
   __atomic_load_n((p), __ATOMIC_ACQUIRE)
#define retro_atomic_store_release_int(p, v) \
   __atomic_store_n((p), (v), __ATOMIC_RELEASE)
#define retro_atomic_fetch_add_int(p, v) \
   __atomic_fetch_add((p), (v), __ATOMIC_ACQ_REL)
#define retro_atomic_fetch_sub_int(p, v) \
   __atomic_fetch_sub((p), (v), __ATOMIC_ACQ_REL)

#define retro_atomic_load_acquire_size(p) \
   __atomic_load_n((p), __ATOMIC_ACQUIRE)
#define retro_atomic_store_release_size(p, v) \
   __atomic_store_n((p), (v), __ATOMIC_RELEASE)
#define retro_atomic_fetch_add_size(p, v) \
   __atomic_fetch_add((p), (v), __ATOMIC_ACQ_REL)
#define retro_atomic_fetch_sub_size(p, v) \
   __atomic_fetch_sub((p), (v), __ATOMIC_ACQ_REL)
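
/* For orientation, the AArch64 codegen this backend is expected to emit
 * (a sketch matching the CI asm grep; exact mnemonics depend on -march
 * and on whether the compiler inlines LSE or calls outline atomics):
 *
 *    retro_atomic_load_acquire_int(p)     -> ldar  w0, [x0]
 *    retro_atomic_store_release_int(p, v) -> stlr  w1, [x0]
 *    retro_atomic_fetch_add_int(p, v)     -> ldaddal (inline LSE) or an
 *       __aarch64_ldadd4_acq_rel outline-atomics libcall
 */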
+ * + * Memory ordering is non-trivial on this backend because Microsoft's + * Win32 plain Interlocked* functions have inconsistent ordering across + * architectures: + * - x86/x64: full barrier (LOCK prefix), every form, always. + * - Itanium / Xbox 360 PowerPC: full barrier, but historically the + * docs warned to pair with __lwsync; the *Acquire / *Release + * forms (which fold the barrier in) are recommended. + * - ARM / ARM64: NO barrier on the plain forms; you must either + * use the *Acquire / *Release forms or pair the plain form with + * an explicit __dmb. + * + * To get correct semantics on every supported architecture without an + * x86 perf cost, we: + * - Use InterlockedCompareExchangeAcquire for atomic loads. + * - Use InterlockedExchange*Release* (the Release variant) for + * atomic stores. + * - Use the plain InterlockedExchangeAdd for fetch_add / fetch_sub, + * bracketed by __dmb(_ARM64_BARRIER_ISH) on ARM64 to provide the + * acq_rel ordering. On every other MSVC target the bracketing + * compiles out and the plain form's full-barrier semantics are + * used directly. + * + * The __dmb intrinsic is declared in and is available from + * VS2008 (the same release that introduced ARM as a target). Since + * MSVC ARM/ARM64 builds are themselves a VS2008+ feature, the + * include is gated on _M_ARM / _M_ARM64 and remains absent + * on the legacy x86 / Xbox 360 / Itanium paths. + */ + +#include + +#if defined(_M_ARM) || defined(_M_ARM64) +#include +#define RETRO_ATOMIC_MSVC_ARM_FENCE() __dmb(_ARM64_BARRIER_ISH) +#else +#define RETRO_ATOMIC_MSVC_ARM_FENCE() ((void)0) +#endif + +typedef volatile LONG retro_atomic_int_t; +typedef volatile LONG_PTR retro_atomic_size_t; +/* LONG_PTR is 32-bit on Win32, 64-bit on Win64 -- matches size_t width + * on every Windows ABI. */ + +#define retro_atomic_int_init(p, v) (*(p) = (LONG)(v)) +#define retro_atomic_size_init(p, v) (*(p) = (LONG_PTR)(v)) + +#define retro_atomic_load_acquire_int(p) \ + InterlockedCompareExchangeAcquire((LONG volatile*)(p), 0, 0) +#define retro_atomic_store_release_int(p, v) \ + do { \ + RETRO_ATOMIC_MSVC_ARM_FENCE(); \ + (void)InterlockedExchange((LONG volatile*)(p), (LONG)(v)); \ + } while (0) +/* fetch_add / fetch_sub: plain Interlocked* on x86/x64/Itanium/PPC + * is full-barrier; on ARM we surround with __dmb to get acq_rel. */ +#define retro_atomic_fetch_add_int(p, v) ( \ + RETRO_ATOMIC_MSVC_ARM_FENCE(), \ + InterlockedExchangeAdd((LONG volatile*)(p), (LONG)(v)) ) +#define retro_atomic_fetch_sub_int(p, v) ( \ + RETRO_ATOMIC_MSVC_ARM_FENCE(), \ + InterlockedExchangeAdd((LONG volatile*)(p), -(LONG)(v)) ) +/* Note: on ARM we'd ideally want a __dmb both before AND after the + * RMW for full sequential consistency (PostgreSQL's recent fix does + * exactly that). acq_rel needs only one barrier on most use cases; + * the C11 contract says acq_rel = release-before, acquire-after, + * which on ARMv8 is satisfied by a single dmb ish. If a caller + * needs seq_cst, they can pair this with an additional load_acquire + * on the same variable. 

#if defined(_WIN64)
#define retro_atomic_load_acquire_size(p) \
   ((size_t)InterlockedCompareExchangeAcquire64((LONGLONG volatile*)(p), 0, 0))
#define retro_atomic_store_release_size(p, v) \
   do { \
      RETRO_ATOMIC_MSVC_ARM_FENCE(); \
      (void)InterlockedExchange64((LONGLONG volatile*)(p), (LONGLONG)(v));\
   } while (0)
#define retro_atomic_fetch_add_size(p, v) ( \
   RETRO_ATOMIC_MSVC_ARM_FENCE(), \
   (size_t)InterlockedExchangeAdd64((LONGLONG volatile*)(p), (LONGLONG)(v)) )
#define retro_atomic_fetch_sub_size(p, v) ( \
   RETRO_ATOMIC_MSVC_ARM_FENCE(), \
   (size_t)InterlockedExchangeAdd64((LONGLONG volatile*)(p), -(LONGLONG)(v)) )
#else
#define retro_atomic_load_acquire_size(p) \
   ((size_t)InterlockedCompareExchangeAcquire((LONG volatile*)(p), 0, 0))
#define retro_atomic_store_release_size(p, v) \
   do { \
      RETRO_ATOMIC_MSVC_ARM_FENCE(); \
      (void)InterlockedExchange((LONG volatile*)(p), (LONG)(v)); \
   } while (0)
#define retro_atomic_fetch_add_size(p, v) ( \
   RETRO_ATOMIC_MSVC_ARM_FENCE(), \
   (size_t)InterlockedExchangeAdd((LONG volatile*)(p), (LONG)(v)) )
#define retro_atomic_fetch_sub_size(p, v) ( \
   RETRO_ATOMIC_MSVC_ARM_FENCE(), \
   (size_t)InterlockedExchangeAdd((LONG volatile*)(p), -(LONG)(v)) )
#endif

/* ---- Apple OSAtomic (deprecated but available pre-10.7) --------------- */
#elif defined(RETRO_ATOMIC_BACKEND_APPLE)

#include <libkern/OSAtomic.h>
#include <stdint.h>

typedef volatile int32_t  retro_atomic_int_t;
typedef volatile intptr_t retro_atomic_size_t;
/* OSAtomic uses int32 / int64; we pun size_t to intptr_t and assume
 * size_t == intptr_t in width. Holds on every Apple ABI. */

#define retro_atomic_int_init(p, v)  (*(p) = (v))
#define retro_atomic_size_init(p, v) (*(p) = (intptr_t)(v))

#define retro_atomic_load_acquire_int(p)     OSAtomicAdd32Barrier(0, (p))
#define retro_atomic_store_release_int(p, v) \
   do { OSMemoryBarrier(); *(p) = (v); } while (0)
#define retro_atomic_fetch_add_int(p, v) \
   (OSAtomicAdd32Barrier((v), (p)) - (v))
#define retro_atomic_fetch_sub_int(p, v) \
   (OSAtomicAdd32Barrier(-(v), (p)) + (v))

#if defined(__LP64__)
#define retro_atomic_load_acquire_size(p) \
   ((size_t)OSAtomicAdd64Barrier(0, (volatile int64_t*)(p)))
#define retro_atomic_store_release_size(p, v) \
   do { OSMemoryBarrier(); *(p) = (intptr_t)(v); } while (0)
#define retro_atomic_fetch_add_size(p, v) \
   ((size_t)(OSAtomicAdd64Barrier((int64_t)(v), (volatile int64_t*)(p)) - (int64_t)(v)))
#define retro_atomic_fetch_sub_size(p, v) \
   ((size_t)(OSAtomicAdd64Barrier(-(int64_t)(v), (volatile int64_t*)(p)) + (int64_t)(v)))
#else
#define retro_atomic_load_acquire_size(p) \
   ((size_t)OSAtomicAdd32Barrier(0, (volatile int32_t*)(p)))
#define retro_atomic_store_release_size(p, v) \
   do { OSMemoryBarrier(); *(p) = (intptr_t)(v); } while (0)
#define retro_atomic_fetch_add_size(p, v) \
   ((size_t)(OSAtomicAdd32Barrier((int32_t)(v), (volatile int32_t*)(p)) - (int32_t)(v)))
#define retro_atomic_fetch_sub_size(p, v) \
   ((size_t)(OSAtomicAdd32Barrier(-(int32_t)(v), (volatile int32_t*)(p)) + (int32_t)(v)))
#endif

/* ---- GCC __sync_* (legacy, 4.1-4.6) ----------------------------------- */
#elif defined(RETRO_ATOMIC_BACKEND_SYNC)

#include <stddef.h>

typedef volatile int    retro_atomic_int_t;
typedef volatile size_t retro_atomic_size_t;

#define retro_atomic_int_init(p, v)  (*(p) = (v))
#define retro_atomic_size_init(p, v) (*(p) = (v))

/* __sync builtins are full sequential-consistency; over-strong but correct.
+ * The "load via fetch_and_add 0" / "store via lock+swap" idioms are the + * canonical way to get an atomic load/store out of __sync. */ +#define retro_atomic_load_acquire_int(p) \ + __sync_fetch_and_add((p), 0) +#define retro_atomic_store_release_int(p, v) \ + do { __sync_synchronize(); *(p) = (v); __sync_synchronize(); } while (0) +#define retro_atomic_fetch_add_int(p, v) \ + __sync_fetch_and_add((p), (v)) +#define retro_atomic_fetch_sub_int(p, v) \ + __sync_fetch_and_sub((p), (v)) + +#define retro_atomic_load_acquire_size(p) \ + __sync_fetch_and_add((p), (size_t)0) +#define retro_atomic_store_release_size(p, v) \ + do { __sync_synchronize(); *(p) = (v); __sync_synchronize(); } while (0) +#define retro_atomic_fetch_add_size(p, v) \ + __sync_fetch_and_add((p), (v)) +#define retro_atomic_fetch_sub_size(p, v) \ + __sync_fetch_and_sub((p), (v)) + +/* ---- Volatile fallback ------------------------------------------------- */ +#else /* RETRO_ATOMIC_BACKEND_VOLATILE */ + +#include + +typedef volatile int retro_atomic_int_t; +typedef volatile size_t retro_atomic_size_t; + +#define retro_atomic_int_init(p, v) (*(p) = (v)) +#define retro_atomic_size_init(p, v) (*(p) = (v)) + +/* No barriers. Correct only on single-core or x86 TSO. */ +#define retro_atomic_load_acquire_int(p) (*(p)) +#define retro_atomic_store_release_int(p, v) do { *(p) = (v); } while (0) +#define retro_atomic_fetch_add_int(p, v) ((*(p) += (v)) - (v)) +#define retro_atomic_fetch_sub_int(p, v) ((*(p) -= (v)) + (v)) + +#define retro_atomic_load_acquire_size(p) (*(p)) +#define retro_atomic_store_release_size(p, v) do { *(p) = (v); } while (0) +#define retro_atomic_fetch_add_size(p, v) ((*(p) += (v)) - (v)) +#define retro_atomic_fetch_sub_size(p, v) ((*(p) -= (v)) + (v)) + +#endif /* backend selection */ + +/* ---- Convenience wrappers (backend-agnostic) -------------------------- */ + +#define retro_atomic_inc_int(p) ((void)retro_atomic_fetch_add_int((p), 1)) +#define retro_atomic_dec_int(p) ((void)retro_atomic_fetch_sub_int((p), 1)) +#define retro_atomic_inc_size(p) ((void)retro_atomic_fetch_add_size((p), 1)) +#define retro_atomic_dec_size(p) ((void)retro_atomic_fetch_sub_size((p), 1)) + +#endif /* __LIBRETRO_SDK_ATOMIC_H */ diff --git a/libretro-common/samples/atomic/retro_atomic_test/Makefile b/libretro-common/samples/atomic/retro_atomic_test/Makefile new file mode 100644 index 000000000000..abeda620f58d --- /dev/null +++ b/libretro-common/samples/atomic/retro_atomic_test/Makefile @@ -0,0 +1,43 @@ +TARGET := retro_atomic_test + +LIBRETRO_COMM_DIR := ../../.. + +# retro_atomic.h is a header-only primitive (no .c counterpart) so the +# test only needs rthreads.c when the SPSC stress check is enabled. +# Build with HAVE_THREADS for real coverage; without HAVE_THREADS the +# header-only checks still run, validating the macros' single-thread +# behaviour on platforms where threading is not available. +HAVE_THREADS ?= 1 + +SOURCES := retro_atomic_test.c + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 -I$(LIBRETRO_COMM_DIR)/include + +ifeq ($(HAVE_THREADS),1) + CFLAGS += -DHAVE_THREADS + SOURCES += $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + LDFLAGS += -lpthread + # rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on + # older glibc those live in -lrt. Harmless on newer glibc. 
+ LDFLAGS += -lrt
+endif
+
+OBJS := $(SOURCES:.c=.o)
+
+ifneq ($(SANITIZER),)
+ CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS)
+ LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS)
+endif
+
+all: $(TARGET)
+
+%.o: %.c
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+$(TARGET): $(OBJS)
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+clean:
+	rm -f $(TARGET) $(OBJS)
+
+.PHONY: clean
diff --git a/libretro-common/samples/atomic/retro_atomic_test/retro_atomic_test.c b/libretro-common/samples/atomic/retro_atomic_test/retro_atomic_test.c
new file mode 100644
index 000000000000..f8143a356e5a
--- /dev/null
+++ b/libretro-common/samples/atomic/retro_atomic_test/retro_atomic_test.c
@@ -0,0 +1,418 @@
+/* Regression test for libretro-common/include/retro_atomic.h.
+ *
+ * Background
+ * ----------
+ * retro_atomic.h consolidates the ad-hoc atomic shims that were
+ * previously duplicated in audio/drivers/{coreaudio,coreaudio3,xaudio,
+ * opensl}.c, audio/common/mmdevice_common.c and gfx/gfx_thumbnail.c.
+ * It exposes a narrow surface (load/store with acquire/release
+ * ordering, fetch_add, fetch_sub, plus inc/dec wrappers) on int and
+ * size_t, with seven selectable backends:
+ *
+ * 1. C11 - modern toolchains
+ * 2. C++11 - C++ TUs with __cplusplus >= 201103L
+ * 3. GCC __atomic_* - GCC 4.7+ / Clang 3.1+
+ * 4. MSVC Win32 Interlocked* - VS2003+, OG Xbox, Xbox 360 XDK
+ * 5. Apple OSAtomic* - PPC / pre-10.7
+ * 6. GCC __sync_* - GCC 4.1-4.6
+ * 7. volatile fallback - single-core / x86 TSO
+ *
+ * The header's correctness rests on each backend exposing the same
+ * sequential behaviour through the macros, so this test exercises the
+ * single-threaded behaviour exhaustively (if any backend gets it
+ * wrong, we see it) and runs an SPSC stress test under one of the
+ * threading-capable backends to exercise the release/acquire pairing.
+ *
+ * What this test asserts
+ * ----------------------
+ * 1. The capability flags HAVE_RETRO_ATOMIC, RETRO_ATOMIC_BACKEND_NAME
+ * and RETRO_ATOMIC_LOCK_FREE are defined consistently with the
+ * selected backend (compile-time #error checks; a real-backend
+ * selection must imply RETRO_ATOMIC_LOCK_FREE, and the volatile
+ * fallback must NOT define RETRO_ATOMIC_LOCK_FREE).
+ * 2. Initialisers seed the slot to the requested value.
+ * 3. store_release publishes a value visible to load_acquire on the
+ * same thread (single-thread observability).
+ * 4. fetch_add and fetch_sub return the previous value (POSIX-style)
+ * and update the storage in place.
+ * 5. inc / dec wrappers map to fetch_add(1) / fetch_sub(1).
+ * 6. SPSC stress (HAVE_THREADS only): a producer stepping the counter
+ * from 1 to N with fetch_add and then release-storing a done flag,
+ * paired with a consumer doing load_acquire on the counter and the
+ * flag, sees a monotonically non-decreasing counter sequence and a
+ * final value of exactly N. This is the property the SPSC fifo
+ * design relies on. A backend that releases without ordering would
+ * be flagged by a counter going backwards or by the consumer seeing
+ * the flag before the writes that should have preceded it.
+ * 7. The test prints which backend was selected and whether
+ * RETRO_ATOMIC_LOCK_FREE is defined, so a CI diff makes accidental
+ * backend regressions obvious.
+ *
+ * What this test does NOT assert
+ * ------------------------------
+ * It does not validate hardware ordering on weakly-ordered SMP from
+ * a single host run on x86_64 (TSO masks most reordering bugs).
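+ * (Under TSO only store-load reordering is possible, so a missing
+ * acquire/release barrier will rarely misbehave on an x86 host.)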
For
+ * the GCC backend, AArch64 / ARMv7 cross-compile + qemu user-mode
+ * has been verified locally: the test passes and the emitted asm
+ * contains real ldar/stlr instructions and ldadd*_acq_rel libcalls.
+ * The existing Switch (libnx), Wii U, PSVita, 3DS and Android CI
+ * workflows compile-test the rest of the tree on real ARM toolchains,
+ * which would catch any backend-selection regression at build time.
+ * MSVC ARM64 is the path we have not been able to validate from a
+ * Linux CI host; its correctness rests on the *Acquire / *Release
+ * Win32 forms emitting dmb (Microsoft-documented behaviour) and on
+ * the explicit __dmb brackets we add around the plain RMW path.
+ *
+ * It does not exercise compare-and-exchange or thread fences -- those
+ * are deliberately not in the API surface, since no caller in the tree
+ * needs them today. Add them (and tests) only when motivated by a
+ * real caller.
+ *
+ * How a regression is caught
+ * --------------------------
+ * Each property check returns 1 on failure; main() sums them and
+ * exits non-zero if any tripped. CI runs the binary with ASan +
+ * UBSan (the workflow's default), so any UB from torn writes or
+ * mistyped casts inside the macros is caught at the same time.
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+
+#include <retro_atomic.h>
+
+#ifdef HAVE_THREADS
+#include <rthreads/rthreads.h>
+#endif
+
+/* ---- Capability flag sanity checks (compile-time) -------------------- */
+
+/* The header must always define HAVE_RETRO_ATOMIC after a successful
+ * include. A regression that drops it (or makes it conditional) would
+ * silently break callers that gate on it -- this static check catches it. */
+#if !defined(HAVE_RETRO_ATOMIC)
+#error "retro_atomic.h was included but HAVE_RETRO_ATOMIC is not defined"
+#endif
+
+/* RETRO_ATOMIC_BACKEND_NAME is documented as always available. */
+#if !defined(RETRO_ATOMIC_BACKEND_NAME)
+#error "retro_atomic.h was included but RETRO_ATOMIC_BACKEND_NAME is not defined"
+#endif
+
+/* RETRO_ATOMIC_LOCK_FREE must be defined if and only if a real backend
+ * was selected. We can't test the disjunction directly in the
+ * preprocessor, but we can assert the obvious half: every named real
+ * backend implies RETRO_ATOMIC_LOCK_FREE.
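+ * The converse half (the volatile fallback must NOT claim lock
+ * freedom) is asserted separately just below.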
*/ +#if (defined(RETRO_ATOMIC_BACKEND_C11) \ + || defined(RETRO_ATOMIC_BACKEND_CXX11) \ + || defined(RETRO_ATOMIC_BACKEND_GCC_NEW) \ + || defined(RETRO_ATOMIC_BACKEND_MSVC) \ + || defined(RETRO_ATOMIC_BACKEND_APPLE) \ + || defined(RETRO_ATOMIC_BACKEND_SYNC)) \ + && !defined(RETRO_ATOMIC_LOCK_FREE) +#error "a real atomic backend was selected but RETRO_ATOMIC_LOCK_FREE is not defined" +#endif + +#if defined(RETRO_ATOMIC_BACKEND_VOLATILE) && defined(RETRO_ATOMIC_LOCK_FREE) +#error "the volatile fallback was selected but RETRO_ATOMIC_LOCK_FREE was set anyway" +#endif + +/* ---- Backend tag (printed once at start of run) ----------------------- */ + +static const char *backend_name(void) +{ + return RETRO_ATOMIC_BACKEND_NAME; +} + +/* ---- Single-threaded property checks --------------------------------- */ + +static int check_init(void) +{ + retro_atomic_int_t vi; + retro_atomic_size_t vs; + + retro_atomic_int_init(&vi, 7); + retro_atomic_size_init(&vs, 99); + + if (retro_atomic_load_acquire_int(&vi) != 7) + { + fprintf(stderr, "FAIL init_int: expected 7\n"); + return 1; + } + if ((size_t)retro_atomic_load_acquire_size(&vs) != 99) + { + fprintf(stderr, "FAIL init_size: expected 99\n"); + return 1; + } + return 0; +} + +static int check_store_load(void) +{ + retro_atomic_int_t vi; + retro_atomic_size_t vs; + + retro_atomic_int_init(&vi, 0); + retro_atomic_size_init(&vs, 0); + + retro_atomic_store_release_int(&vi, 42); + retro_atomic_store_release_size(&vs, (size_t)123456); + + if (retro_atomic_load_acquire_int(&vi) != 42) + { + fprintf(stderr, "FAIL store_load_int\n"); + return 1; + } + if ((size_t)retro_atomic_load_acquire_size(&vs) != 123456) + { + fprintf(stderr, "FAIL store_load_size\n"); + return 1; + } + return 0; +} + +static int check_fetch_add_returns_previous(void) +{ + retro_atomic_int_t vi; + retro_atomic_size_t vs; + int prev_i; + size_t prev_s; + + retro_atomic_int_init(&vi, 100); + retro_atomic_size_init(&vs, 1000); + + prev_i = retro_atomic_fetch_add_int(&vi, 5); + prev_s = (size_t)retro_atomic_fetch_add_size(&vs, 50); + + if (prev_i != 100) + { + fprintf(stderr, "FAIL fetch_add_int returned %d, expected 100\n", prev_i); + return 1; + } + if (prev_s != 1000) + { + fprintf(stderr, "FAIL fetch_add_size returned %zu, expected 1000\n", prev_s); + return 1; + } + if (retro_atomic_load_acquire_int(&vi) != 105) + { + fprintf(stderr, "FAIL fetch_add_int post-state\n"); + return 1; + } + if ((size_t)retro_atomic_load_acquire_size(&vs) != 1050) + { + fprintf(stderr, "FAIL fetch_add_size post-state\n"); + return 1; + } + return 0; +} + +static int check_fetch_sub_returns_previous(void) +{ + retro_atomic_int_t vi; + retro_atomic_size_t vs; + int prev_i; + size_t prev_s; + + retro_atomic_int_init(&vi, 50); + retro_atomic_size_init(&vs, 500); + + prev_i = retro_atomic_fetch_sub_int(&vi, 3); + prev_s = (size_t)retro_atomic_fetch_sub_size(&vs, 30); + + if (prev_i != 50) + { + fprintf(stderr, "FAIL fetch_sub_int returned %d, expected 50\n", prev_i); + return 1; + } + if (prev_s != 500) + { + fprintf(stderr, "FAIL fetch_sub_size returned %zu, expected 500\n", prev_s); + return 1; + } + if (retro_atomic_load_acquire_int(&vi) != 47) + { + fprintf(stderr, "FAIL fetch_sub_int post-state\n"); + return 1; + } + if ((size_t)retro_atomic_load_acquire_size(&vs) != 470) + { + fprintf(stderr, "FAIL fetch_sub_size post-state\n"); + return 1; + } + return 0; +} + +static int check_inc_dec_wrappers(void) +{ + retro_atomic_int_t vi; + retro_atomic_size_t vs; + int i; + + retro_atomic_int_init(&vi, 0); + 
retro_atomic_size_init(&vs, 0); + + for (i = 0; i < 100; i++) + retro_atomic_inc_int(&vi); + for (i = 0; i < 30; i++) + retro_atomic_dec_int(&vi); + + for (i = 0; i < 100; i++) + retro_atomic_inc_size(&vs); + for (i = 0; i < 30; i++) + retro_atomic_dec_size(&vs); + + if (retro_atomic_load_acquire_int(&vi) != 70) + { + fprintf(stderr, "FAIL inc/dec int\n"); + return 1; + } + if ((size_t)retro_atomic_load_acquire_size(&vs) != 70) + { + fprintf(stderr, "FAIL inc/dec size\n"); + return 1; + } + return 0; +} + +/* ---- SPSC stress test (HAVE_THREADS only) ---------------------------- */ + +#ifdef HAVE_THREADS + +#define SPSC_N 1000000 + +typedef struct +{ + retro_atomic_size_t counter; + retro_atomic_int_t done; + /* Filled in by the consumer; checked by main. */ + int counter_went_backwards; + int final_mismatch; + size_t final_seen; + int reader_runaway; +} spsc_state_t; + +static void spsc_writer(void *userdata) +{ + spsc_state_t *st = (spsc_state_t*)userdata; + int i; + for (i = 1; i <= SPSC_N; i++) + retro_atomic_fetch_add_size(&st->counter, 1); + /* Publish the done flag *after* the counter writes; pairs with the + * consumer's load_acquire on `done`. */ + retro_atomic_store_release_int(&st->done, 1); +} + +static void spsc_reader(void *userdata) +{ + spsc_state_t *st = (spsc_state_t*)userdata; + size_t last = 0; + int saw_done = 0; + /* Bound on iterations to keep CI from hanging if a backend is + * silently broken; SPSC_N is 1e6, the loop should converge well + * inside 1e8. */ + unsigned long long loops = 0; + + for (;;) + { + size_t cur = (size_t)retro_atomic_load_acquire_size(&st->counter); + + if (cur < last) + { + st->counter_went_backwards = 1; + return; + } + last = cur; + + if (!saw_done && retro_atomic_load_acquire_int(&st->done)) + saw_done = 1; + + if (saw_done && cur >= (size_t)SPSC_N) + break; + + if (++loops > 100000000ull) + { + st->reader_runaway = 1; + return; + } + } + + st->final_seen = last; + if (last != (size_t)SPSC_N) + st->final_mismatch = 1; +} + +static int check_spsc_stress(void) +{ + spsc_state_t st; + sthread_t *tw, *tr; + + retro_atomic_size_init(&st.counter, 0); + retro_atomic_int_init(&st.done, 0); + st.counter_went_backwards = 0; + st.final_mismatch = 0; + st.final_seen = 0; + st.reader_runaway = 0; + + tw = sthread_create(spsc_writer, &st); + tr = sthread_create(spsc_reader, &st); + if (!tw || !tr) + { + fprintf(stderr, "FAIL spsc: sthread_create returned NULL\n"); + return 1; + } + sthread_join(tw); + sthread_join(tr); + + if (st.counter_went_backwards) + { + fprintf(stderr, "FAIL spsc: counter observed going backwards\n"); + return 1; + } + if (st.reader_runaway) + { + fprintf(stderr, "FAIL spsc: reader exceeded loop bound\n"); + return 1; + } + if (st.final_mismatch) + { + fprintf(stderr, "FAIL spsc: final counter %zu != %d\n", + st.final_seen, SPSC_N); + return 1; + } + return 0; +} + +#endif /* HAVE_THREADS */ + +int main(void) +{ + int fails = 0; + + printf("retro_atomic backend: %s\n", backend_name()); +#if defined(RETRO_ATOMIC_LOCK_FREE) + printf("retro_atomic lock-free: yes\n"); +#else + printf("retro_atomic lock-free: NO (volatile fallback; SMP-unsafe)\n"); +#endif + + fails += check_init(); + fails += check_store_load(); + fails += check_fetch_add_returns_previous(); + fails += check_fetch_sub_returns_previous(); + fails += check_inc_dec_wrappers(); + +#ifdef HAVE_THREADS + fails += check_spsc_stress(); +#else + printf("[skip] SPSC stress test (HAVE_THREADS not defined)\n"); +#endif + + if (fails == 0) + { + printf("ALL OK\n"); + return 0; + 
} + printf("%d FAILURE(S)\n", fails); + return 1; +}
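
For reference, the call-site shape the header is meant to serve -- a
minimal sketch of an SPSC handoff using only the macros above; the
frame-queue names are illustrative and do not exist in the tree:

    #include <retro_atomic.h>

    static retro_atomic_size_t frames_queued;
    static retro_atomic_int_t  producer_done;

    /* Producer thread: publish one unit of work per frame, then the
     * done flag. The release store pairs with the consumer's
     * load_acquire. */
    static void producer_finish_frame(void)
    {
       retro_atomic_inc_size(&frames_queued);
    }

    static void producer_shutdown(void)
    {
       retro_atomic_store_release_int(&producer_done, 1);
    }

    /* Consumer thread: drain until the producer signals done and the
     * queue is empty. Busy-waits for brevity; a real caller would
     * block on a condition variable instead. */
    static void consumer_drain(void)
    {
       for (;;)
       {
          if ((size_t)retro_atomic_load_acquire_size(&frames_queued) > 0)
             retro_atomic_dec_size(&frames_queued); /* ... consume ... */
          else if (retro_atomic_load_acquire_int(&producer_done))
             break;
       }
    }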