diff --git a/.github/workflows/Linux-libretro-common-samples.yml b/.github/workflows/Linux-libretro-common-samples.yml index 88e5989f8e4d..378e02809197 100644 --- a/.github/workflows/Linux-libretro-common-samples.yml +++ b/.github/workflows/Linux-libretro-common-samples.yml @@ -80,6 +80,7 @@ jobs: task_queue_title_error_test tpool_wait_test retro_atomic_test + retro_spsc_test ) # Per-binary run command (overrides ./ if present). @@ -289,6 +290,31 @@ jobs: TSAN_OPTIONS=halt_on_error=1 ./retro_atomic_test + - name: Run retro_spsc_test under Clang + ThreadSanitizer + shell: bash + working-directory: libretro-common/samples/queues/retro_spsc_test + run: | + # retro_spsc.c is a lock-free SPSC byte queue built on + # retro_atomic.h. Its correctness contract is acquire-load + # / release-store on the head and tail cursors, with the + # buffer reads/writes between them ordered by those barriers. + # Missing or weakened barriers produce torn data on the + # consumer side, observable as content mismatches in the + # stress harness AND as TSan-reported races. The default + # ASan/UBSan pass above catches the content mismatches but + # not the races; this lane catches both. + # + # halt_on_error=1 makes TSan exit non-zero on the first race + # rather than continuing -- which is what we want for CI: + # any race here means the SPSC contract is broken. + set -u + set -o pipefail + + make clean + CC=clang make all SANITIZER=thread + + TSAN_OPTIONS=halt_on_error=1 ./retro_spsc_test + # Cross-architecture validation lane for retro_atomic_test. 
# # The samples job above runs on x86_64, which is a strongly-ordered diff --git a/.github/workflows/Linux-samples-gfx.yml b/.github/workflows/Linux-samples-gfx.yml index c6ab6dbb5cbc..d2b28798f6f5 100644 --- a/.github/workflows/Linux-samples-gfx.yml +++ b/.github/workflows/Linux-samples-gfx.yml @@ -205,3 +205,86 @@ jobs: test -x vulkan_ctx_double_free_test timeout 60 ./vulkan_ctx_double_free_test echo "[pass] vulkan_ctx_double_free_test" + + - name: Build and run gfx_thumbnail_status_atomic_test (TSan) + shell: bash + working-directory: samples/gfx/gfx_thumbnail_status_atomic + run: | + set -eu + # Regression test for the cross-thread thumbnail status + # synchronisation in gfx/gfx_thumbnail.c. The .status + # field is read by the video thread (gfx_thumbnail_draw) + # and written by the upload-callback thread + # (gfx_thumbnail_handle_upload), with a release-store / + # acquire-load pairing that guards the texture / width / + # height fields published alongside the AVAILABLE + # status. Pre-port the file used a hand-rolled + # __atomic_* / _Interlocked / volatile shim. Post-port + # the .status field is retro_atomic_int_t and the + # accesses route through retro_atomic_*_int macros. + # + # ThreadSanitizer is the right validator for this test: + # it instruments every atomic load and store and would + # flag a missing-barrier regression as a data race, even + # on x86 TSO where the hardware otherwise hides the bug. + # ASan/UBSan would not catch barrier removal at all + # (no out-of-bounds access, no UB). The test also + # carries static-assertion checks that validate the + # struct layout invariant (.status size == sizeof(int)) + # so the field's atomic type does not change the + # gfx_thumbnail_t ABI. If gfx_thumbnail.c amends + # GFX_THUMB_STATUS_STORE / GFX_THUMB_STATUS_LOAD or the + # .status field type, the verbatim copies in + # gfx_thumbnail_status_atomic_test.c must follow. 
+ make clean all SANITIZER=thread + test -x gfx_thumbnail_status_atomic_test + TSAN_OPTIONS=halt_on_error=1 timeout 60 \ + ./gfx_thumbnail_status_atomic_test + echo "[pass] gfx_thumbnail_status_atomic_test" + + - name: Build and run gfx_widgets_msg_queue_race_test (TSan) + shell: bash + working-directory: samples/gfx/gfx_widgets_msg_queue_race + run: | + set -eu + # Regression test for the producer-producer race fix on + # dispgfx_widget_t::msg_queue in gfx/gfx_widgets.c. + # Pre-fix the FIFO had no dedicated lock: producers could + # be invoked from any thread (the threaded task system at + # libretro-common/queues/task_queue.c runs a worker + # thread; gfx/video_driver.c::video_driver_frame released + # RUNLOOP_MSG_QUEUE_LOCK before calling the producer, so + # main-thread producers ran unlocked) yet there was no + # lock dedicated to msg_queue itself. Post-fix a + # msg_queue_lock is acquired around every fifo_write (in + # gfx_widgets_msg_queue_push), every fifo_read (in + # gfx_widgets_iterate), and the avail-check that gates + # them. The outer fast-path FIFO_*_AVAIL checks that + # used to wrap the function bodies were dropped: reading + # the FIFO cursors outside the lock is itself a data + # race against the locked writes (TSan-detectable; + # benign on x86 TSO but real on weak-memory hardware). + # + # ThreadSanitizer is the right validator: it instruments + # every memory access on fifo_buffer_t::end and + # ::first and would flag a missing-lock regression as a + # data race on the producer side, the consumer side, or + # the avail check. ASan/UBSan would not catch the lock + # removal at all. halt_on_error=1 makes TSan exit + # non-zero on the first observed race, which is what we + # want for CI: any race here means the production + # locking discipline is broken. 
+ # + # The test mirrors the post-fix locking protocol of + # gfx_widgets_msg_queue_push and gfx_widgets_iterate's + # FIFO interaction; the widget allocation, font + # measurement, and animation push that wrap the actual + # FIFO calls in production are not part of the race and + # are stripped from the test. If gfx_widgets.c amends + # the locking around msg_queue, the verbatim copies in + # gfx_widgets_msg_queue_race_test.c must follow. + make clean all SANITIZER=thread + test -x gfx_widgets_msg_queue_race_test + TSAN_OPTIONS=halt_on_error=1 timeout 60 \ + ./gfx_widgets_msg_queue_race_test + echo "[pass] gfx_widgets_msg_queue_race_test" diff --git a/Makefile.common b/Makefile.common index a7cc46b1721b..291741557a16 100644 --- a/Makefile.common +++ b/Makefile.common @@ -364,6 +364,7 @@ OBJ += \ input/input_autodetect_builtin.o \ input/input_keymaps.o \ $(LIBRETRO_COMM_DIR)/queues/fifo_queue.o \ + $(LIBRETRO_COMM_DIR)/queues/retro_spsc.o \ $(LIBRETRO_COMM_DIR)/compat/compat_fnmatch.o \ $(LIBRETRO_COMM_DIR)/compat/compat_posix_string.o diff --git a/audio/drivers/asio.c b/audio/drivers/asio.c index 0b7fa9a39bcc..73652da99290 100644 --- a/audio/drivers/asio.c +++ b/audio/drivers/asio.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #ifdef HAVE_THREADS #include @@ -644,7 +644,26 @@ static void *asio_load_driver(const CLSID *clsid) typedef struct ra_asio { void *iasio; /* COM interface pointer */ - fifo_buffer_t *ring; /* Ring buffer between write() and callback */ + /* Lock-free SPSC ring buffer between asio_write (producer, main + * thread) and asio_cb_buffer_switch (consumer, ASIO callback + * thread). Pre-port this used fifo_buffer_t with no surrounding + * lock, which was a real cross-thread race on first/end -- see + * commit message. 
retro_spsc_t is the SPSC primitive designed + * for this exact pattern: lock-free (avoiding the priority- + * inversion concern of locking from a real-time audio callback), + * acquire/release ordering on the cursors, single-producer / + * single-consumer enforced by the type contract. + * + * Embedded by value (not via pointer) so the lifetime exactly + * tracks ra_asio_t. Initialised with retro_spsc_init in + * ra_asio_init / ra_asio_init_via_persistent and freed with + * retro_spsc_free in the corresponding teardown paths. The + * `ring_initialized` flag below distinguishes "never-initialised" + * from "init succeeded" so cleanup paths know whether to call + * retro_spsc_free. retro_spsc_t doesn't carry that bit + * internally because most callers know their own lifecycle. */ + retro_spsc_t ring; + bool ring_initialized; #ifdef HAVE_THREADS scond_t *cond; slock_t *cond_lock; @@ -712,7 +731,11 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, long i; void *buf_l = ad->buf_info[0].buffers[index]; void *buf_r = ad->buf_info[1].buffers[index]; - size_t avail = FIFO_READ_AVAIL(ad->ring); + /* Acquire-load on the producer's head cursor. Pairs with the + * release-store inside retro_spsc_write that asio_write + * issues on the main thread, so the bytes we're about to read + * out via retro_spsc_read are guaranteed visible. 
*/ + size_t avail = retro_spsc_read_avail(&ad->ring); long have = (long)(avail / (2 * sizeof(float))); if (have > frames) @@ -727,7 +750,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, float tmp[2]; for (i = 0; i < have; i++) { - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); dl[i] = tmp[0]; dr[i] = tmp[1]; } @@ -742,7 +765,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, float tmp[2]; for (i = 0; i < have; i++) { - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); dl[i] = (double)tmp[0]; dr[i] = (double)tmp[1]; } @@ -757,7 +780,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, float tmp[2]; for (i = 0; i < have; i++) { - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); dl[i] = (int32_t)((double)tmp[0] * 2147483647.0); dr[i] = (int32_t)((double)tmp[1] * 2147483647.0); } @@ -773,7 +796,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, for (i = 0; i < have; i++) { int32_t l, r; - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); l = (int32_t)(tmp[0] * 8388607.0f); r = (int32_t)(tmp[1] * 8388607.0f); l = l > 8388607 ? 8388607 : (l < -8388608 ? -8388608 : l); @@ -797,7 +820,7 @@ static void asio_deinterleave_to_buffers(ra_asio_t *ad, for (i = 0; i < have; i++) { int32_t l, r; - fifo_read(ad->ring, tmp, sizeof(tmp)); + retro_spsc_read(&ad->ring, tmp, sizeof(tmp)); l = (int32_t)(tmp[0] * 32767.0f); r = (int32_t)(tmp[1] * 32767.0f); dl[i] = (int16_t)(l > 32767 ? 32767 : (l < -32768 ? 
-32768 : l)); @@ -826,7 +849,7 @@ static void asio_cb_buffer_switch(long index, { ra_asio_t *ad = g_asio; - if (!ad || !ad->ring || ad->is_paused || ad->shutdown) + if (!ad || !ad->ring_initialized || ad->is_paused || ad->shutdown) { if (ad && ad->buf_info[0].buffers[index]) { @@ -924,8 +947,11 @@ static void asio_atexit_cleanup(void) ASIO_CALL_RELEASE(ad->iasio); } - if (ad->ring) - fifo_free(ad->ring); + if (ad->ring_initialized) + { + retro_spsc_free(&ad->ring); + ad->ring_initialized = false; + } #ifdef HAVE_THREADS if (ad->cond_lock) @@ -991,7 +1017,11 @@ static void *ra_asio_init(const char *device, unsigned rate, if (new_rate) *new_rate = ad->sample_rate; - fifo_clear(ad->ring); + /* Discard any stale audio left over from the previous + * session. Safe here because the ASIO callback isn't + * running yet (g_asio is still NULL until the next line), + * so the SPSC is single-threaded at this point. */ + retro_spsc_clear(&ad->ring); g_asio = ad; ad->running = true; @@ -1158,14 +1188,18 @@ static void *ra_asio_init(const char *device, unsigned rate, /* Create ring buffer BEFORE ASIO buffers — the driver may issue * a bufferSwitch callback during ASIOCreateBuffers, and the - * callback needs the ring buffer to exist (even if empty). */ + * callback needs the ring buffer to exist (even if empty). + * retro_spsc_init rounds capacity up to a power of 2; the + * over-allocation is small (factor of < 2) and irrelevant to + * the ASIO latency calculation, which uses ad->buffer_frames + * not the ring's actual byte capacity. 
*/ ad->ring_size = pref_sz * 2 * sizeof(float) * ASIO_RING_MULT; - ad->ring = fifo_new(ad->ring_size); - if (!ad->ring) + if (!retro_spsc_init(&ad->ring, ad->ring_size)) { RARCH_ERR("[ASIO] Failed to create ring buffer.\n"); goto error; } + ad->ring_initialized = true; #ifdef HAVE_THREADS ad->cond = scond_new(); @@ -1229,8 +1263,11 @@ static void *ra_asio_init(const char *device, unsigned rate, ASIO_CALL_DISPOSE_BUFFERS(ad->iasio); ASIO_CALL_RELEASE(ad->iasio); } - if (ad->ring) - fifo_free(ad->ring); + if (ad->ring_initialized) + { + retro_spsc_free(&ad->ring); + ad->ring_initialized = false; + } #ifdef HAVE_THREADS if (ad->cond_lock) slock_free(ad->cond_lock); @@ -1259,17 +1296,22 @@ static ssize_t ra_asio_write(void *data, const void *buf, size_t len) if (ad->shutdown) return -1; - avail = FIFO_WRITE_AVAIL(ad->ring); + avail = retro_spsc_write_avail(&ad->ring); to_write = (len < avail) ? len : avail; /* Align to frame boundary (stereo float = 8 bytes) */ to_write = (to_write / 8) * 8; if (to_write > 0) { - fifo_write(ad->ring, src, to_write); - src += to_write; - len -= to_write; - written += to_write; + /* retro_spsc_write returns bytes actually written. We've + * already capped to_write by retro_spsc_write_avail above, + * so the return value will equal to_write -- but use it + * defensively in case the contract ever changes. */ + size_t actually_written = + retro_spsc_write(&ad->ring, src, to_write); + src += actually_written; + len -= actually_written; + written += actually_written; } else if (!ad->nonblock) { @@ -1346,9 +1388,14 @@ static void ra_asio_free(void *data) /* Detach from the callback — silence output while parked */ g_asio = NULL; - /* Flush stale audio */ - if (ad->ring) - fifo_clear(ad->ring); + /* No retro_spsc_clear here. The pre-port fifo_clear at this + * site was racy with stray ASIO callbacks that may still be + * running after ASIO_CALL_STOP returns (some drivers, notably + * ASIO4ALL, don't synchronously join their audio thread). 
+ * The restart path in ra_asio_init_via_persistent calls + * retro_spsc_clear anyway, so any stale data will be flushed + * before the next run -- after the callback is provably + * stopped (g_asio == NULL gates it). */ /* Store for reuse */ g_asio_persistent = ad; @@ -1361,9 +1408,9 @@ static bool ra_asio_use_float(void *data) { return true; } static size_t ra_asio_write_avail(void *data) { ra_asio_t *ad = (ra_asio_t *)data; - if (!ad || !ad->ring) + if (!ad || !ad->ring_initialized) return 0; - return FIFO_WRITE_AVAIL(ad->ring); + return retro_spsc_write_avail(&ad->ring); } static size_t ra_asio_buffer_size(void *data) diff --git a/audio/drivers/coreaudio_mic_macos.m b/audio/drivers/coreaudio_mic_macos.m index 77270a9348a6..e7cc7dccb277 100644 --- a/audio/drivers/coreaudio_mic_macos.m +++ b/audio/drivers/coreaudio_mic_macos.m @@ -333,7 +333,6 @@ void coreaudio_macos_microphone_free(void *data) static struct string_list *coreaudio_macos_microphone_device_list_new(const void *data) { - RARCH_LOG("[CoreAudio macOS Mic] device_list_new called.\n"); struct string_list *list = string_list_new(); if (!list) { @@ -462,8 +461,6 @@ void coreaudio_macos_microphone_free(void *data) static void coreaudio_macos_microphone_device_list_free(const void *data, struct string_list *list) { - (void)data; /* Not used in this implementation */ - RARCH_LOG("[CoreAudio macOS Mic] device_list_free called.\n"); if (list) { for (size_t i = 0; i < list->size; i++) @@ -818,7 +815,6 @@ static bool coreaudio_macos_microphone_mic_alive(const void *data, const void *m static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) { - RARCH_LOG("[CoreAudio macOS Mic] start_mic called. 
Mic context: %p\n", mic_data); coreaudio_macos_microphone_t *mic = (coreaudio_macos_microphone_t *)mic_data; if (!mic || !mic->audio_unit || !atomic_load(&mic->is_initialized)) { @@ -827,10 +823,7 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) } if (atomic_load(&mic->is_running)) - { - RARCH_LOG("[CoreAudio macOS Mic] Already running.\n"); return true; - } /* Check microphone permission on macOS */ RARCH_LOG("[CoreAudio macOS Mic] Checking microphone permission...\n"); @@ -849,7 +842,7 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) if (perm_status != noErr || default_input_device == kAudioObjectUnknown) { RARCH_ERR("[CoreAudio macOS Mic] No default input device available or permission denied. Status: %d, Device ID: %u\n", - (int)perm_status, (unsigned)default_input_device); + (int)perm_status, (unsigned)default_input_device); } else { @@ -858,8 +851,8 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) if (!mic->fifo) { - RARCH_ERR("[CoreAudio macOS Mic] No FIFO buffer available\n"); - return false; + RARCH_ERR("[CoreAudio macOS Mic] No FIFO buffer available\n"); + return false; } slock_lock(mic->fifo_lock); fifo_clear(mic->fifo); @@ -872,26 +865,20 @@ static bool coreaudio_macos_microphone_start_mic(void *data, void *mic_data) RARCH_LOG("[CoreAudio macOS Mic] Microphone started successfully\n"); return true; } - else - { - RARCH_ERR("[CoreAudio macOS Mic] Failed to start AudioUnit: %d (0x%x)\n", (int)status, (unsigned)status); - atomic_store(&mic->is_running, false); - return false; - } + + RARCH_ERR("[CoreAudio macOS Mic] Failed to start AudioUnit: %d (0x%x)\n", (int)status, (unsigned)status); + atomic_store(&mic->is_running, false); + return false; } static bool coreaudio_macos_microphone_stop_mic(void *data, void *mic_data) { coreaudio_macos_microphone_t *mic = (coreaudio_macos_microphone_t *)mic_data; if (!mic || !mic->audio_unit) - { return true; /* Considered 
stopped if not valid */ - } if (!atomic_load(&mic->is_running)) - { return true; - } OSStatus status = AudioOutputUnitStop(mic->audio_unit); if (status == noErr) @@ -905,21 +892,17 @@ static bool coreaudio_macos_microphone_stop_mic(void *data, void *mic_data) } return true; } - else - { - RARCH_ERR("[CoreAudio macOS Mic] Failed to stop AudioUnit: %d\n", (int)status); - /* Even if stop fails, we mark as not running from our perspective */ - atomic_store(&mic->is_running, false); - return false; - } + + RARCH_ERR("[CoreAudio macOS Mic] Failed to stop AudioUnit: %d\n", (int)status); + /* Even if stop fails, we mark as not running from our perspective */ + atomic_store(&mic->is_running, false); + return false; } static bool coreaudio_macos_microphone_mic_use_float(const void *data, const void *mic_data) { - (void)data; coreaudio_macos_microphone_t *mic = (coreaudio_macos_microphone_t *)mic_data; - bool result = mic && mic->use_float; - return result; + return mic && mic->use_float; } diff --git a/frontend/drivers/platform_darwin.m b/frontend/drivers/platform_darwin.m index 41b2c9608c90..0cb1dead5a4a 100644 --- a/frontend/drivers/platform_darwin.m +++ b/frontend/drivers/platform_darwin.m @@ -27,6 +27,7 @@ #ifdef HAVE_GCD #include #include +#include #endif #include @@ -144,7 +145,7 @@ * any already-dispatched event handler invocation keeps * running to completion on its target queue, which is the * global concurrent queue here. The event handler - * dereferences &watch_data->has_changes, so we cannot free + * dereferences &watch_data->event_count, so we cannot free * watch_data until we're sure no in-flight handler remains. 
* The cancel handler fires once all handler invocations * have drained, so waiting on this semaphore before free() @@ -159,7 +160,27 @@ dispatch_queue_t queue; /* Dispatch queue for event handlers */ darwin_watch_entry_t *watches; /* Array of watch entries */ size_t watch_count; /* Number of active watches */ - volatile int32_t has_changes; /* Atomic flag indicating changes occurred */ + /* Monotonic event counter. Setter (GCD event handler thread) + * increments via retro_atomic_fetch_add_int when the + * filesystem reports an event; reader (main thread, in + * frontend_darwin_check_for_path_changes) acquire-loads it + * and compares against last_seen below. + * + * The previous design used a binary flag set/cleared via + * OSAtomicCompareAndSwap32, but Apple deprecated OSAtomic.h + * in 10.12 (2016) in favour of . Replacing the + * flag with a counter is also strictly more flexible: the + * "did anything change?" semantic is preserved exactly + * (now != last_seen), and the count is available if a future + * caller wants to batch events. retro_atomic.h provides the + * portable fetch_add / load_acquire primitives needed; no + * compare-exchange operation is necessary, because the + * monotonic counter never needs to be read-and-cleared + * atomically (last_seen is main-thread-only state). */ + retro_atomic_int_t event_count; + /* Reader-side cursor. Touched only by the main thread in + * frontend_darwin_check_for_path_changes; not atomic. */ + int last_seen; int flags; /* Event flags to monitor */ } darwin_watch_data_t; #endif @@ -929,8 +950,8 @@ static void darwin_watch_data_free(darwin_watch_data_t *watch_data) * handler to fire. dispatch_source_cancel is * asynchronous - it only marks the source as * cancelled. Any already-dispatched event handler - * invocation (the one that reads - * &watch_data->has_changes) keeps running to + * invocation (the one that increments + * &watch_data->event_count) keeps running to * completion on the global concurrent queue. 
The * cancel handler is guaranteed to fire AFTER all * pending event handler invocations have drained, @@ -938,7 +959,7 @@ static void darwin_watch_data_free(darwin_watch_data_t *watch_data) * is the standard 'wait until source is fully * quiesced' pattern. Without this wait a racing * event handler would NULL-deref or, worse, write - into freed memory for has_changes. */ + into freed memory for event_count. */ dispatch_source_cancel(watch_data->watches[i].source); if (watch_data->watches[i].cancel_sem) { @@ -989,7 +1010,12 @@ static void frontend_darwin_watch_path_for_changes( watch_data->watches = (darwin_watch_entry_t*)calloc( list->size, sizeof(darwin_watch_entry_t)); watch_data->flags = flags; - watch_data->has_changes = 0; + /* watch_data was calloc'd, so event_count and last_seen are + * already all-bits-zero; but plain assignment to a + * retro_atomic_int_t is illegal under the C11 stdatomic + * backend, so use the init helper for the atomic field. */ + retro_atomic_int_init(&watch_data->event_count, 0); + watch_data->last_seen = 0; if (!watch_data->watches) { @@ -1052,12 +1078,25 @@ static void frontend_darwin_watch_path_for_changes( if (source) { - /* Set up event handler - sets atomic flag when changes - * occur. This block captures watch_data by pointer and - * the cancel-handler synchronisation below is what keeps - * the capture safe against the teardown path. */ + /* Set up event handler - bump the atomic event + * counter when a filesystem event fires. This + * block captures watch_data by pointer and the + * cancel-handler synchronisation below is what + * keeps the capture safe against the teardown + * path. + * + * fetch_add (rather than the previous CAS(0, 1)) + * always stores, but the rate is bounded by GCD + * vnode events (at most a few events per day in + * normal use), so the extra store cost is below + * measurement noise. 
In return we get a + * monotonic counter that the reader can compare + * against its last_seen cursor without losing + * notifications, and we drop the dependency on + * OSAtomic.h which Apple deprecated in 10.12. */ dispatch_source_set_event_handler(source, ^{ - OSAtomicCompareAndSwap32(0, 1, &watch_data->has_changes); + retro_atomic_fetch_add_int( + &watch_data->event_count, 1); }); /* Cancel handler signals cancel_sem. darwin_watch_ @@ -1122,8 +1161,30 @@ static bool frontend_darwin_check_for_path_changes( watch_data = (darwin_watch_data_t*)(change_data->data); - /* Atomically read and clear the flag */ - return OSAtomicCompareAndSwap32(1, 0, &watch_data->has_changes); + /* Acquire-load the producer's counter and compare against + * our reader-side cursor. If they differ, at least one + * event fired since the last call -- update the cursor and + * return true. + * + * No CAS is needed because last_seen is main-thread-only + * state. The acquire-load pairs with the producer's + * fetch_add (which has acq_rel semantics in retro_atomic.h), + * so any data the producer published before its increment + * is visible to us by the time we read here. + * + * The counter is monotonic and never reset, so over a long + * enough run it would wrap around -- but on a 32-bit signed + * int at filesystem-event rates this takes hundreds of + * years. The `now != last_seen` comparison remains correct + * across wraparound under modular int arithmetic; any non- + * zero (now - last_seen) means the producer advanced. 
*/ + { + int now = retro_atomic_load_acquire_int( + &watch_data->event_count); + bool changed = (now != watch_data->last_seen); + watch_data->last_seen = now; + return changed; + } } #endif diff --git a/gfx/gfx_thumbnail.c b/gfx/gfx_thumbnail.c index 4703b959393e..624a7238652c 100644 --- a/gfx/gfx_thumbnail.c +++ b/gfx/gfx_thumbnail.c @@ -43,41 +43,60 @@ #define DEFAULT_GFX_THUMBNAIL_STREAM_DELAY 16.66667f * 3 #define DEFAULT_GFX_THUMBNAIL_FADE_DURATION 166.66667f -/* Helpers for cross-thread thumbnail status synchronisation. +/* The thumbnail .status field is atomically-typed (see the + * gfx_thumbnail_t comment in gfx_thumbnail.h). Reads and writes + * therefore must go through the retro_atomic API rather than + * plain assignment / dereference; on the C11 stdatomic and C++11 + * std::atomic backends, plain access to such a field is illegal. * - * The main thread writes texture/width/height then publishes - * status = AVAILABLE. The video thread reads status; if it - * sees AVAILABLE, the data fields must be visible. + * The two cross-thread sites in this file are noted at the + * call sites: + * - The release-stores in gfx_thumbnail_handle_upload publish + * prior texture / width / height writes. + * - The acquire-load in gfx_thumbnail_draw pairs with those. + * All other accesses are single-threaded; they go through the + * retro_atomic API only because the field's atomic-typed + * storage requires it. The cost on weak-memory ARM/PowerPC is + * one extra ldar/stlr per cold-path access; on x86 TSO the + * barriers compile out entirely. * - * On GCC/Clang (covers ARM64, x86, PowerPC, MIPS): - * release-store / acquire-load give the required ordering. - * On MSVC (x86/x64 only in RetroArch's target matrix): - * x86 total-store-order makes plain volatile sufficient, - * but _InterlockedOr gives an explicit acquire load. - * Fallback: - * Plain volatile read/write — sufficient on single-core - * and x86. 
Platforms with weak ordering that lack GCC - * builtins do not run threaded video in practice. - */ -#if defined(__clang__) || (defined(__GNUC__) && \ - ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) -#define GFX_THUMB_STATUS_STORE(ptr, val) \ - __atomic_store_n((int*)(ptr), (int)(val), __ATOMIC_RELEASE) -#define GFX_THUMB_STATUS_LOAD(ptr) \ - ((enum gfx_thumbnail_status)__atomic_load_n((int*)(ptr), __ATOMIC_ACQUIRE)) -#elif defined(_MSC_VER) -#include -#define GFX_THUMB_STATUS_STORE(ptr, val) \ - _InterlockedExchange((volatile long*)(ptr), (long)(val)) -#define GFX_THUMB_STATUS_LOAD(ptr) \ - ((enum gfx_thumbnail_status)_InterlockedOr((volatile long*)(ptr), 0)) + * The wrappers are static-inline (rather than function-like + * macros) so callers have a clear function boundary. GCC 13+ + * at -O3 emits a spurious -Wstringop-overflow on the inlined + * __atomic_* primitive when called from gfx_thumbnail_request, + * because the optimiser cannot prove the @c thumbnail argument + * non-NULL across every `goto end:` flow-graph path; the + * suppression below is a targeted fix that does not affect + * codegen. No effect on non-GCC backends. 
*/ + +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12 +# define GFX_THUMB_STATUS_DIAG_PUSH \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wstringop-overflow\"") +# define GFX_THUMB_STATUS_DIAG_POP \ + _Pragma("GCC diagnostic pop") #else -#define GFX_THUMB_STATUS_STORE(ptr, val) \ - do { (*(volatile int*)(ptr)) = (int)(val); } while(0) -#define GFX_THUMB_STATUS_LOAD(ptr) \ - ((enum gfx_thumbnail_status)(*(volatile int*)(ptr))) +# define GFX_THUMB_STATUS_DIAG_PUSH +# define GFX_THUMB_STATUS_DIAG_POP #endif +GFX_THUMB_STATUS_DIAG_PUSH +static INLINE void gfx_thumb_status_store( + retro_atomic_int_t *ptr, enum gfx_thumbnail_status val) +{ + retro_atomic_store_release_int(ptr, (int)val); +} + +static INLINE enum gfx_thumbnail_status gfx_thumb_status_load( + retro_atomic_int_t *ptr) +{ + return (enum gfx_thumbnail_status)retro_atomic_load_acquire_int(ptr); +} +GFX_THUMB_STATUS_DIAG_POP + +#define GFX_THUMB_STATUS_STORE(ptr, val) gfx_thumb_status_store((ptr), (val)) +#define GFX_THUMB_STATUS_LOAD(ptr) gfx_thumb_status_load((ptr)) + /* Utility structure, sent as userdata when pushing * an image load */ typedef struct @@ -152,9 +171,9 @@ static void gfx_thumbnail_init_fade( /* A 'fade in' animation is triggered if: * - Thumbnail is available * - Thumbnail is missing and 'fade_missing' is enabled */ - if ( (thumbnail->status == GFX_THUMBNAIL_STATUS_AVAILABLE) + if ( (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_AVAILABLE) || (p_gfx_thumb->fade_missing - && (thumbnail->status == GFX_THUMBNAIL_STATUS_MISSING))) + && (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_MISSING))) { if (p_gfx_thumb->fade_duration > 0.0f) { @@ -198,7 +217,7 @@ static void gfx_thumbnail_handle_upload( goto end; /* Only process image if we are waiting for it */ - if (thumbnail_tag->thumbnail->status != GFX_THUMBNAIL_STATUS_PENDING) + if (GFX_THUMB_STATUS_LOAD(&thumbnail_tag->thumbnail->status) != 
GFX_THUMBNAIL_STATUS_PENDING) goto end; /* Sanity check: if thumbnail already has a texture, @@ -346,7 +365,7 @@ void gfx_thumbnail_request( /* Reset thumbnail, then set 'missing' status by default * (saves a number of checks later) */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); /* Update/extract thumbnail path */ if (gfx_thumbnail_is_enabled(path_data, thumbnail_id)) @@ -375,7 +394,7 @@ void gfx_thumbnail_request( thumbnail_path, (video_driver_get_disp_flags() & VIDEO_FLAG_USE_RGBA), gfx_thumbnail_upscale_threshold, gfx_thumbnail_handle_upload, thumbnail_tag)) - thumbnail->status = GFX_THUMBNAIL_STATUS_PENDING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_PENDING); } #ifdef HAVE_NETWORKING /* Handle on demand thumbnail downloads */ @@ -433,7 +452,7 @@ void gfx_thumbnail_request( end: /* Trigger 'fade in' animation, if required */ - if (thumbnail->status != GFX_THUMBNAIL_STATUS_PENDING) + if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) != GFX_THUMBNAIL_STATUS_PENDING) gfx_thumbnail_init_fade(p_gfx_thumb, thumbnail); } @@ -459,7 +478,7 @@ void gfx_thumbnail_request_file( /* Reset thumbnail, then set 'missing' status by default * (saves a number of checks later) */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); /* Check if file path is valid */ if ( (!file_path || !*file_path) @@ -480,7 +499,7 @@ void gfx_thumbnail_request_file( file_path, (video_driver_get_disp_flags() & VIDEO_FLAG_USE_RGBA), gfx_thumbnail_upscale_threshold, gfx_thumbnail_handle_upload, thumbnail_tag)) - thumbnail->status = GFX_THUMBNAIL_STATUS_PENDING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_PENDING); } /* Resets (and free()s the current texture of) the @@ -502,7 +521,7 @@ void gfx_thumbnail_reset(gfx_thumbnail_t *thumbnail) } /* 
Reset all parameters */ - thumbnail->status = GFX_THUMBNAIL_STATUS_UNKNOWN; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_UNKNOWN); thumbnail->texture = 0; thumbnail->width = 0; thumbnail->height = 0; @@ -549,7 +568,7 @@ void gfx_thumbnail_request_stream( /* Only process request if current status * is GFX_THUMBNAIL_STATUS_UNKNOWN */ if ( !thumbnail - || (thumbnail->status != GFX_THUMBNAIL_STATUS_UNKNOWN)) + || (GFX_THUMB_STATUS_LOAD(&thumbnail->status) != GFX_THUMBNAIL_STATUS_UNKNOWN)) return; /* Check if stream delay timer has elapsed */ @@ -564,7 +583,7 @@ void gfx_thumbnail_request_stream( * > Reset thumbnail and set missing status * to prevent repeated load attempts */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); thumbnail->alpha = 1.0f; return; } @@ -615,8 +634,8 @@ void gfx_thumbnail_request_streams( /* Only process request if current status * is GFX_THUMBNAIL_STATUS_UNKNOWN */ - process_r = (right_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); - process_l = (left_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); + process_r = (GFX_THUMB_STATUS_LOAD(&right_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); + process_l = (GFX_THUMB_STATUS_LOAD(&left_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); if (process_r || process_l) { @@ -652,14 +671,14 @@ void gfx_thumbnail_request_streams( if (request_r) { gfx_thumbnail_reset(right_thumbnail); - right_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&right_thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); right_thumbnail->alpha = 1.0f; } if (request_l) { gfx_thumbnail_reset(left_thumbnail); - left_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&left_thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); left_thumbnail->alpha = 1.0f; } @@ -712,7 +731,7 @@ void gfx_thumbnail_process_stream( /* Entry is on-screen * > Only 
process if current status is * GFX_THUMBNAIL_STATUS_UNKNOWN */ - if (thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) { gfx_thumbnail_state_t *p_gfx_thumb = &gfx_thumb_st; @@ -729,7 +748,7 @@ void gfx_thumbnail_process_stream( /* Content is invalid * > Reset thumbnail and set missing status */ gfx_thumbnail_reset(thumbnail); - thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); thumbnail->alpha = 1.0f; return; } @@ -748,7 +767,7 @@ void gfx_thumbnail_process_stream( * > If status is GFX_THUMBNAIL_STATUS_UNKNOWN, * thumbnail is already in a blank state - but we * must ensure that delay timer is set to zero */ - if (thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) thumbnail->delay_timer = 0.0f; /* In all other cases, reset thumbnail */ else @@ -786,8 +805,8 @@ void gfx_thumbnail_process_streams( /* Entry is on-screen * > Only process if current status is * GFX_THUMBNAIL_STATUS_UNKNOWN */ - bool process_r = (right_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); - bool process_l = (left_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN); + bool process_r = (GFX_THUMB_STATUS_LOAD(&right_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); + bool process_l = (GFX_THUMB_STATUS_LOAD(&left_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN); if (process_r || process_l) { @@ -824,14 +843,14 @@ void gfx_thumbnail_process_streams( if (request_r) { gfx_thumbnail_reset(right_thumbnail); - right_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&right_thumbnail->status, GFX_THUMBNAIL_STATUS_MISSING); right_thumbnail->alpha = 1.0f; } if (request_l) { gfx_thumbnail_reset(left_thumbnail); - left_thumbnail->status = GFX_THUMBNAIL_STATUS_MISSING; + GFX_THUMB_STATUS_STORE(&left_thumbnail->status, 
GFX_THUMBNAIL_STATUS_MISSING); left_thumbnail->alpha = 1.0f; } @@ -860,12 +879,12 @@ void gfx_thumbnail_process_streams( * thumbnail is already in a blank state - but we * must ensure that delay timer is set to zero * > In all other cases, reset thumbnail */ - if (right_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&right_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) right_thumbnail->delay_timer = 0.0f; else gfx_thumbnail_reset(right_thumbnail); - if (left_thumbnail->status == GFX_THUMBNAIL_STATUS_UNKNOWN) + if (GFX_THUMB_STATUS_LOAD(&left_thumbnail->status) == GFX_THUMBNAIL_STATUS_UNKNOWN) left_thumbnail->delay_timer = 0.0f; else gfx_thumbnail_reset(left_thumbnail); @@ -1038,7 +1057,12 @@ void gfx_thumbnail_draw( thumb_snapshot.height = thumb_height; thumb_snapshot.alpha = thumb_alpha; thumb_snapshot.flags = thumb_flags; - thumb_snapshot.status = GFX_THUMBNAIL_STATUS_AVAILABLE; + /* Local snapshot: initialise the atomic-typed status + * field via _init since this is its first write -- + * direct assignment to the C11/C++11 atomic forms is + * illegal. This local is never observed by another + * thread, so no ordering is required. */ + retro_atomic_int_init(&thumb_snapshot.status, GFX_THUMBNAIL_STATUS_AVAILABLE); thumb_snapshot.delay_timer = 0.0f; gfx_thumbnail_get_draw_dimensions( &thumb_snapshot, width, height, scale_factor, diff --git a/gfx/gfx_thumbnail.h b/gfx/gfx_thumbnail.h index 686407e0f919..f38669b2df26 100644 --- a/gfx/gfx_thumbnail.h +++ b/gfx/gfx_thumbnail.h @@ -28,6 +28,7 @@ #include #include +#include #include "gfx_animation.h" @@ -185,7 +186,20 @@ enum gfx_thumbnail_flags }; /* Holds all runtime parameters associated with - * an entry thumbnail */ + * an entry thumbnail. 
+ * + * @c status is read by the video thread (via + * gfx_thumbnail_draw) and written by both the upload-callback + * thread (release-store after publishing texture/width/height) + * and the main thread (a number of plain transitions during menu + * processing). retro_atomic_int_t backs the field with an + * atomic-typed integer that is safe to read/write through the + * retro_atomic_*_int API on every supported backend. + * + * Same size and alignment as plain int on every backend; struct + * layout is unchanged. The cost of the acquire/release barriers + * on weak-memory ARM/PowerPC is negligible at this field's + * access rates (menu and frame draw, never per-sample). */ typedef struct { uintptr_t texture; @@ -193,10 +207,37 @@ typedef struct unsigned height; float alpha; float delay_timer; - enum gfx_thumbnail_status status; + retro_atomic_int_t status; uint8_t flags; } gfx_thumbnail_t; +/* Field-by-field initializer for non-trivial gfx_thumbnail_t. + * + * Now that .status is atomically-typed, a wholesale + * memset(t, 0, sizeof(*t)) of a struct containing this type + * warns under CXX_BUILD's C++ compile (the struct is no longer + * trivially-copyable per C++11), even though the resulting + * bytes are identical. RetroArch's CXX_BUILD mode + * compiles every .c file as C++, so this helper is required + * for clean builds, not just style. + * + * gfx_thumbnail_init_blank zero-inits the small struct field + * by field, using retro_atomic_int_init() for the atomic field + * so the first write is well-defined under C11 stdatomic and + * C++11 std::atomic. Equivalent in effect to the pre-port + * memset on every real backend (status field is also zero -> + * UNKNOWN). 
*/ +static INLINE void gfx_thumbnail_init_blank(gfx_thumbnail_t *t) +{ + t->texture = 0; + t->width = 0; + t->height = 0; + t->alpha = 0.0f; + t->delay_timer = 0.0f; + retro_atomic_int_init(&t->status, 0 /* GFX_THUMBNAIL_STATUS_UNKNOWN */); + t->flags = 0; +} + /* Holds all configuration parameters associated * with a thumbnail shadow effect */ typedef struct diff --git a/gfx/gfx_widgets.c b/gfx/gfx_widgets.c index b633fc5f007b..2b7ff979e0f0 100644 --- a/gfx/gfx_widgets.c +++ b/gfx/gfx_widgets.c @@ -189,6 +189,13 @@ static void msg_widget_msg_transition_animation_done(void *userdata) msg->msg_transition_animation = 0.0f; } +/* Forward declaration: msg_queue_free is defined further below + * (around line 550) but is needed by msg_queue_push for the + * race-loss rollback path added in the producer-locking fix. */ +static void gfx_widgets_msg_queue_free( + dispgfx_widget_t *p_dispwidget, + disp_widget_msg_t *msg); + void gfx_widgets_msg_queue_push( retro_task_t *task, const char *msg, @@ -203,7 +210,18 @@ void gfx_widgets_msg_queue_push( disp_widget_msg_t *msg_widget = NULL; dispgfx_widget_t *p_dispwidget = &dispwidget_st; - if (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue) > 0) + /* The outer FIFO_WRITE_AVAIL fast-path check that used to wrap + * this function body has been removed: reading the FIFO cursors + * outside msg_queue_lock is a data race against the producer + * lock-protected fifo_write below (TSan-detectable; benign on + * x86 TSO but real on weak-memory hardware). The locked + * avail re-check at the fifo_write site is the correctness gate. + * + * Removing the outer gate also fixes a latent behaviour bug: + * the update-existing branch (else clause below) does not write + * to the FIFO -- it only mutates an already-tracked widget -- + * so suppressing it on FIFO-full was wrong. Existing widgets + * now get updated regardless of FIFO state. 
*/ { /* Get current msg if it exists */ if (task && task->frontend_userdata) @@ -376,8 +394,45 @@ void gfx_widgets_msg_queue_push( msg_widget->text_height *= 2; } - fifo_write(&p_dispwidget->msg_queue, - &msg_widget, sizeof(msg_widget)); + /* Lock the FIFO across the avail re-check + fifo_write so + * concurrent producers can't both pass the check and + * overwrite each other's data. fifo_write itself does no + * bounds checking -- it silently wraps -- so the inner + * avail check is the actual gate. + * + * There is no longer an unlocked outer FIFO_WRITE_AVAIL_NONPTR + * fast-path check at the top of this function (it was removed; + * see the comment there): the locked check below is the sole + * availability gate and the one whose accuracy matters for + * correctness. */ + { +#ifdef HAVE_THREADS + bool fifo_full; + slock_lock(p_dispwidget->msg_queue_lock); + fifo_full = (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue) + < sizeof(msg_widget)); + if (!fifo_full) + fifo_write(&p_dispwidget->msg_queue, + &msg_widget, sizeof(msg_widget)); + slock_unlock(p_dispwidget->msg_queue_lock); + + if (fifo_full) + { + /* Lost the race against another producer. Roll back + * the widget we just allocated -- nobody else has a + * reference to it yet (we only got here from the + * spawn-new branch, where msg_widget is freshly + * allocated above), so this is safe. */ + gfx_widgets_msg_queue_free(p_dispwidget, msg_widget); + free(msg_widget); + return; + } +#else + fifo_write(&p_dispwidget->msg_queue, + &msg_widget, sizeof(msg_widget)); +#endif + } } /* Update task info */ else @@ -962,9 +1017,16 @@ void gfx_widgets_iterate( /* Messages queue */ - /* Consume one message if available */ - if ((FIFO_READ_AVAIL_NONPTR(p_dispwidget->msg_queue) > 0) - && !(p_dispwidget->flags & DISPGFX_WIDGET_FLAG_MOVING) + /* Consume one message if available.
The outer condition no + * longer reads FIFO_READ_AVAIL_NONPTR -- doing so outside + * msg_queue_lock would race with concurrent producer fifo_writes + * (TSan-detectable; benign on x86 TSO but real on weak-memory + * hardware). The locked re-check at the fifo_read site below + * is the correctness gate. current_msgs_size and the MOVING + * flag remain in the outer guard -- they are unrelated to the + * msg_queue race; their own synchronisation discipline is + * handled by current_msgs_lock and is unchanged here. */ + if ( !(p_dispwidget->flags & DISPGFX_WIDGET_FLAG_MOVING) && (p_dispwidget->current_msgs_size < ARRAY_SIZE(p_dispwidget->current_msgs))) { disp_widget_msg_t *msg_widget = NULL; @@ -975,9 +1037,21 @@ void gfx_widgets_iterate( if (p_dispwidget->current_msgs_size < ARRAY_SIZE(p_dispwidget->current_msgs)) { + /* Lock around the FIFO read to serialise against + * concurrent producer fifo_writes. Held strictly inside + * current_msgs_lock (which protects current_msgs[]) -- + * lock order is consistent with all other call sites: + * msg_queue_lock is always the inner lock when both + * are held. 
*/ +#ifdef HAVE_THREADS + slock_lock(p_dispwidget->msg_queue_lock); +#endif if (FIFO_READ_AVAIL_NONPTR(p_dispwidget->msg_queue) > 0) fifo_read(&p_dispwidget->msg_queue, &msg_widget, sizeof(msg_widget)); +#ifdef HAVE_THREADS + slock_unlock(p_dispwidget->msg_queue_lock); +#endif if (msg_widget) { @@ -1999,6 +2073,8 @@ static void gfx_widgets_free(dispgfx_widget_t *p_dispwidget) slock_free(p_dispwidget->current_msgs_lock); p_dispwidget->current_msgs_lock = NULL; + slock_free(p_dispwidget->msg_queue_lock); + p_dispwidget->msg_queue_lock = NULL; #endif p_dispwidget->msg_queue_tasks_count = 0; @@ -2150,6 +2226,7 @@ bool gfx_widgets_init( #ifdef HAVE_THREADS p_dispwidget->current_msgs_lock = slock_new(); + p_dispwidget->msg_queue_lock = slock_new(); #endif fill_pathname_join_special( diff --git a/gfx/gfx_widgets.h b/gfx/gfx_widgets.h index d73a0a95ba39..9fd02e90d0fd 100644 --- a/gfx/gfx_widgets.h +++ b/gfx/gfx_widgets.h @@ -191,6 +191,22 @@ typedef struct dispgfx_widget #ifdef HAVE_THREADS slock_t* current_msgs_lock; + /* Serialises producer and consumer access to msg_queue. + * Producers (gfx_widgets_msg_queue_push) can be called from + * any thread -- the threaded task system at libretro-common/ + * queues/task_queue.c runs a worker thread, and several call + * paths reach the producer without holding any other lock + * (notably gfx/video_driver.c::video_driver_frame, which + * releases RUNLOOP_MSG_QUEUE_LOCK before the call). The + * consumer (gfx_widgets_iterate) runs on the main thread + * but did not previously serialise its fifo_read against + * concurrent producer fifo_writes. This separates concerns: + * current_msgs_lock continues to guard the displayed-message + * deque (current_msgs[]); msg_queue_lock guards the FIFO + * staging buffer. Acquired by every fifo_* call site and + * by FIFO_*_AVAIL checks where the result is used to gate a + * subsequent FIFO operation. 
*/ + slock_t* msg_queue_lock; #endif fifo_buffer_t msg_queue; disp_widget_msg_t* current_msgs[MSG_QUEUE_ONSCREEN_MAX]; diff --git a/griffin/griffin.c b/griffin/griffin.c index 0da5730a18d8..18d8dccc3d7e 100644 --- a/griffin/griffin.c +++ b/griffin/griffin.c @@ -815,6 +815,7 @@ INPUT (HID) FIFO BUFFER ============================================================ */ #include "../libretro-common/queues/fifo_queue.c" +#include "../libretro-common/queues/retro_spsc.c" /*============================================================ AUDIO RESAMPLER diff --git a/libretro-common/features/features_cpu.c b/libretro-common/features/features_cpu.c index d4b1ca13fcdc..236b18561afd 100644 --- a/libretro-common/features/features_cpu.c +++ b/libretro-common/features/features_cpu.c @@ -614,56 +614,78 @@ uint64_t cpu_features_get(void) const int avx_flags = (1 << 27) | (1 << 28); #endif #if defined(__MACH__) - size_t _len = sizeof(size_t); - if (sysctlbyname("hw.optional.floatingpoint", NULL, &_len, NULL, 0) == 0) + /* sysctlbyname() returns 0 (success) whenever the key exists, regardless + * of its value. On Intel Macs the hw.optional.* keys are always present + * but report 0 when the feature is unsupported (e.g. avx512f on pre-Skylake + * Xeon CPUs). We therefore have to read the value, not just the success + * code. 
*/ + int _val = 0; + size_t _len = sizeof(_val); + if ( sysctlbyname("hw.optional.floatingpoint", &_val, &_len, NULL, 0) == 0 + && _val) cpu |= RETRO_SIMD_CMOV; #if defined(CPU_X86) - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.mmx", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.mmx", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_MMX | RETRO_SIMD_MMXEXT; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse2", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse2", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE2; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse3", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse3", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE3; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.supplementalsse3", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.supplementalsse3", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSSE3; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse4_1", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse4_1", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE4; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.sse4_2", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.sse4_2", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_SSE42; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.aes", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.aes", &_val, &_len, 
NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AES; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.avx1_0", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.avx1_0", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AVX; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.avx2_0", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.avx2_0", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AVX2; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.avx512f", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.avx512f", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_AVX512; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.altivec", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.altivec", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_VMX; #else - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.neon", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.neon", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_NEON; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.neon_fp16", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.neon_fp16", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_VFPV3; - _len = sizeof(size_t); - if (sysctlbyname("hw.optional.neon_hpfp", NULL, &_len, NULL, 0) == 0) + _val = 0; + _len = sizeof(_val); + if (sysctlbyname("hw.optional.neon_hpfp", &_val, &_len, NULL, 0) == 0 && _val) cpu |= RETRO_SIMD_VFPV4; #endif #elif defined(_XBOX1) diff --git a/libretro-common/include/retro_spsc.h b/libretro-common/include/retro_spsc.h new file mode 100644 index 000000000000..9191d1f0e656 --- /dev/null +++ b/libretro-common/include/retro_spsc.h @@ -0,0 +1,270 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * 
--------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_spsc.h). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __LIBRETRO_SDK_SPSC_H +#define __LIBRETRO_SDK_SPSC_H + +/* + * retro_spsc.h - portable single-producer / single-consumer byte queue + * + * Lock-free byte-stream queue with one writer thread and one reader + * thread. Uses release-store / acquire-load on two cursors (the + * Lamport / Disruptor design) rather than a single shared count, so + * neither thread issues atomic RMW operations on the hot path. + * + * Constraints: + * - Exactly ONE producer and ONE consumer. No locking is performed; + * two producers or two consumers will corrupt the queue. + * - Capacity must be power-of-2. Init rounds up. 
+ * - Maximum capacity is SIZE_MAX/2 (so head - tail never overflows + * the well-defined unsigned modular range). + * - The buffer is byte-addressed. Callers pushing structured records + * are responsible for framing. + * + * Memory model: + * - Producer writes data into the buffer (non-atomic stores), then + * publishes the new head via release-store. + * - Consumer acquire-loads head (sees data writes that preceded the + * release), reads data (non-atomic loads), then publishes the new + * tail via release-store. + * - Producer acquire-loads tail to know how much space is free. + * - This pairing gives the SPSC invariant on every backend + * retro_atomic.h supports lock-freely. On the volatile fallback + * (no real backend), correctness reduces to single-core or x86 TSO, + * same as every other retro_atomic.h caller. + * + * Cache behaviour: + * - head and tail are placed on separate cache lines via explicit + * padding to RETRO_SPSC_CACHE_LINE. Without this, the producer's + * publish would invalidate the consumer's tail line and vice versa, + * halving throughput on contended SMP. The padding is a + * performance hint; correctness does not depend on it. + * + * Lifetime: + * - retro_spsc_init allocates an internal buffer; retro_spsc_free + * releases it. The caller owns the retro_spsc_t struct itself. + * - The struct is NOT itself thread-safe to construct or destroy + * while in use. Build it on one thread, hand the producer pointer + * to one thread and the consumer pointer to another, then tear + * down on one thread after both producer and consumer have stopped. 
+ * + * Example: + * + * retro_spsc_t q; + * retro_spsc_init(&q, 4096); + * + * // Producer thread: + * uint8_t msg[64] = ...; + * if (retro_spsc_write_avail(&q) >= sizeof(msg)) + * retro_spsc_write(&q, msg, sizeof(msg)); + * + * // Consumer thread: + * uint8_t msg[64]; + * if (retro_spsc_read_avail(&q) >= sizeof(msg)) + * retro_spsc_read(&q, msg, sizeof(msg)); + * + * retro_spsc_free(&q); + * + * Comparison with libretro-common's existing fifo_queue_t: + * - fifo_queue uses a slock_t internally; safe with multiple producers + * and multiple consumers, but every push/pop takes a mutex. + * - retro_spsc is lock-free but limited to one producer and one + * consumer. Use this when (a) the producer/consumer count is fixed + * at one each (audio driver -> backend, video -> task system, etc.) + * and (b) the lock contention in fifo_queue measurably matters. + * - For most code paths, fifo_queue is the better default. This + * primitive is for hot paths where lock-free is a measured win. + */ + +#include +#include +#include + +#include +#include + +RETRO_BEGIN_DECLS + +/* Cache-line padding size. 64 covers x86-64, AArch64, most modern + * ARMs, and modern PowerPC. Apple Silicon's effective coherency line + * is 128 due to its M-series cluster topology; over-padding to 64 + * still avoids false sharing, just slightly less efficiently. Older + * 32-bit ARMs (ARMv6, Cortex-A8) used 32-byte lines but tolerate the + * larger pad without correctness impact. */ +#ifndef RETRO_SPSC_CACHE_LINE +#define RETRO_SPSC_CACHE_LINE 64 +#endif + +/* Padding helper. C89 has no _Static_assert; the array-size trick is + * portable. We pad to the next multiple of cache-line size after the + * pointer-and-size_t prefix and after each cursor. */ + +/* Effective cache line for padding. We pad each cursor to a full + * cache line. 
If the pre-cursor fields exceed RETRO_SPSC_CACHE_LINE + * on some hypothetical small-cache target, the underflow on the + * subtraction would produce a giant array; guard with a max() so + * the pad is always at least 1 byte and never underflows. */ +#define RETRO_SPSC_PAD0_BYTES \ + ((RETRO_SPSC_CACHE_LINE > (sizeof(uint8_t*) + sizeof(size_t))) \ + ? (RETRO_SPSC_CACHE_LINE - (sizeof(uint8_t*) + sizeof(size_t))) \ + : 1) +#define RETRO_SPSC_PAD1_BYTES \ + ((RETRO_SPSC_CACHE_LINE > sizeof(retro_atomic_size_t)) \ + ? (RETRO_SPSC_CACHE_LINE - sizeof(retro_atomic_size_t)) \ + : 1) + +typedef struct retro_spsc +{ + uint8_t *buffer; + size_t capacity; /* power of 2; mask = capacity - 1 */ + /* Pad so head sits on a fresh cache line, isolating it from + * the buffer/capacity fields that init may touch. */ + uint8_t _pad0[RETRO_SPSC_PAD0_BYTES]; + retro_atomic_size_t head; /* producer publishes; consumer reads */ + /* Pad so tail sits on its own cache line, isolating it from head. */ + uint8_t _pad1[RETRO_SPSC_PAD1_BYTES]; + retro_atomic_size_t tail; /* consumer publishes; producer reads */ +} retro_spsc_t; + +/** + * retro_spsc_init: + * @q : The queue. + * @min_capacity : Requested capacity in bytes. Rounded up to the + * next power of 2. Must be > 0 and <= SIZE_MAX/2. + * + * Allocates the internal buffer and zero-initialises both cursors. + * Both cursors begin at 0; the queue is empty. + * + * Returns: true on success, false on allocation failure or invalid + * @min_capacity. + * + * After this returns true, the producer thread can call + * retro_spsc_write / retro_spsc_write_avail and the consumer thread can + * call retro_spsc_read / retro_spsc_read_avail. + */ +bool retro_spsc_init(retro_spsc_t *q, size_t min_capacity); + +/** + * retro_spsc_free: + * @q : The queue. + * + * Releases the internal buffer. Caller must ensure both producer + * and consumer have stopped using @q before calling. Safe to call + * on a queue that retro_spsc_init failed on (no-op). 
+ */ +void retro_spsc_free(retro_spsc_t *q); + +/** + * retro_spsc_clear: + * @q : The queue. + * + * Resets head and tail to 0, discarding any unread data. The + * underlying buffer is preserved (no reallocation). + * + * SAFETY: callable only when both the producer and consumer are + * quiesced -- e.g. before either has started, or after both have + * stopped. Concurrent calls with a live producer or consumer are + * a data race. This is the same lifetime constraint as + * retro_spsc_init / retro_spsc_free. + * + * Typical use is when a stream is stopped and restarted (e.g. an + * audio driver pausing and resuming, or switching device formats), + * and stale buffered data should be discarded before the new + * stream begins. + */ +void retro_spsc_clear(retro_spsc_t *q); + +/** + * retro_spsc_write_avail: + * @q : The queue. + * + * Producer-side query: how many bytes can be written before the + * queue is full. + * + * Returns: byte count, in [0, capacity]. + * + * SAFETY: callable only from the producer thread. + */ +size_t retro_spsc_write_avail(const retro_spsc_t *q); + +/** + * retro_spsc_read_avail: + * @q : The queue. + * + * Consumer-side query: how many bytes are available to read. + * + * Returns: byte count, in [0, capacity]. + * + * SAFETY: callable only from the consumer thread. + */ +size_t retro_spsc_read_avail(const retro_spsc_t *q); + +/** + * retro_spsc_write: + * @q : The queue. + * @data : Source bytes. + * @bytes : Number of bytes to attempt to write. + * + * Writes up to @bytes from @data into the queue. If the queue has + * less than @bytes free, writes only what fits. + * + * Returns: number of bytes actually written. + * + * SAFETY: callable only from the producer thread. Concurrent calls + * with another producer corrupt the queue. + */ +size_t retro_spsc_write(retro_spsc_t *q, const void *data, size_t bytes); + +/** + * retro_spsc_read: + * @q : The queue. + * @data : Destination bytes. + * @bytes : Number of bytes to attempt to read. 
+ * + * Reads up to @bytes from the queue into @data. If the queue has + * less than @bytes available, reads only what is present. + * + * Returns: number of bytes actually read. + * + * SAFETY: callable only from the consumer thread. Concurrent calls + * with another consumer corrupt the queue. + */ +size_t retro_spsc_read(retro_spsc_t *q, void *data, size_t bytes); + +/** + * retro_spsc_peek: + * @q : The queue. + * @data : Destination bytes. + * @bytes : Number of bytes to peek. + * + * Like retro_spsc_read but does not advance the read cursor. The + * peeked bytes remain available for the next read. + * + * Returns: number of bytes peeked. + * + * SAFETY: callable only from the consumer thread. + */ +size_t retro_spsc_peek(const retro_spsc_t *q, void *data, size_t bytes); + +RETRO_END_DECLS + +#endif /* __LIBRETRO_SDK_SPSC_H */ diff --git a/libretro-common/queues/retro_spsc.c b/libretro-common/queues/retro_spsc.c new file mode 100644 index 000000000000..c2fd7e0831c9 --- /dev/null +++ b/libretro-common/queues/retro_spsc.c @@ -0,0 +1,208 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_spsc.c). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +/* Round @v up to the next power of 2. Returns 0 if @v == 0 or if + * the next power of 2 would overflow (caller checks @v <= SIZE_MAX/2). */ +static size_t spsc_round_up_pow2(size_t v) +{ + size_t r; + if (v == 0) + return 0; + /* If already a power of 2, return as-is. */ + if ((v & (v - 1)) == 0) + return v; + /* Round up. Loop terminates because v <= SIZE_MAX/2 means r + * cannot overflow. */ + r = 1; + while (r < v) + r <<= 1; + return r; +} + +bool retro_spsc_init(retro_spsc_t *q, size_t min_capacity) +{ + size_t cap; + + if (!q || min_capacity == 0 || min_capacity > (SIZE_MAX / 2)) + return false; + + cap = spsc_round_up_pow2(min_capacity); + if (cap == 0 || cap > (SIZE_MAX / 2)) + return false; + + q->buffer = (uint8_t*)malloc(cap); + if (!q->buffer) + return false; + + q->capacity = cap; + retro_atomic_size_init(&q->head, 0); + retro_atomic_size_init(&q->tail, 0); + return true; +} + +void retro_spsc_free(retro_spsc_t *q) +{ + if (!q) + return; + if (q->buffer) + { + free(q->buffer); + q->buffer = NULL; + } + q->capacity = 0; +} + +void retro_spsc_clear(retro_spsc_t *q) +{ + if (!q) + return; + /* Quiescence is the caller's responsibility (documented). + * Under that assumption, no other thread is touching head or + * tail, so plain init is correct here -- and necessary, because + * plain assignment to a retro_atomic_size_t is illegal under + * the C11 stdatomic backend. 
*/ + retro_atomic_size_init(&q->head, 0); + retro_atomic_size_init(&q->tail, 0); +} + +size_t retro_spsc_write_avail(const retro_spsc_t *q) +{ + /* Producer query. We read our own head (which we wrote) and the + * consumer's tail (which they wrote with a release-store). + * acquire-load on tail so any reads we do based on the freed + * space are ordered after the consumer's tail publication. + * + * We can read our own head without an acquire because we are the + * only writer to it; but retro_atomic.h does not expose relaxed + * loads, so use acquire. Cost is one extra fence on weak-memory + * targets, which is negligible compared to the actual work + * surrounding this query. */ + size_t head = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->head); + size_t tail = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->tail); + /* head - tail is well-defined modular arithmetic on size_t. + * Invariant: 0 <= head - tail <= capacity. */ + return q->capacity - (head - tail); +} + +size_t retro_spsc_read_avail(const retro_spsc_t *q) +{ + /* Consumer query. acquire-load on head pairs with the producer's + * release-store on head, so the data writes that preceded the + * release are visible by the time we read them. 
*/ + size_t head = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->head); + size_t tail = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->tail); + return head - tail; +} + +size_t retro_spsc_write(retro_spsc_t *q, const void *data, size_t bytes) +{ + size_t mask, head_idx, first; + const uint8_t *src = (const uint8_t*)data; + /* read tail first to know how much room there is */ + size_t head = retro_atomic_load_acquire_size(&q->head); + size_t tail = retro_atomic_load_acquire_size(&q->tail); + size_t avail = q->capacity - (head - tail); + if (bytes > avail) + bytes = avail; + if (bytes == 0) + return 0; + + mask = q->capacity - 1; + head_idx = head & mask; + + /* first = bytes from head_idx to end-of-buffer */ + first = q->capacity - head_idx; + if (first > bytes) + first = bytes; + + memcpy(q->buffer + head_idx, src, first); + memcpy(q->buffer, src + first, bytes - first); + + /* Publish: release-store ensures the memcpys above are globally + * visible before the consumer observes the new head. */ + retro_atomic_store_release_size(&q->head, head + bytes); + return bytes; +} + +size_t retro_spsc_read(retro_spsc_t *q, void *data, size_t bytes) +{ + size_t mask, tail_idx, first; + uint8_t *dst = (uint8_t*)data; + /* acquire on head pairs with producer's release-store; this is + * what makes the subsequent memcpys safe to read. */ + size_t head = retro_atomic_load_acquire_size(&q->head); + size_t tail = retro_atomic_load_acquire_size(&q->tail); + size_t avail = head - tail; + if (bytes > avail) + bytes = avail; + if (bytes == 0) + return 0; + + mask = q->capacity - 1; + tail_idx = tail & mask; + + first = q->capacity - tail_idx; + if (first > bytes) + first = bytes; + + memcpy(dst, q->buffer + tail_idx, first); + memcpy(dst + first, q->buffer, bytes - first); + + /* Publish: release-store so the producer can re-use this space. 
*/ + retro_atomic_store_release_size(&q->tail, tail + bytes); + return bytes; +} + +size_t retro_spsc_peek(const retro_spsc_t *q, void *data, size_t bytes) +{ + size_t mask, tail_idx, first; + uint8_t *dst = (uint8_t*)data; + size_t head = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->head); + size_t tail = retro_atomic_load_acquire_size( + (retro_atomic_size_t*)&q->tail); + size_t avail = head - tail; + if (bytes > avail) + bytes = avail; + if (bytes == 0) + return 0; + + mask = q->capacity - 1; + tail_idx = tail & mask; + + first = q->capacity - tail_idx; + if (first > bytes) + first = bytes; + + memcpy(dst, q->buffer + tail_idx, first); + memcpy(dst + first, q->buffer, bytes - first); + /* No tail update: peek does not consume. */ + return bytes; +} diff --git a/libretro-common/samples/queues/retro_spsc_test/Makefile b/libretro-common/samples/queues/retro_spsc_test/Makefile new file mode 100644 index 000000000000..fffcba4c9a12 --- /dev/null +++ b/libretro-common/samples/queues/retro_spsc_test/Makefile @@ -0,0 +1,44 @@ +TARGET := retro_spsc_test + +LIBRETRO_COMM_DIR := ../../.. + +# retro_spsc.c is the lock-free SPSC byte queue under test; the test +# harness drives a producer and consumer thread through rthreads.c +# (HAVE_THREADS). retro_atomic.h is header-only and pulled in via +# the queue's transitive includes. +# +# Build under SANITIZER=thread (TSan) for the regression-catching +# CI run; SANITIZER=address,undefined as the default for the +# auto-discovery workflow gives a coarser smoke test. Both pass on +# correct code. +SOURCES := \ + retro_spsc_test.c \ + $(LIBRETRO_COMM_DIR)/queues/retro_spsc.c \ + $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + +OBJS := $(SOURCES:.c=.o) + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 \ + -DHAVE_THREADS -I$(LIBRETRO_COMM_DIR)/include +LDFLAGS += -lpthread +# rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on +# older glibc those live in -lrt. Harmless on newer glibc. 
+LDFLAGS += -lrt + +ifneq ($(SANITIZER),) + CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) + LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) +endif + +all: $(TARGET) + +%.o: %.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(TARGET): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) $(OBJS) + +.PHONY: clean diff --git a/libretro-common/samples/queues/retro_spsc_test/retro_spsc_test.c b/libretro-common/samples/queues/retro_spsc_test/retro_spsc_test.c new file mode 100644 index 000000000000..408858fa2064 --- /dev/null +++ b/libretro-common/samples/queues/retro_spsc_test/retro_spsc_test.c @@ -0,0 +1,265 @@ +/* SPSC stress test: producer pushes N bytes total; consumer reads N + * bytes; verify the byte stream is exactly an expected sequence. + * Validates ordering, no torn reads, no duplicates, no drops. + * + * Design notes: + * - Producer writes incrementing 32-bit "tokens" (i = 1, 2, 3, ...). + * - Consumer reads the byte stream and reassembles tokens. + * - Each token is checked against expected sequence; mismatch => + * reordering or torn write. + * - Producer/consumer race in tight loops with no per-iteration + * handshake; a real lock-free SPSC must handle this concurrency + * without external synchronisation. + * - Run under TSan for race detection. Run without sanitizer for + * throughput sanity. */ + +#include +#include +#include +#include + +#include +#include + +/* Total tokens to push. ~10M gives a reproducible test of <2s on + * x86_64; under qemu-aarch64 and TSan it's ~30s. */ +#define TOTAL_TOKENS 10000000 + +/* Buffer capacity in bytes. Smaller -> more producer/consumer + * interleaving (better race coverage); larger -> better throughput + * but less torture. */ +#define BUF_BYTES 4096 + +typedef struct +{ + retro_spsc_t q; + /* Captured by the producer's loop sentinel and read by main after + * join, so it doesn't need to be atomic. 
*/ + unsigned long mismatches; + unsigned long produced_tokens; + unsigned long consumed_tokens; +} test_state_t; + +static void producer_thread(void *arg) +{ + test_state_t *s = (test_state_t*)arg; + uint32_t token; + + for (token = 1; token <= TOTAL_TOKENS; token++) + { + /* Spin until there is room for a full token. */ + while (retro_spsc_write_avail(&s->q) < sizeof(token)) + ; /* spin */ + retro_spsc_write(&s->q, &token, sizeof(token)); + s->produced_tokens++; + } +} + +static void consumer_thread(void *arg) +{ + test_state_t *s = (test_state_t*)arg; + uint32_t expected_token = 1; + + while (expected_token <= TOTAL_TOKENS) + { + uint32_t got; + while (retro_spsc_read_avail(&s->q) < sizeof(got)) + ; /* spin */ + retro_spsc_read(&s->q, &got, sizeof(got)); + if (got != expected_token) + s->mismatches++; + expected_token++; + s->consumed_tokens++; + } +} + +static int run_stress(void) +{ + test_state_t s; + sthread_t *prod; + sthread_t *cons; + + memset(&s, 0, sizeof(s)); + if (!retro_spsc_init(&s.q, BUF_BYTES)) + { + fprintf(stderr, "FAIL: retro_spsc_init\n"); + return 1; + } + + prod = sthread_create(producer_thread, &s); + if (!prod) + { + fprintf(stderr, "FAIL: sthread_create(producer)\n"); + retro_spsc_free(&s.q); + return 1; + } + cons = sthread_create(consumer_thread, &s); + if (!cons) + { + fprintf(stderr, "FAIL: sthread_create(consumer)\n"); + sthread_join(prod); + retro_spsc_free(&s.q); + return 1; + } + + sthread_join(prod); + sthread_join(cons); + retro_spsc_free(&s.q); + + if (s.mismatches != 0) + { + fprintf(stderr, + "FAIL: %lu mismatched tokens out of %lu\n", + s.mismatches, s.consumed_tokens); + return 1; + } + if (s.produced_tokens != TOTAL_TOKENS + || s.consumed_tokens != TOTAL_TOKENS) + { + fprintf(stderr, + "FAIL: produced=%lu consumed=%lu (expected %d each)\n", + s.produced_tokens, s.consumed_tokens, TOTAL_TOKENS); + return 1; + } + + printf("[pass] stress: %d tokens through %d-byte buffer, " + "0 mismatches\n", + TOTAL_TOKENS, BUF_BYTES); + 
 return 0; +} + +/* Single-threaded property checks: verify the API contracts that + * don't require concurrency. */ +static int run_property_checks(void) +{ + retro_spsc_t q; + uint8_t buf[64]; + uint8_t readback[64]; + size_t i, n; + + /* init with non-power-of-2 should round up */ + if (!retro_spsc_init(&q, 100)) + { + fprintf(stderr, "FAIL: init(100)\n"); + return 1; + } + if (q.capacity != 128) + { + fprintf(stderr, "FAIL: capacity %zu != 128 (round up)\n", + q.capacity); + retro_spsc_free(&q); + return 1; + } + + /* fresh queue: read_avail = 0, write_avail = capacity */ + if (retro_spsc_read_avail(&q) != 0 + || retro_spsc_write_avail(&q) != 128) + { + fprintf(stderr, "FAIL: fresh-queue avails\n"); + retro_spsc_free(&q); + return 1; + } + + /* write 64, read 64, verify */ + for (i = 0; i < sizeof(buf); i++) + buf[i] = (uint8_t)(i + 1); + n = retro_spsc_write(&q, buf, sizeof(buf)); + if (n != sizeof(buf)) + { + fprintf(stderr, "FAIL: write returned %zu\n", n); + retro_spsc_free(&q); + return 1; + } + if (retro_spsc_read_avail(&q) != sizeof(buf)) + { + fprintf(stderr, "FAIL: read_avail after write\n"); + retro_spsc_free(&q); + return 1; + } + memset(readback, 0, sizeof(readback)); + n = retro_spsc_read(&q, readback, sizeof(readback)); + if (n != sizeof(buf) || memcmp(buf, readback, sizeof(buf)) != 0) + { + fprintf(stderr, "FAIL: read content mismatch\n"); + retro_spsc_free(&q); + return 1; + } + + /* peek does not advance */ + retro_spsc_write(&q, buf, 32); + if (retro_spsc_peek(&q, readback, 32) != 32) + { + fprintf(stderr, "FAIL: peek returned wrong size\n"); + retro_spsc_free(&q); + return 1; + } + if (retro_spsc_read_avail(&q) != 32) + { + fprintf(stderr, "FAIL: peek advanced read cursor\n"); + retro_spsc_free(&q); + return 1; + } + /* drain so wraparound test starts from a known state */ + retro_spsc_read(&q, readback, 32); + + /* wraparound: write 64 (forces wrap: the 64-byte and 32-byte + * write/read pairs above leave both cursors at 96 of the + * 128-byte capacity, so this write splits 32 + 32 around the + * end of the buffer) */ + for (i = 0; i < sizeof(buf); i++) + 
buf[i] = (uint8_t)(0x80 + i); + n = retro_spsc_write(&q, buf, sizeof(buf)); + if (n != sizeof(buf)) + { + fprintf(stderr, "FAIL: wraparound write\n"); + retro_spsc_free(&q); + return 1; + } + memset(readback, 0, sizeof(readback)); + n = retro_spsc_read(&q, readback, sizeof(buf)); + if (n != sizeof(buf) || memcmp(buf, readback, sizeof(buf)) != 0) + { + fprintf(stderr, "FAIL: wraparound read content mismatch\n"); + retro_spsc_free(&q); + return 1; + } + + /* clear discards unread data without reallocating buffer */ + retro_spsc_write(&q, buf, 50); + if (retro_spsc_read_avail(&q) != 50) + { + fprintf(stderr, "FAIL: pre-clear read_avail\n"); + retro_spsc_free(&q); + return 1; + } + retro_spsc_clear(&q); + if (retro_spsc_read_avail(&q) != 0 + || retro_spsc_write_avail(&q) != 128) + { + fprintf(stderr, "FAIL: clear did not reset cursors\n"); + retro_spsc_free(&q); + return 1; + } + /* queue is reusable after clear */ + retro_spsc_write(&q, buf, 16); + memset(readback, 0, sizeof(readback)); + n = retro_spsc_read(&q, readback, 16); + if (n != 16 || memcmp(buf, readback, 16) != 0) + { + fprintf(stderr, "FAIL: post-clear read mismatch\n"); + retro_spsc_free(&q); + return 1; + } + + retro_spsc_free(&q); + printf("[pass] property checks\n"); + return 0; +} + +int main(void) +{ + if (run_property_checks() != 0) + return 1; + if (run_stress() != 0) + return 1; + puts("ALL OK"); + return 0; +} diff --git a/menu/drivers/xmb.c b/menu/drivers/xmb.c index db650c2a6ef3..ff34f03b4873 100644 --- a/menu/drivers/xmb.c +++ b/menu/drivers/xmb.c @@ -653,7 +653,12 @@ const char* xmb_theme_ident(void) * uninit `flags & FADE_ACTIVE` would call * gfx_animation_kill_by_tag on stale state. * - The lazy thumbnail path resolution in xmb_render reads - * `icon_path[0]` to decide whether resolution is needed. */ + * `icon_path[0]` to decide whether resolution is needed. 
+ * + * gfx_thumbnail_init_blank() (rather than memset) is needed + * because gfx_thumbnail_t.status is now atomically-typed; a + * memset of a struct containing std::atomic warns under + * CXX_BUILD's C++ compile of this file. */ static xmb_node_t *xmb_alloc_node(void) { xmb_node_t *node = (xmb_node_t*)malloc(sizeof(*node)); @@ -664,8 +669,7 @@ static xmb_node_t *xmb_alloc_node(void) node->alpha = node->label_alpha = 0; node->zoom = node->x = node->y = 0; node->icon = node->content_icon = 0; - memset(&node->thumbnail_icon.icon, 0, - sizeof(node->thumbnail_icon.icon)); + gfx_thumbnail_init_blank(&node->thumbnail_icon.icon); node->thumbnail_icon.thumbnail_path_data.icon_path[0] = '\0'; node->fullpath = NULL; node->console_name = NULL; diff --git a/samples/gfx/gfx_thumbnail_status_atomic/Makefile b/samples/gfx/gfx_thumbnail_status_atomic/Makefile new file mode 100644 index 000000000000..f4e5edba934e --- /dev/null +++ b/samples/gfx/gfx_thumbnail_status_atomic/Makefile @@ -0,0 +1,50 @@ +TARGET := gfx_thumbnail_status_atomic_test + +# Path back to the repo root from this sample dir. The test references +# headers from libretro-common/include and (when HAVE_THREADS=1) compiles +# rthreads.c from libretro-common/rthreads. +REPO_ROOT := ../../.. +LIBRETRO_COMM_DIR := $(REPO_ROOT)/libretro-common + +# This sample mirrors gfx_thumbnail.c's atomic-status synchronisation +# pattern in isolation; the real gfx_thumbnail.c is not linked. The +# stress test exercises the producer/consumer shape through rthreads +# when HAVE_THREADS is enabled. Without HAVE_THREADS the test still +# runs the static-assertion and round-trip property checks, which +# validate the type/layout invariants that the C11 stdatomic and C++11 +# std::atomic backends rely on. 
+HAVE_THREADS ?= 1 + +SOURCES := gfx_thumbnail_status_atomic_test.c + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 \ + -I$(LIBRETRO_COMM_DIR)/include + +ifeq ($(HAVE_THREADS),1) + CFLAGS += -DHAVE_THREADS + SOURCES += $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + LDFLAGS += -lpthread + # rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on + # older glibc those live in -lrt. Harmless on newer glibc. + LDFLAGS += -lrt +endif + +OBJS := $(SOURCES:.c=.o) + +ifneq ($(SANITIZER),) + CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) + LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) +endif + +all: $(TARGET) + +%.o: %.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(TARGET): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) $(OBJS) + +.PHONY: clean diff --git a/samples/gfx/gfx_thumbnail_status_atomic/gfx_thumbnail_status_atomic_test.c b/samples/gfx/gfx_thumbnail_status_atomic/gfx_thumbnail_status_atomic_test.c new file mode 100644 index 000000000000..6d48d922115a --- /dev/null +++ b/samples/gfx/gfx_thumbnail_status_atomic/gfx_thumbnail_status_atomic_test.c @@ -0,0 +1,453 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (gfx_thumbnail_status_atomic_test.c). 
+ * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Regression test for the cross-thread thumbnail status + * synchronisation in gfx/gfx_thumbnail.c. + * + * The .status field of gfx_thumbnail_t is read by the video + * thread (in gfx_thumbnail_draw) and written by the upload- + * callback thread (in gfx_thumbnail_handle_upload): + * + * --- upload-callback thread --- + * thumbnail->texture = ...; + * thumbnail->width = ...; + * thumbnail->height = ...; + * GFX_THUMB_STATUS_STORE(&thumbnail->status, + * GFX_THUMBNAIL_STATUS_AVAILABLE); + * + * --- video thread --- + * if (GFX_THUMB_STATUS_LOAD(&thumbnail->status) + * == GFX_THUMBNAIL_STATUS_AVAILABLE) { + * uintptr_t tex = thumbnail->texture; + * unsigned w = thumbnail->width; + * unsigned h = thumbnail->height; + * ... 
+ * } + * + * On weakly-ordered SMP hardware (ARM, AArch64, PowerPC), this + * pattern requires release-store / acquire-load barriers: + * without them the video thread can observe the AVAILABLE + * status before the texture/width/height writes are visible, + * leading to garbage thumbnail rendering or div-by-zero in the + * aspect-ratio code. Pre-fix gfx_thumbnail.c had its own + * hand-rolled atomic shim covering GCC __atomic_*, MSVC + * Interlocked, and a volatile fallback; the port to + * retro_atomic.h replaces the shim with the portable API and + * stores the field as retro_atomic_int_t so the type system + * enforces the discipline. + * + * This test exercises two failure modes: + * + * 1. Compile-time: the .status field MUST be retro_atomic_int_t + * (not a plain enum), so that on the C11 stdatomic and + * C++11 std::atomic backends a future contributor cannot + * accidentally bypass the API with a plain `t->status = X` + * assignment -- those backends would refuse to compile + * such an assignment. We assert this with a static + * check that verifies sizeof(.status) == sizeof(int) + * (preserving struct layout) and that the macros + * GFX_THUMB_STATUS_STORE / GFX_THUMB_STATUS_LOAD route + * through retro_atomic_*_int (verified at preprocessing + * by token-pasting their expansion). + * + * 2. Runtime: a 1M-iteration producer/consumer stress test + * that mirrors the upload/draw shape: producer writes a + * monotonic (texture, width, height) triple before + * publishing AVAILABLE; consumer waits for AVAILABLE and + * verifies the triple is internally consistent (all three + * fields belong to the same generation). Intended to be + * run under ThreadSanitizer, which instruments every + * atomic load/store and would flag the missing-barrier + * case as a data race even on x86 TSO (where the + * hardware otherwise hides the bug). 
+ * + * If gfx_thumbnail.c amends GFX_THUMB_STATUS_STORE / + * GFX_THUMB_STATUS_LOAD or the .status field type, the + * verbatim copies in this test must follow. + */ + +#include +#include +#include + +#include + +/* Needs HAVE_THREADS from the build for the SPSC stress; fall + * through to the static-assert checks otherwise. */ +#ifdef HAVE_THREADS +#include +#endif + +/* === Verbatim mirror of gfx_thumbnail.h's relevant pieces === */ + +enum gfx_thumbnail_status +{ + GFX_THUMBNAIL_STATUS_UNKNOWN = 0, + GFX_THUMBNAIL_STATUS_PENDING, + GFX_THUMBNAIL_STATUS_AVAILABLE, + GFX_THUMBNAIL_STATUS_MISSING +}; + +/* Mirror of gfx_thumbnail_t with the same field order/types. + * Layout must match gfx_thumbnail.h's definition; size/offset + * checks below validate this. */ +typedef struct +{ + uintptr_t texture; + unsigned width; + unsigned height; + float alpha; + float delay_timer; + retro_atomic_int_t status; + uint8_t flags; +} gfx_thumbnail_t; + +/* Verbatim mirror of the macros from gfx/gfx_thumbnail.c. + * If the originals change, this block must follow. */ +#define GFX_THUMB_STATUS_STORE(ptr, val) \ + retro_atomic_store_release_int((retro_atomic_int_t*)(void*)(ptr), (int)(val)) +#define GFX_THUMB_STATUS_LOAD(ptr) \ + ((enum gfx_thumbnail_status)retro_atomic_load_acquire_int((retro_atomic_int_t*)(void*)(ptr))) + +/* Local mirror of gfx_thumbnail_init_blank. The test cannot + * include gfx_thumbnail.h directly because it deliberately + * shadows that header's type definitions to verify layout + * invariants -- so we mirror the helper as well. Field-by- + * field zero-init is required (rather than memset) when the + * test is compiled as C++ via CXX_BUILD: the .status field is + * std::atomic, making gfx_thumbnail_t non-trivially- + * copyable, and memset of such a struct warns + * -Wclass-memaccess. 
*/ +static void gfx_thumbnail_test_init_blank(gfx_thumbnail_t *t) +{ + t->texture = 0; + t->width = 0; + t->height = 0; + t->alpha = 0.0f; + t->delay_timer = 0.0f; + retro_atomic_int_init(&t->status, GFX_THUMBNAIL_STATUS_UNKNOWN); + t->flags = 0; +} + +/* === Compile-time invariants === */ + +/* Size: the atomic-typed status field must be the same size as + * a plain int on every supported backend, so swapping it in for + * `enum gfx_thumbnail_status` doesn't change the gfx_thumbnail_t + * layout. */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +_Static_assert(sizeof(((gfx_thumbnail_t*)0)->status) == sizeof(int), + "gfx_thumbnail_t.status must be int-sized for ABI compatibility"); +#elif defined(__cplusplus) && __cplusplus >= 201103L +static_assert(sizeof(((gfx_thumbnail_t*)0)->status) == sizeof(int), + "gfx_thumbnail_t.status must be int-sized for ABI compatibility"); +#else +/* Pre-C11 fallback: array-size negative-on-failure trick. */ +typedef char _gfx_thumb_status_size_check[ + (sizeof(((gfx_thumbnail_t*)0)->status) == sizeof(int)) ? 1 : -1]; +#endif + +/* Backend sanity: retro_atomic.h must be selected and lock-free. + * The volatile fallback is correct only on x86 TSO / single-core + * targets and would not provide barriers on weak-memory hardware, + * which is exactly what the upload/draw pairing relies on. */ +#if !defined(HAVE_RETRO_ATOMIC) +# error "retro_atomic.h: HAVE_RETRO_ATOMIC not defined" +#endif + +/* === Runtime SPSC stress (HAVE_THREADS only) === */ + +#ifdef HAVE_THREADS + +/* Number of producer/consumer iterations. 1M is enough to flush + * out a missing-barrier bug under TSan within a few seconds; on + * real weak-memory hardware (qemu-aarch64) it surfaces faster. + * Adjust upward if the bug becomes harder to reproduce. */ +#define STRESS_ITERS 1000000 + +/* SPSC handshake state. 
+ * + * The real gfx_thumbnail upload/draw pairing is single-publication: + * the upload-callback thread writes the texture once and publishes + * AVAILABLE; the video thread observes AVAILABLE and reads the + * triple every frame thereafter. There is no high-frequency + * back-and-forth. + * + * To mirror that shape under stress we need a per-generation + * handshake so the texture / width / height fields are NOT being + * rewritten while the consumer is reading them. Without that, + * a tight loop where the producer rewrites the data while the + * consumer reads it would generate torn reads even with correct + * barriers, because the data is mutating concurrently -- a + * property of the test design, not of the synchronisation + * primitives. + * + * We pair two atomic counters for the handshake: + * - `produced`: producer increments it after staging a generation, + * via release-store so the staged data is visible to any thread + * that observes the new value. Consumer waits for it to exceed + * the last-seen value via acquire-load. This is the actual + * publish edge the consumer keys on (rather than a status + * transition, which would require the consumer to observe a + * transient UNKNOWN that the producer might overwrite faster + * than the consumer can poll). + * - `consumed`: consumer increments it after reading; producer + * waits for `consumed == produced` before staging the next + * generation. + * + * The .status field (the actual subject of the test) is updated + * on the same pattern as in gfx_thumbnail.c -- each generation + * stores AVAILABLE through GFX_THUMB_STATUS_STORE before the + * counter bump, and the consumer reads it through + * GFX_THUMB_STATUS_LOAD as part of the per-generation read. This + * exercises the macros under test even though the counter is what + * gates the handshake. */ +typedef struct +{ + gfx_thumbnail_t thumb; + /* Producer's generation counter. 
Bumped via release-store + * after staging the data; consumer reads via acquire-load. */ + retro_atomic_int_t produced; + /* Consumer's acknowledgement counter. Bumped via release- + * store after reading; producer reads via acquire-load. */ + retro_atomic_int_t consumed; + /* Counter of mismatches the consumer observed. Read by main + * thread after the join, so it doesn't need to be atomic. */ + unsigned long mismatches; +} stress_state_t; + +static void producer_thread(void *arg) +{ + stress_state_t *s = (stress_state_t *)arg; + int i; + + for (i = 1; i <= STRESS_ITERS; i++) + { + /* Wait for the consumer to acknowledge the previous + * generation. On the first iteration this is true + * immediately (consumed starts at 0). */ + while (retro_atomic_load_acquire_int(&s->consumed) != (i - 1)) + ; /* spin */ + + /* Stage the generation's data (no ordering required + * between these). */ + s->thumb.texture = (uintptr_t)i; + s->thumb.width = (unsigned)i; + s->thumb.height = (unsigned)i; + + /* Update status via the macros under test. Same shape as + * the production gfx_thumbnail.c upload path: + * STATUS_STORE(AVAILABLE) after staging the texture, with + * release-store semantics that fence the earlier writes. */ + GFX_THUMB_STATUS_STORE(&s->thumb.status, + GFX_THUMBNAIL_STATUS_AVAILABLE); + + /* Publish the new generation. Release-store so the staged + * data and the AVAILABLE status are visible before the + * consumer observes the new generation. */ + retro_atomic_store_release_int(&s->produced, i); + } + + /* Wait for the final ack, then signal shutdown via a + * sentinel value `produced = STRESS_ITERS + 1`, paired with + * a final STATUS_STORE so the consumer exercises the macro + * one last time. 
*/ + while (retro_atomic_load_acquire_int(&s->consumed) != STRESS_ITERS) + ; /* drain */ + GFX_THUMB_STATUS_STORE(&s->thumb.status, + GFX_THUMBNAIL_STATUS_MISSING); + retro_atomic_store_release_int(&s->produced, STRESS_ITERS + 1); +} + +static void consumer_thread(void *arg) +{ + stress_state_t *s = (stress_state_t *)arg; + unsigned long mismatches = 0; + int expected = 1; + + /* Per-generation loop. Wait for produced >= expected, then + * read the staged data and the status. The status read + * goes through GFX_THUMB_STATUS_LOAD (the macro under test); + * it should always observe AVAILABLE for the current + * generation. Both reads are acquire-loads; the producer's + * release-stores ensure ordering between staging and + * publication. */ + for (;;) + { + int prod; + uintptr_t tex; + unsigned w, h; + enum gfx_thumbnail_status st; + + /* Wait for the producer to publish the next generation + * (or the shutdown sentinel STRESS_ITERS + 1). */ + do { + prod = retro_atomic_load_acquire_int(&s->produced); + } while (prod < expected); + + if (prod == STRESS_ITERS + 1) + break; + + /* Read the staged data and status. The acquire-load on + * `produced` fences these reads; if any field disagrees + * with the expected generation, the fence failed. We + * additionally exercise GFX_THUMB_STATUS_LOAD here -- it + * should always observe AVAILABLE for the current + * generation. */ + tex = s->thumb.texture; + w = s->thumb.width; + h = s->thumb.height; + st = GFX_THUMB_STATUS_LOAD(&s->thumb.status); + + if (tex != (uintptr_t)expected + || w != (unsigned)expected + || h != (unsigned)expected + || st != GFX_THUMBNAIL_STATUS_AVAILABLE) + mismatches++; + + /* Ack: producer waits on this before staging the next + * generation. 
*/ + retro_atomic_store_release_int(&s->consumed, expected); + expected++; + } + + s->mismatches = mismatches; +} + +static int run_stress(void) +{ + stress_state_t s; + sthread_t *prod; + sthread_t *cons; + + /* Field-by-field init, not memset, to silence + * -Wclass-memaccess under CXX_BUILD where .status is + * std::atomic and the struct is non-trivially-copyable. */ + gfx_thumbnail_test_init_blank(&s.thumb); + retro_atomic_int_init(&s.produced, 0); + retro_atomic_int_init(&s.consumed, 0); + s.mismatches = 0; + + prod = sthread_create(producer_thread, &s); + if (!prod) + { + fprintf(stderr, "FAIL: sthread_create(producer)\n"); + return 1; + } + cons = sthread_create(consumer_thread, &s); + if (!cons) + { + fprintf(stderr, "FAIL: sthread_create(consumer)\n"); + sthread_join(prod); + return 1; + } + + sthread_join(prod); + sthread_join(cons); + + if (s.mismatches != 0) + { + fprintf(stderr, + "FAIL: SPSC stress observed %lu mismatched reads " + "out of %d iterations\n", s.mismatches, STRESS_ITERS); + return 1; + } + + printf("[pass] SPSC stress: %d iterations, 0 mismatches\n", + STRESS_ITERS); + return 0; +} + +#else /* HAVE_THREADS */ + +static int run_stress(void) +{ + /* Compile-time-only mode: skip stress, but the static + * assertions above still validated the type/layout invariants. */ + printf("[skip] HAVE_THREADS not defined; static checks only\n"); + return 0; +} + +#endif /* HAVE_THREADS */ + +/* === Single-threaded property checks === */ + +static int run_property_checks(void) +{ + gfx_thumbnail_t t; + /* Field-by-field init, not memset, to silence + * -Wclass-memaccess under CXX_BUILD. */ + gfx_thumbnail_test_init_blank(&t); + + /* Round-trip every enum value through STORE/LOAD. 
*/ + { + const enum gfx_thumbnail_status vals[] = { + GFX_THUMBNAIL_STATUS_UNKNOWN, + GFX_THUMBNAIL_STATUS_PENDING, + GFX_THUMBNAIL_STATUS_AVAILABLE, + GFX_THUMBNAIL_STATUS_MISSING, + }; + size_t i; + for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) + { + GFX_THUMB_STATUS_STORE(&t.status, vals[i]); + if (GFX_THUMB_STATUS_LOAD(&t.status) != vals[i]) + { + fprintf(stderr, + "FAIL: STORE/LOAD round-trip on value %d\n", + (int)vals[i]); + return 1; + } + } + } + + /* The acquire-load must read what the release-store wrote + * (single-threaded, so no ordering question, just contract + * verification). */ + GFX_THUMB_STATUS_STORE(&t.status, GFX_THUMBNAIL_STATUS_AVAILABLE); + if (GFX_THUMB_STATUS_LOAD(&t.status) != GFX_THUMBNAIL_STATUS_AVAILABLE) + { + fprintf(stderr, "FAIL: AVAILABLE not observable after store\n"); + return 1; + } + + printf("[pass] STORE/LOAD round-trip on all four status values\n"); + return 0; +} + +int main(void) +{ + printf("retro_atomic backend: %s\n", RETRO_ATOMIC_BACKEND_NAME); +#ifdef RETRO_ATOMIC_LOCK_FREE + printf("retro_atomic lock-free: yes\n"); +#else + printf("retro_atomic lock-free: NO (volatile fallback)\n"); +#endif + + if (run_property_checks() != 0) + return 1; + if (run_stress() != 0) + return 1; + + puts("ALL OK"); + return 0; +} diff --git a/samples/gfx/gfx_widgets_msg_queue_race/Makefile b/samples/gfx/gfx_widgets_msg_queue_race/Makefile new file mode 100644 index 000000000000..b8f62bad58b3 --- /dev/null +++ b/samples/gfx/gfx_widgets_msg_queue_race/Makefile @@ -0,0 +1,52 @@ +TARGET := gfx_widgets_msg_queue_race_test + +# Path back to the repo root from this sample dir. The test references +# headers from libretro-common/include and links against +# libretro-common/queues/fifo_queue.c (the FIFO this regression test +# exercises) and libretro-common/rthreads/rthreads.c (when HAVE_THREADS +# is enabled). +REPO_ROOT := ../../.. 
+LIBRETRO_COMM_DIR := $(REPO_ROOT)/libretro-common + +# This sample mirrors the post-fix locking discipline around +# dispgfx_widget_t::msg_queue in gfx/gfx_widgets.c. The test uses +# fifo_buffer_t directly (compiled from libretro-common) but does NOT +# pull in gfx_widgets.c itself -- the fix is a locking protocol, +# not a code change to the FIFO primitive. +HAVE_THREADS ?= 1 + +SOURCES := \ + gfx_widgets_msg_queue_race_test.c \ + $(LIBRETRO_COMM_DIR)/queues/fifo_queue.c + +CFLAGS += -Wall -pedantic -std=gnu99 -g -O0 \ + -I$(LIBRETRO_COMM_DIR)/include + +ifeq ($(HAVE_THREADS),1) + CFLAGS += -DHAVE_THREADS + SOURCES += $(LIBRETRO_COMM_DIR)/rthreads/rthreads.c + LDFLAGS += -lpthread + # rthreads.c uses clock_gettime + CLOCK_REALTIME on Linux glibc; on + # older glibc those live in -lrt. Harmless on newer glibc. + LDFLAGS += -lrt +endif + +OBJS := $(SOURCES:.c=.o) + +ifneq ($(SANITIZER),) + CFLAGS := -fsanitize=$(SANITIZER) -fno-omit-frame-pointer $(CFLAGS) + LDFLAGS := -fsanitize=$(SANITIZER) $(LDFLAGS) +endif + +all: $(TARGET) + +%.o: %.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(TARGET): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +clean: + rm -f $(TARGET) $(OBJS) + +.PHONY: clean diff --git a/samples/gfx/gfx_widgets_msg_queue_race/gfx_widgets_msg_queue_race_test.c b/samples/gfx/gfx_widgets_msg_queue_race/gfx_widgets_msg_queue_race_test.c new file mode 100644 index 000000000000..30a307d15be1 --- /dev/null +++ b/samples/gfx/gfx_widgets_msg_queue_race/gfx_widgets_msg_queue_race_test.c @@ -0,0 +1,497 @@ +/* Copyright (C) 2010-2026 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (gfx_widgets_msg_queue_race_test.c). 
+ * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* Regression test for the producer-producer race fix on + * dispgfx_widget_t::msg_queue in gfx/gfx_widgets.c. + * + * Pre-fix: gfx_widgets_msg_queue_push called fifo_write on + * p_dispwidget->msg_queue without holding any lock. The + * function is reachable from three callers: + * + * 1. runloop.c:runloop_msg_queue_push (40+ files transitively + * reach this; many run on threaded task workers via + * libretro-common/queues/task_queue.c). Holds + * RUNLOOP_MSG_QUEUE_LOCK across the call. + * 2. runloop.c:runloop_task_msg_queue_push. Same lock. + * 3. gfx/video_driver.c:video_driver_frame. Acquires + * RUNLOOP_MSG_QUEUE_LOCK to drain runloop_st->msg_queue, + * RELEASES it, then calls gfx_widgets_msg_queue_push + * WITHOUT the lock. + * + * Path 3 runs on the main thread; path 1 can run on any thread. 
+ * RUNLOOP_MSG_QUEUE_LOCK protects runloop_st->msg_queue, NOT + * p_dispwidget->msg_queue -- the lock name reflects the runloop + * struct it was named after. So path 3's fifo_write (no lock) + * can race with path 1's fifo_write (lock held but irrelevant + * to path 3). + * + * The consumer (gfx_widgets_iterate -> fifo_read) ran inside + * RUNLOOP_MSG_QUEUE_LOCK at runloop.c:6048 and inside + * current_msgs_lock at gfx_widgets.c:973 -- but neither lock + * is taken on path 3, so the consumer's fifo_read could observe + * a half-updated `end` cursor from path 3's fifo_write. + * + * Outcome of a race: torn `end` cursor publishes a bad pointer + * in the disp_widget_msg_t* slots. fifo_read returns garbage, + * the consumer dereferences it as a disp_widget_msg_t* in + * gfx_widgets_iterate, and we get a use-after-free / segfault. + * On x86 TSO the bug is masked most of the time by hardware + * ordering; on Win-on-ARM / Apple Silicon (AArch64) it surfaces + * more often. + * + * Fix: a dedicated msg_queue_lock on dispgfx_widget_t, acquired + * inside gfx_widgets_msg_queue_push around the fifo_write + * (with an avail re-check, since fifo_write does no bounds + * checking) and inside gfx_widgets_iterate around the fifo_read. + * msg_queue_lock is the inner lock; current_msgs_lock is the + * outer lock when both are held. + * + * This test exercises the post-fix shape: + * + * - Two producer threads each call a stripped-down version of + * gfx_widgets_msg_queue_push that mirrors the post-fix + * locking discipline (msg_queue_lock around fifo_write + + * avail re-check + rollback on race-loss). + * + * - One consumer thread calls a stripped-down version of the + * gfx_widgets_iterate FIFO-handling block: take + * msg_queue_lock, fifo_read, release. Then validate the + * pointer it got is one we actually pushed. + * + * - Run for STRESS_ITERS iterations under ThreadSanitizer. 
+ * halt_on_error=1 means the first race aborts the run
+ * non-zero, which the workflow gates on. If the locking
+ * in the production code is removed or weakened (e.g.
+ * someone drops the lock around fifo_write thinking the
+ * outer FIFO_WRITE_AVAIL check is enough), TSan will see
+ * concurrent writes to fifo_buffer_t::end and flag the race.
+ *
+ * The test does NOT mirror the full gfx_widgets_msg_queue_push
+ * function -- the message-widget allocation, font measurement,
+ * and animation push are irrelevant to the race. What matters
+ * is the lock protocol around the fifo_buffer_t access, which
+ * is what we exercise.
+ *
+ * If gfx_widgets.c amends the locking around msg_queue, the
+ * verbatim copies in this test must follow. Convention used by
+ * gfx_thumbnail_status_atomic_test, vulkan_extension_count_test,
+ * and others under samples/.
+ */
+
+#include <stdio.h>   /* fprintf, printf */
+#include <stdlib.h>  /* malloc, free */
+#include <string.h>
+#include <stdint.h>  /* uint32_t */
+
+#include <queues/fifo_queue.h>  /* fifo_buffer_t, fifo_read/write, FIFO_*_AVAIL_NONPTR */
+#include <retro_atomic.h>       /* retro_atomic_int_t -- NOTE(review): reconstructed include name; verify against repo */
+
+#ifdef HAVE_THREADS
+#include <rthreads/rthreads.h>  /* slock_t, sthread_t */
+#endif
+
+/* MSG_QUEUE_PENDING_MAX in gfx_widgets.c. Mirrored here for
+ * realism, but the test works at any reasonable size. */
+#define MSG_QUEUE_PENDING_MAX 32
+
+/* Stand-in for disp_widget_msg_t. The real struct has 30+
+ * fields; the FIFO only stores the POINTER, not the contents.
+ * For race-checking we just need allocations whose addresses
+ * we can validate. */
+typedef struct test_msg
+{
+   uint32_t generation;
+   uint32_t producer_id;
+} test_msg_t;
+
+/* Mirror of dispgfx_widget_t's msg_queue + lock pieces. The
+ * real struct has many more fields; the test only models the
+ * synchronisation invariant. */
+typedef struct
+{
+   fifo_buffer_t msg_queue;
+#ifdef HAVE_THREADS
+   slock_t *msg_queue_lock;
+#endif
+} dispgfx_widget_test_t;
+
+/* === Verbatim mirror of gfx_widgets_msg_queue_push's FIFO
+ * interaction (post-fix). Strips out the widget-allocation,
+ * font-measurement, and animation-push work that is not
+ * part of the race; what remains is the lock protocol.
+ * If gfx_widgets.c amends the locking, this block must
+ * follow. === */
+static int test_msg_queue_push(
+      dispgfx_widget_test_t *p_dispwidget,
+      test_msg_t *msg_widget)
+{
+   /* No outer FIFO_WRITE_AVAIL fast-path: reading the FIFO
+    * cursors outside msg_queue_lock would race with concurrent
+    * producer fifo_writes (TSan-detectable). The locked
+    * avail re-check below is the correctness gate. */
+#ifdef HAVE_THREADS
+   bool fifo_full;
+   slock_lock(p_dispwidget->msg_queue_lock);
+   fifo_full = (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue)
+         < sizeof(msg_widget)); /* sizeof(msg_widget): pointer size -- the FIFO stores pointers, not structs */
+   if (!fifo_full)
+      fifo_write(&p_dispwidget->msg_queue,
+            &msg_widget, sizeof(msg_widget));
+   slock_unlock(p_dispwidget->msg_queue_lock);
+
+   if (fifo_full)
+      return 0; /* race-loss or full: caller's responsibility to free */
+   return 1;
+#else
+   if (FIFO_WRITE_AVAIL_NONPTR(p_dispwidget->msg_queue) < sizeof(msg_widget))
+      return 0;
+   fifo_write(&p_dispwidget->msg_queue,
+         &msg_widget, sizeof(msg_widget));
+   return 1;
+#endif
+}
+
+/* === Verbatim mirror of gfx_widgets_iterate's FIFO read
+ * block (post-fix). Strips out the current_msgs[] insertion
+ * logic; what remains is the lock protocol. If gfx_widgets.c
+ * amends the locking, this block must follow. === */
+static test_msg_t *test_msg_queue_consume(
+      dispgfx_widget_test_t *p_dispwidget)
+{
+   test_msg_t *msg_widget = NULL;
+
+   /* No outer FIFO_READ_AVAIL fast-path: reading the FIFO cursors
+    * outside msg_queue_lock would race with concurrent producer
+    * fifo_writes. The locked re-check below is the gate. */
+#ifdef HAVE_THREADS
+   slock_lock(p_dispwidget->msg_queue_lock);
+#endif
+   if (FIFO_READ_AVAIL_NONPTR(p_dispwidget->msg_queue) >= sizeof(msg_widget))
+      fifo_read(&p_dispwidget->msg_queue,
+            &msg_widget, sizeof(msg_widget));
+#ifdef HAVE_THREADS
+   slock_unlock(p_dispwidget->msg_queue_lock);
+#endif
+   return msg_widget; /* NULL when the FIFO was empty */
+}
+/* === end verbatim copies === */
+
+#ifdef HAVE_THREADS
+
+/* Number of pushes per producer thread. 500K * 2 producers gives
+ * 1M total push attempts. Enough to flush out a missing-lock
+ * regression under TSan within a few seconds. TSan halt_on_error=1
+ * means the first observed race aborts the run, so the iteration
+ * count primarily affects how reliably we trigger the bug if it
+ * regresses, not the cost of the success case. */
+#define STRESS_ITERS 500000
+
+typedef struct
+{
+   dispgfx_widget_test_t *widget;
+   uint32_t producer_id;
+   uint32_t pushed_ok;   /* pushes accepted by test_msg_queue_push */
+   uint32_t pushed_full; /* pushes rejected (FIFO full at the locked re-check) */
+} producer_ctx_t;
+
+typedef struct
+{
+   dispgfx_widget_test_t *widget;
+   uint32_t consumed;
+   int failed; /* set non-zero on a bogus pointer */
+   retro_atomic_int_t *stop; /* producers-done flag, written by run_stress */
+} consumer_ctx_t;
+
+static void producer_thread(void *arg)
+{
+   producer_ctx_t *ctx = (producer_ctx_t*)arg;
+   uint32_t i;
+   for (i = 0; i < STRESS_ITERS; i++)
+   {
+      test_msg_t *m = (test_msg_t*)malloc(sizeof(*m));
+      if (!m)
+         continue; /* OOM: skip; cosmetic-error path */
+      m->generation = i;
+      m->producer_id = ctx->producer_id;
+
+      if (test_msg_queue_push(ctx->widget, m))
+         ctx->pushed_ok++;
+      else
+      {
+         /* Lost the race or FIFO full. Free our allocation. */
+         free(m);
+         ctx->pushed_full++;
+      }
+   }
+}
+
+static void consumer_thread(void *arg)
+{
+   consumer_ctx_t *ctx = (consumer_ctx_t*)arg;
+   while (!retro_atomic_load_acquire_int(ctx->stop))
+   {
+      test_msg_t *m = test_msg_queue_consume(ctx->widget);
+      if (m)
+      {
+         /* Validate that what we got back is plausibly one of
+          * the things a producer pushed. generation < STRESS_ITERS,
+          * producer_id is one of the small set we spawned. A
+          * torn pointer from a missing-lock race would typically
+          * fail this check (garbage memory), but the primary
+          * regression signal here is TSan flagging the
+          * concurrent unsynchronised access on the FIFO cursors,
+          * not this content check. This is belt-and-suspenders.
+          */
+         if (m->generation >= STRESS_ITERS || m->producer_id >= 16) /* 16: loose bound -- only 2 producers are spawned */
+         {
+            ctx->failed = 1;
+            free(m);
+            return; /* bail without draining; run_stress checks .failed before counts */
+         }
+         ctx->consumed++;
+         free(m);
+      }
+   }
+   /* Drain anything left behind after producers stopped. */
+   for (;;)
+   {
+      test_msg_t *m = test_msg_queue_consume(ctx->widget);
+      if (!m)
+         break;
+      ctx->consumed++;
+      free(m);
+   }
+}
+
+static int run_stress(void)
+{
+   dispgfx_widget_test_t widget;
+   producer_ctx_t prod_ctx[2];
+   consumer_ctx_t cons_ctx;
+   sthread_t *prod_threads[2];
+   sthread_t *cons_thread;
+   retro_atomic_int_t stop;
+   int i;
+   uint32_t total_pushed = 0;
+
+   retro_atomic_int_init(&stop, 0);
+
+   if (!fifo_initialize(&widget.msg_queue,
+         MSG_QUEUE_PENDING_MAX * sizeof(test_msg_t*)))
+   {
+      fprintf(stderr, "fifo_initialize failed\n");
+      return 1;
+   }
+   widget.msg_queue_lock = slock_new();
+   if (!widget.msg_queue_lock)
+   {
+      fprintf(stderr, "slock_new failed\n");
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+
+   for (i = 0; i < 2; i++)
+   {
+      prod_ctx[i].widget = &widget;
+      prod_ctx[i].producer_id = (uint32_t)i;
+      prod_ctx[i].pushed_ok = 0;
+      prod_ctx[i].pushed_full = 0;
+   }
+   cons_ctx.widget = &widget;
+   cons_ctx.consumed = 0;
+   cons_ctx.failed = 0;
+   cons_ctx.stop = &stop;
+
+   cons_thread = sthread_create(consumer_thread, &cons_ctx);
+   prod_threads[0] = sthread_create(producer_thread, &prod_ctx[0]);
+   prod_threads[1] = sthread_create(producer_thread, &prod_ctx[1]);
+
+   if (!cons_thread || !prod_threads[0] || !prod_threads[1])
+   {
+      fprintf(stderr, "sthread_create failed\n");
+      return 1; /* NOTE(review): leaks lock/FIFO and abandons any threads already started -- acceptable for a failing test run, but consider cleanup */
+   }
+
+   sthread_join(prod_threads[0]);
+   sthread_join(prod_threads[1]);
+   retro_atomic_store_release_int(&stop, 1); /* only set after BOTH producers joined, so the consumer's post-stop drain sees every push */
+   sthread_join(cons_thread);
+
+   /* Drain anything still in the FIFO (consumer-side, single-
+    * threaded by this point so no lock needed -- but using the
+    * same helper for consistency).
+    */
+   for (;;)
+   {
+      test_msg_t *m = test_msg_queue_consume(&widget);
+      if (!m)
+         break;
+      cons_ctx.consumed++;
+      free(m);
+   }
+
+   total_pushed = prod_ctx[0].pushed_ok + prod_ctx[1].pushed_ok;
+
+   if (cons_ctx.failed)
+   {
+      fprintf(stderr, "FAIL: consumer saw torn / bogus pointer\n");
+      slock_free(widget.msg_queue_lock);
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+
+   if (cons_ctx.consumed != total_pushed)
+   {
+      fprintf(stderr,
+            "FAIL: consumed %u != pushed %u (P0=%u/%u, P1=%u/%u)\n",
+            cons_ctx.consumed, total_pushed,
+            prod_ctx[0].pushed_ok, prod_ctx[0].pushed_full,
+            prod_ctx[1].pushed_ok, prod_ctx[1].pushed_full);
+      slock_free(widget.msg_queue_lock);
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+
+   printf("[pass] stress: %u pushed (%u+%u), %u rejected on full, "
+         "%u consumed, no torn pointers\n",
+         total_pushed, prod_ctx[0].pushed_ok, prod_ctx[1].pushed_ok,
+         prod_ctx[0].pushed_full + prod_ctx[1].pushed_full,
+         cons_ctx.consumed);
+
+   slock_free(widget.msg_queue_lock);
+   fifo_deinitialize(&widget.msg_queue);
+   return 0;
+}
+
+#endif /* HAVE_THREADS */
+
+/* Property-style smoke test for the single-threaded path:
+ * push fills the FIFO, push past full returns 0, consume drains
+ * everything in order. */
+static int run_smoke(void)
+{
+   dispgfx_widget_test_t widget;
+   int i;
+   int pushed = 0;
+   int consumed = 0;
+   test_msg_t *messages[MSG_QUEUE_PENDING_MAX + 4];
+
+   if (!fifo_initialize(&widget.msg_queue,
+         MSG_QUEUE_PENDING_MAX * sizeof(test_msg_t*)))
+   {
+      fprintf(stderr, "fifo_initialize failed\n");
+      return 1;
+   }
+#ifdef HAVE_THREADS
+   widget.msg_queue_lock = slock_new();
+   if (!widget.msg_queue_lock)
+   {
+      fprintf(stderr, "slock_new failed\n");
+      fifo_deinitialize(&widget.msg_queue);
+      return 1;
+   }
+#endif
+
+   /* Push more than the FIFO can hold; later pushes should
+    * return 0 (rejected on full).
+    */
+   for (i = 0; i < MSG_QUEUE_PENDING_MAX + 4; i++)
+   {
+      messages[i] = (test_msg_t*)malloc(sizeof(test_msg_t));
+      if (!messages[i])
+      {
+         fprintf(stderr, "malloc failed\n");
+         return 1; /* NOTE(review): earlier allocations + lock/FIFO leak here -- acceptable in a failing test */
+      }
+      messages[i]->generation = i;
+      messages[i]->producer_id = 0;
+      if (test_msg_queue_push(&widget, messages[i]))
+         pushed++;
+      else
+      {
+         /* push refused: free the allocation we tried to enqueue */
+         free(messages[i]);
+         messages[i] = NULL;
+      }
+   }
+
+   if (pushed == 0 || pushed > MSG_QUEUE_PENDING_MAX) /* range check, not exact: FIFO usable capacity may be size-1 bytes depending on fifo_queue internals -- TODO confirm */
+   {
+      fprintf(stderr, "FAIL: smoke pushed=%d; expected 1..%d\n",
+            pushed, MSG_QUEUE_PENDING_MAX);
+      return 1;
+   }
+
+   /* Drain. Order should be FIFO. */
+   for (i = 0; i < pushed; i++)
+   {
+      test_msg_t *m = test_msg_queue_consume(&widget);
+      if (!m)
+      {
+         fprintf(stderr, "FAIL: smoke consume returned NULL at i=%d\n", i);
+         return 1;
+      }
+      if (m->generation != (uint32_t)i)
+      {
+         fprintf(stderr,
+               "FAIL: smoke order: expected generation %d, got %u\n",
+               i, m->generation);
+         free(m);
+         return 1;
+      }
+      free(m);
+      consumed++;
+   }
+
+   if (test_msg_queue_consume(&widget))
+   {
+      fprintf(stderr, "FAIL: smoke FIFO non-empty after drain\n");
+      return 1;
+   }
+
+   /* Free any allocations whose push was refused. */
+   for (i = pushed; i < MSG_QUEUE_PENDING_MAX + 4; i++)
+   {
+      if (messages[i])
+         free(messages[i]);
+   }
+
+   printf("[pass] smoke: pushed=%d consumed=%d (FIFO order verified)\n",
+         pushed, consumed);
+
+#ifdef HAVE_THREADS
+   slock_free(widget.msg_queue_lock);
+#endif
+   fifo_deinitialize(&widget.msg_queue);
+   return 0;
+}
+
+int main(void) /* smoke first (deterministic), then the TSan-targeted stress */
+{
+   if (run_smoke())
+      return 1;
+#ifdef HAVE_THREADS
+   if (run_stress())
+      return 1;
+#else
+   printf("[skip] stress: HAVE_THREADS not enabled\n");
+#endif
+   printf("ALL OK\n");
+   return 0;
+}