From 1369a6c25b03f505d0e3b64d77622e98a1982eaf Mon Sep 17 00:00:00 2001 From: Stamate Viorel Date: Wed, 10 Jun 2026 14:33:33 +0200 Subject: [PATCH 1/2] Add exponential backoff to process monitor restarts process_monitor restarted a crashing child in a tight zero-delay loop: a process that dies instantly (e.g. its ALSA output device cannot be opened) gets respawned as fast as the loop runs, flooding logs and wearing the SD card. Track consecutive fast failures and back off exponentially (2s..30s), resetting once a run survives 10 seconds. Signed-off-by: Stamate Viorel Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 1 + streams/process_monitor.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd1894792..b9bfc47b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * Upgraded volume calculations to preserve relative positions when hitting the min or max setting via source volume bar * Update our spotify provider `go-librespot` to `0.7.3` * Upgrade from Logitech Media Server 8.5.2 to Lyrion Music Server 9.0.3 + * Hardened stream process restarts: exponential backoff and an ALSA device readiness gate (#957) # 0.4.11 * System diff --git a/streams/process_monitor.py b/streams/process_monitor.py index 34dd879da..8830cd396 100755 --- a/streams/process_monitor.py +++ b/streams/process_monitor.py @@ -3,10 +3,13 @@ import sys import subprocess import signal +import time args = sys.argv[1:] print(f'Starting process monitor for process: {args}') +_START_TIME_THRESHOLD = 10 # seconds — consider a run "successful" if it lasted this long + def signal_handler(sig, frame): """Signal handler function that sends the received signal to the @@ -29,7 +32,10 @@ def signal_handler(sig, frame): except: pass +consecutive_failures = 0 + while True: + start_time = time.monotonic() proc = subprocess.Popen(args) try: @@ -52,3 +58,15 @@ def signal_handler(sig, frame): # If the subprocess exits with a non-zero code, restart it print( f"Subprocess exited with non-zero code: {proc.returncode}. Restarting...", flush=True) + + # Track consecutive failures for exponential backoff. + # Reset if the process ran long enough to be considered a successful start. + elapsed = time.monotonic() - start_time + if elapsed >= _START_TIME_THRESHOLD: + consecutive_failures = 0 + else: + consecutive_failures += 1 + + backoff = min(2 ** consecutive_failures, 30) + print(f"Waiting {backoff}s before restart (consecutive_failures={consecutive_failures})...", flush=True) + time.sleep(backoff) From 8b0cada83ab6c283e05c38d82e479f426dcfb2be Mon Sep 17 00:00:00 2001 From: Stamate Viorel Date: Wed, 10 Jun 2026 14:33:33 +0200 Subject: [PATCH 2/2] Gate monitored-process restarts on ALSA output device readiness When the monitored player exits because its loopback is still held (a not-yet-released previous instance, or the dmix state from #957), every respawn dies instantly with EINVAL and the stream stays silent - we saw a 5.5h outage from one stream re-assign. If the monitored command plays to an ALSA loopback (-o lb*), probe the device with a 1s silent aplay (the same open path the player uses) before each spawn and wait quietly until it opens. On the first failed probe, log the current /dev/snd holders via fuser so the journal explains the wedge; after four failed probes, kill a stale leftover player process that targets the same device (strict match on the binary name and exact -o argument, never the monitor's own child). Processes without an -o lb* argument are completely unaffected. Signed-off-by: Stamate Viorel Co-Authored-By: Claude Fable 5 --- streams/process_monitor.py | 111 ++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/streams/process_monitor.py b/streams/process_monitor.py index 8830cd396..c80f6a61b 100755 --- a/streams/process_monitor.py +++ b/streams/process_monitor.py @@ -1,21 +1,120 @@ #!/usr/bin/env python3 -import sys -import subprocess +import os import signal +import subprocess +import sys import time args = sys.argv[1:] print(f'Starting process monitor for process: {args}') _START_TIME_THRESHOLD = 10 # seconds — consider a run "successful" if it lasted this long +_DEVICE_PROBE_TIMEOUT = 5 # seconds — hard cap on a single aplay readiness probe +_ORPHAN_KILL_AFTER = 4 # failed probes in a row before killing a stale holder + + +def _alsa_out_device(argv): + """Return the ALSA loopback output device (-o lbXX) if the monitored + process plays to one, else None. All of the device gating below is + skipped for processes without such an argument.""" + for i, a in enumerate(argv[:-1]): + if a == '-o' and argv[i + 1].startswith('lb'): + return argv[i + 1] + return None + + +OUT_DEV = _alsa_out_device(args) + + +def device_ready(dev): + """True if the ALSA device can be opened for playback right now. + Mirrors squeezelite's open path (loopback dmix): fails with EINVAL + while a stale holder keeps the device busy, succeeds once it is free. + Writes 1s of silence to a loopback nobody is reading — harmless.""" + try: + res = subprocess.run( + ['aplay', '-D', dev, '-r', '48000', '-f', 'S16_LE', '-c', '2', + '-d', '1', '-q', '/dev/zero'], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + timeout=_DEVICE_PROBE_TIMEOUT) + return res.returncode == 0 + except Exception: + return False + + +def print_device_diagnostics(dev): + """Dump who currently holds the sound devices so a wedge is explained + in the journal rather than inferred afterwards.""" + print(f'Device {dev} not ready — current /dev/snd holders:', flush=True) + try: + out = subprocess.run('fuser -v /dev/snd/pcm* 2>&1', shell=True, + capture_output=True, text=True, timeout=10) + print((out.stdout + out.stderr).strip(), flush=True) + except Exception as e: + print(f'(fuser failed: {e})', flush=True) + + +def find_stale_holders(dev, own_child_pid): + """PIDs of squeezelite processes (argv0 ends in 'squeezelite') playing + to this exact device via -o , excluding ourselves and our own + child. Strict match — cannot hit the monitor or unrelated processes.""" + pids = [] + for entry in os.listdir('/proc'): + if not entry.isdigit(): + continue + pid = int(entry) + if pid in (own_child_pid, os.getpid()): + continue + try: + with open(f'/proc/{pid}/cmdline', 'rb') as f: + argv = f.read().decode(errors='replace').split('\0') + except OSError: + continue + if not argv or not argv[0].endswith('squeezelite'): + continue + for i, a in enumerate(argv[:-1]): + if a == '-o' and argv[i + 1] == dev: + pids.append(pid) + break + return pids + + +def wait_for_device(dev, own_child_pid): + """Block until the output device is openable. Converts the old + crash-loop (spawn → EINVAL → die → respawn, silent for hours) into a + quiet wait, and clears a stale squeezelite holder if one is wedging + the loopback.""" + failures = 0 + while not device_ready(dev): + failures += 1 + if failures == 1: + print_device_diagnostics(dev) + if failures == _ORPHAN_KILL_AFTER: + for pid in find_stale_holders(dev, own_child_pid): + print(f'Killing stale squeezelite holder pid={pid} on {dev}', flush=True) + try: + os.kill(pid, signal.SIGKILL) + except OSError as e: + print(f'(kill {pid} failed: {e})', flush=True) + delay = min(2 ** failures, 30) + print(f'Device {dev} busy (probe {failures} failed) — retrying in {delay}s...', + flush=True) + time.sleep(delay) + if failures: + print(f'Device {dev} ready again after {failures} failed probe(s).', flush=True) + + +proc = None +last_child_pid = -1 def signal_handler(sig, frame): """Signal handler function that sends the received signal to the subprocess and exits the script for certain signals.""" print(f'Forwarding signal {sig} to subprocess.', flush=True) - proc.send_signal(sig) + if proc is not None and proc.poll() is None: + proc.send_signal(sig) if sig in [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]: sys.exit(0) else: @@ -35,8 +134,14 @@ def signal_handler(sig, frame): consecutive_failures = 0 while True: + # Don't spawn into a busy loopback — the child would just exit with + # EINVAL and we'd crash-loop while staying silent for hours. + if OUT_DEV: + wait_for_device(OUT_DEV, last_child_pid) + start_time = time.monotonic() proc = subprocess.Popen(args) + last_child_pid = proc.pid try: # Wait for the subprocess to complete