Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Upgraded volume calculations to preserve relative positions when hitting the min or max setting via source volume bar
* Update our spotify provider `go-librespot` to `0.7.3`
* Upgrade from Logitech Media Server 8.5.2 to Lyrion Music Server 9.0.3
* Hardened stream process restarts: exponential backoff and an ALSA device readiness gate (#957)

# 0.4.11
* System
Expand Down
129 changes: 126 additions & 3 deletions streams/process_monitor.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,120 @@
#!/usr/bin/env python3

import sys
import subprocess
import os
import signal
import subprocess
import sys
import time

args = sys.argv[1:]
print(f'Starting process monitor for process: {args}')

_START_TIME_THRESHOLD = 10 # seconds — consider a run "successful" if it lasted this long
_DEVICE_PROBE_TIMEOUT = 5 # seconds — hard cap on a single aplay readiness probe
_ORPHAN_KILL_AFTER = 4 # failed probes in a row before killing a stale holder


def _alsa_out_device(argv):
"""Return the ALSA loopback output device (-o lbXX) if the monitored
process plays to one, else None. All of the device gating below is
skipped for processes without such an argument."""
for i, a in enumerate(argv[:-1]):
if a == '-o' and argv[i + 1].startswith('lb'):
return argv[i + 1]
return None


OUT_DEV = _alsa_out_device(args)


def device_ready(dev):
"""True if the ALSA device can be opened for playback right now.
Mirrors squeezelite's open path (loopback dmix): fails with EINVAL
while a stale holder keeps the device busy, succeeds once it is free.
Writes 1s of silence to a loopback nobody is reading — harmless."""
try:
res = subprocess.run(
['aplay', '-D', dev, '-r', '48000', '-f', 'S16_LE', '-c', '2',
'-d', '1', '-q', '/dev/zero'],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
timeout=_DEVICE_PROBE_TIMEOUT)
return res.returncode == 0
except Exception:
return False


def print_device_diagnostics(dev):
"""Dump who currently holds the sound devices so a wedge is explained
in the journal rather than inferred afterwards."""
print(f'Device {dev} not ready — current /dev/snd holders:', flush=True)
try:
out = subprocess.run('fuser -v /dev/snd/pcm* 2>&1', shell=True,
capture_output=True, text=True, timeout=10)
print((out.stdout + out.stderr).strip(), flush=True)
except Exception as e:
print(f'(fuser failed: {e})', flush=True)


def find_stale_holders(dev, own_child_pid):
"""PIDs of squeezelite processes (argv0 ends in 'squeezelite') playing
to this exact device via -o <dev>, excluding ourselves and our own
child. Strict match — cannot hit the monitor or unrelated processes."""
pids = []
for entry in os.listdir('/proc'):
if not entry.isdigit():
continue
pid = int(entry)
if pid in (own_child_pid, os.getpid()):
continue
try:
with open(f'/proc/{pid}/cmdline', 'rb') as f:
argv = f.read().decode(errors='replace').split('\0')
except OSError:
continue
if not argv or not argv[0].endswith('squeezelite'):
continue
for i, a in enumerate(argv[:-1]):
if a == '-o' and argv[i + 1] == dev:
pids.append(pid)
break
return pids


def wait_for_device(dev, own_child_pid):
"""Block until the output device is openable. Converts the old
crash-loop (spawn → EINVAL → die → respawn, silent for hours) into a
quiet wait, and clears a stale squeezelite holder if one is wedging
the loopback."""
failures = 0
while not device_ready(dev):
failures += 1
if failures == 1:
print_device_diagnostics(dev)
if failures == _ORPHAN_KILL_AFTER:
for pid in find_stale_holders(dev, own_child_pid):
print(f'Killing stale squeezelite holder pid={pid} on {dev}', flush=True)
try:
os.kill(pid, signal.SIGKILL)
except OSError as e:
print(f'(kill {pid} failed: {e})', flush=True)
delay = min(2 ** failures, 30)
print(f'Device {dev} busy (probe {failures} failed) — retrying in {delay}s...',
flush=True)
time.sleep(delay)
if failures:
print(f'Device {dev} ready again after {failures} failed probe(s).', flush=True)


proc = None
last_child_pid = -1


def signal_handler(sig, frame):
"""Signal handler function that sends the received signal to the
subprocess and exits the script for certain signals."""
print(f'Forwarding signal {sig} to subprocess.', flush=True)
proc.send_signal(sig)
if proc is not None and proc.poll() is None:
proc.send_signal(sig)
if sig in [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]:
sys.exit(0)
else:
Expand All @@ -29,8 +131,17 @@ def signal_handler(sig, frame):
except:
pass

consecutive_failures = 0

while True:
# Don't spawn into a busy loopback — the child would just exit with
# EINVAL and we'd crash-loop while staying silent for hours.
if OUT_DEV:
wait_for_device(OUT_DEV, last_child_pid)

start_time = time.monotonic()
proc = subprocess.Popen(args)
last_child_pid = proc.pid

try:
# Wait for the subprocess to complete
Expand All @@ -52,3 +163,15 @@ def signal_handler(sig, frame):
# If the subprocess exits with a non-zero code, restart it
print(
f"Subprocess exited with non-zero code: {proc.returncode}. Restarting...", flush=True)

# Track consecutive failures for exponential backoff.
# Reset if the process ran long enough to be considered a successful start.
elapsed = time.monotonic() - start_time
if elapsed >= _START_TIME_THRESHOLD:
consecutive_failures = 0
else:
consecutive_failures += 1

backoff = min(2 ** consecutive_failures, 30)
print(f"Waiting {backoff}s before restart (consecutive_failures={consecutive_failures})...", flush=True)
time.sleep(backoff)