From 1623869f4bf5450414601065fbee62fbad702180 Mon Sep 17 00:00:00 2001 From: Stamate Viorel Date: Wed, 10 Jun 2026 14:33:34 +0200 Subject: [PATCH] Auto-recover a hung preamp on persistent I2C write failures When the preamp microcontroller hangs it stops ACKing and every I2C write fails with OSError 121 (EREMOTEIO). The existing fallback only reopens the SMBus handle, which recovers a transient bus glitch but not a hung preamp - zone control stays dead until someone power-cycles the unit. Escalate: when the reopened-bus retry also fails, reset the preamps in place, re-assign I2C addresses, reopen the bus and re-flush all cached register values so zone state (mute/source/volume) survives, then retry the write. Rate-limited to once per 20s so a benign one-off glitch never resets audio. Observed live on our unit 2026-06-04 (zone control dead until manual reboot); with this patch the same wedge self-heals in under a second. Signed-off-by: Stamate Viorel Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 1 + amplipi/rt.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd1894792..a2c9e43da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * Upgraded volume calculations to preserve relative positions when hitting the min or max setting via source volume bar * Update our spotify provider `go-librespot` to `0.7.3` * Upgrade from Logitech Media Server 8.5.2 to Lyrion Music Server 9.0.3 + * Added in-place preamp recovery when I2C writes fail persistently (EREMOTEIO), avoiding a full reboot # 0.4.11 * System diff --git a/amplipi/rt.py b/amplipi/rt.py index 29571097a..5585b375c 100644 --- a/amplipi/rt.py +++ b/amplipi/rt.py @@ -145,6 +145,11 @@ class _Preamps: preamps: Dict[int, List[int]] # Key: i2c address, Val: register values + # In-place preamp recovery — rate-limited so a benign I2C glitch + # never resets audio. See _recover_preamps() / write_byte_data(). 
+ _RECOVERY_COOLDOWN_S = 20.0 + _last_recovery = 0.0 # time.monotonic() of the last recovery attempt; 0.0 means none yet + def __init__(self, reset: bool = True, set_addr: bool = True, bootloader: bool = False, debug=True): self.preamps = dict() if not is_amplipi(): @@ -242,6 +247,41 @@ def new_preamp(self, addr: int): 0x4F, ] + def _recover_preamps(self) -> bool: + """ Recover a wedged/hung preamp IN-PLACE. + + The bare bus.write_byte_data retry in write_byte_data only reopens the + Linux SMBus handle — that recovers a transient bus glitch but NOT a hung + preamp microcontroller (which stops ACKing -> OSError 121 / EREMOTEIO). + The only thing that revives a hung preamp is pulsing its reset line, + which is exactly what a full reboot does. This does the same WITHOUT + rebooting: reset the preamp(s), re-assign I2C addresses, reopen the bus, + and re-flush every cached register so zone state (mute/source/vol) + survives the reset (self.preamps is the code's source of truth, updated + on every write). + + Rate-limited so a benign one-off glitch never resets audio. Returns True + if a recovery was performed (caller may retry the write). 
+ """ + now = time.monotonic() # monotonic clock: a wall-clock/NTP step must not stretch or skip the cooldown + if self._last_recovery and now - self._last_recovery < self._RECOVERY_COOLDOWN_S: + return False + self._last_recovery = now + logger.warning('Preamp I2C wedged (EREMOTEIO) - attempting in-place recovery (reset + re-flush)') + try: + self.reset_preamps() + self.set_i2c_addr() + self.bus = SMBus(1) + for addr, regs in list(self.preamps.items()): + for reg, val in enumerate(regs): + time.sleep(0.001) + self.bus.write_byte_data(addr, reg, val) + logger.info('Preamp in-place recovery complete') + return True + except Exception as exc: + logger.error(f'Preamp in-place recovery failed: {exc}') + return False + def write_byte_data(self, preamp_addr, reg, data): assert preamp_addr in _DEV_ADDRS assert type(preamp_addr) == int @@ -263,9 +303,19 @@ def write_byte_data(self, preamp_addr, reg, data): time.sleep(0.001) # space out sequential calls to avoid bus errors self.bus.write_byte_data(preamp_addr, reg, data) except Exception: - time.sleep(0.001) - self.bus = SMBus(1) - self.bus.write_byte_data(preamp_addr, reg, data) + # Fallback 1: reopen the bus handle and retry (transient bus glitch). + try: + time.sleep(0.001) + self.bus = SMBus(1) + self.bus.write_byte_data(preamp_addr, reg, data) + except Exception: + # Fallback 2: a reopened fd can't revive a hung preamp MCU. + # Escalate to an in-place preamp reset + re-flush, then retry once more. + if self._recover_preamps(): + time.sleep(0.001) + self.bus.write_byte_data(preamp_addr, reg, data) + else: + raise def probe_preamp(self, addr: int): # Scan for preamps, and set source registers to be completely digital