From d87804b626ab28aa632c7cd1d94e50ab01b23701 Mon Sep 17 00:00:00 2001 From: Waldemar Berg Date: Thu, 7 May 2026 21:27:11 +0200 Subject: [PATCH 1/6] mplayer-wrapper: decode-then-escape pipeline + auto-respawn after crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related-but-distinct issues in the slave-protocol shim that sits between backend-player and mplayer. 1. Slave-protocol injection (HIGH-7). exec() built a command line and ran it through jsStringEscape() to escape `"`, newlines, and backslashes so a caller-controlled value (e.g. a track URL) could not break out of the quoted argument. Then the very next line did `str = decodeURIComponent(str)`, which UNDOES any `%22`, `%0A` or `%5C` the caller wedged in there. Net effect: a crafted URL like `http://attacker/file%22%0Astop%0A` decoded back to a literal `"` and newline AFTER escaping, injecting `stop` into the slave socket. Fix: keep decodeURIComponent (legitimate callers pass %xx-encoded paths from m3u resolvers, RSS feeds and the spotify-control playList helper — without decoding mplayer can't open them) but move it BEFORE jsStringEscape. Decoding-then-escaping preserves both halves: the caller still gets `Foo Bar/01.mp3` from `Foo%20Bar/ 01.mp3`, and a malicious `Foo%22%0Astop%0A` decodes to `Foo"\nstop\n` which jsStringEscape then neutralises into `Foo\"\\nstop\\n` — mplayer sees the escaped form, no slave-protocol injection. Wrap the decode in a try/catch since decodeURIComponent throws URIError on malformed `%FF` sequences. 2. No process recovery (MED-4). spawn() ran exactly once at module load, handled only the 'close' event, and ignored 'error' and stdin 'error'. So: - if mplayer crashed (SIGSEGV / OOM / decoder bug), every subsequent exec() wrote to a closed stdin and crashed the entire backend-player with EPIPE - if mplayer's binary went missing or fork() failed, the spawn error went uncaught and tore down the process Wrap the spawn in spawnMplayer() and attach error / stdin-error listeners. On unexpected close, schedule a respawn with capped exponential backoff (1, 2, 4, 8, 16, 30, 30…s). Reset the counter once a fresh process has been alive for 30s. The 'close' event still fires on intentional close() so existing teardown semantics are preserved (new `shutdown` flag distinguishes intentional from crash). Commands issued during the crash-respawn window are debug-logged and dropped instead of throwing — listeners can react to the new 'mplayer-crash' event. --- src/backend-player/src/mplayer-wrapper.js | 176 +++++++++++++++------- 1 file changed, 123 insertions(+), 53 deletions(-) diff --git a/src/backend-player/src/mplayer-wrapper.js b/src/backend-player/src/mplayer-wrapper.js index 1ea0dcb9..6b436f7b 100644 --- a/src/backend-player/src/mplayer-wrapper.js +++ b/src/backend-player/src/mplayer-wrapper.js @@ -6,24 +6,104 @@ const debug = require('debug')('mplayer-wrapper') const parsers = require('./parsers') +const MPLAYER_RESPAWN_MAX_BACKOFF_MS = 30_000 +const MPLAYER_HEALTHY_RUN_MS = 30_000 + const createPlayer = () => { const out = new EventEmitter() - const proc = spawn( - 'mplayer', - [ - '-slave', // 😔 - '-idle', - '-novideo', - '-quiet', - '-msglevel', - 'all=1:global=4:cplayer=4', - ], - { - env: process.env, - stdio: ['pipe', 'pipe', 'ignore'], - }, - ) + // mplayer -> wrapper. Defined before spawnMplayer so the spawn closure + // can reference it without a forward declaration. + const onLine = (line) => { + debug(`line: ${line}`) + if (line === 'Starting playback...') return out.emit('track-change') + + //Callback when playlist finishes + if (line === 'ANS_ERROR=PROPERTY_UNAVAILABLE') return out.emit('playlist-finish') + // todo: `ANS_ERROR=PROPERTY_UNAVAILABLE` + + const parts = /^ANS_([\w]+)=/g.exec(line) + if (!parts || !parts[1]) return null + const prop = parts[1] + + const parser = parsers[prop] + if (!parser) return null + const val = parser(line.slice(parts[0].length)) + out.emit('prop', prop, val) + out.emit(prop, val) + } + + // Auto-respawn: previously a single mplayer crash (SIGSEGV, OOM, mp3 + // decoder bug) bricked the player until pm2 restarted backend-player. + // Now we re-spawn with exponential backoff (1, 2, 4, 8, 16, 30, 30…s), + // and reset the counter once a fresh process has run cleanly for 30s. + let proc = null + let shutdown = false + let respawnAttempts = 0 + let healthyTimer = null + + const spawnMplayer = () => { + if (shutdown) return + + proc = spawn( + 'mplayer', + [ + '-slave', // 😔 + '-idle', + '-novideo', + '-quiet', + '-msglevel', + 'all=1:global=4:cplayer=4', + ], + { + env: process.env, + stdio: ['pipe', 'pipe', 'ignore'], + }, + ) + + // Spawn-itself errors (binary missing, ENOMEM during fork). Without + // this listener Node would re-throw and kill the entire backend-player. + proc.on('error', (err) => { + debug(`mplayer process error: ${err.message}`) + out.emit('mplayer-error', err) + }) + + // EPIPE on stdin when mplayer dies mid-write. The 'close' handler + // below owns the respawn; here we just absorb the error so it doesn't + // become an uncaught exception. + if (proc.stdin) { + proc.stdin.on('error', (err) => { + debug(`mplayer stdin error: ${err.message}`) + }) + } + + proc.stdout.pipe(byLine.createStream()).on('data', (line) => { + onLine(Buffer.isBuffer(line) ? line.toString() : line) + }) + + proc.on('close', (code) => { + if (healthyTimer) { + clearTimeout(healthyTimer) + healthyTimer = null + } + if (shutdown) { + out.emit('close', code) + return + } + const backoff = Math.min(1000 * 2 ** respawnAttempts, MPLAYER_RESPAWN_MAX_BACKOFF_MS) + respawnAttempts++ + debug(`mplayer exited unexpectedly (code ${code}), respawn in ${backoff}ms (attempt ${respawnAttempts})`) + out.emit('mplayer-crash', { code, attempt: respawnAttempts, backoffMs: backoff }) + setTimeout(spawnMplayer, backoff) + }) + + healthyTimer = setTimeout(() => { + respawnAttempts = 0 + healthyTimer = null + }, MPLAYER_HEALTHY_RUN_MS) + } + + spawnMplayer() // wrapper -> mplayer const exec = (cmd, args = []) => { @@ -31,14 +111,35 @@ const createPlayer = () => { for (const arg of args) { str += ' ' if ('string' === typeof arg) { - if (arg.includes(' ')) str += `"` - str += jsStringEscape(arg) - if (arg.includes(' ')) str += `"` + // Decode percent-encoded paths/URLs FIRST, then escape. The + // previous code decoded AFTER jsStringEscape — that undid the + // quote/newline protection and let `%22%0Astop%0A` become a + // literal `"\nstop\n` injection (HIGH-7). Decoding before + // escape keeps the legitimate use case (RSS/playlist track + // names with %20 etc. that callers pass through) while + // jsStringEscape now sees and escapes any quote/newline that + // came out of the decode. + let decoded = arg + try { + decoded = decodeURIComponent(arg) + } catch { + // Malformed percent sequence (e.g. literal `%FF` that isn't + // valid UTF-8). Fall through with the original string — + // jsStringEscape will still neutralise quotes/newlines. + } + if (decoded.includes(' ')) str += `"` + str += jsStringEscape(decoded) + if (decoded.includes(' ')) str += `"` } else str += arg } - str = decodeURIComponent(str) debug(`exec: ${str}`) - proc.stdin.write(`${str}\n`) + if (proc?.stdin?.writable) { + proc.stdin.write(`${str}\n`) + } else { + // mplayer is between crash and respawn — drop the command rather + // than hard-error. Callers that care can react to mplayer-crash. + debug(`exec dropped (mplayer not ready): ${str}`) + } } const getProps = (props) => { for (const prop of props) exec('pausing_keep_force get_property', [prop]) @@ -55,42 +156,11 @@ const createPlayer = () => { const setVolume = (amount) => exec('pausing_keep volume', [amount, '1']) const stop = () => exec('stop') - let closed = false - proc.on('close', (code) => { - closed = true - out.emit('close', code) - if (code > 0) { - // todo: emit err from proc.stderr - } - }) const close = () => { - if (!closed) exec('quit') - } - - // mplayer -> wrapper - const onLine = (line) => { - debug(`line: ${line}`) - if (line === 'Starting playback...') return out.emit('track-change') - - //Callback when playlist finishes - if (line === 'ANS_ERROR=PROPERTY_UNAVAILABLE') return out.emit('playlist-finish') - // todo: `ANS_ERROR=PROPERTY_UNAVAILABLE` - - const parts = /^ANS_([\w]+)=/g.exec(line) - if (!parts || !parts[1]) return null - const prop = parts[1] - - const parser = parsers[prop] - if (!parser) return null - const val = parser(line.slice(parts[0].length)) - out.emit('prop', prop, val) - out.emit(prop, val) + shutdown = true + if (proc?.stdin?.writable) exec('quit') } - proc.stdout.pipe(byLine.createStream()).on('data', (line) => { - onLine(Buffer.isBuffer(line) ? line.toString() : line) - }) - out.exec = exec out.getProps = getProps out.seek = seek From fe9d71a86d1a9cd25aac075669431d0ec0750c0c Mon Sep 17 00:00:00 2001 From: Waldemar Berg Date: Fri, 8 May 2026 07:48:30 +0200 Subject: [PATCH 2/6] spotify-control: serialise setVolume + read actual amixer state, not stale meta (MED-1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The volume-up handler had a TOCTOU race that let fast touchscreen taps push playback past muPiBoxConfig.mupibox.maxVolume — the Hörschutz cap parents rely on. The bug shape: exec(cmdVolume, callback) // async amixer-read, NOT awaited if (currentMeta.volume < muPiBoxConfig.mupibox.maxVolume) { cmdCall(volumeUp) currentMeta.volume += 5 } The exec was fired but the callback (which updates currentMeta.volume) ran later, while the if-check immediately used currentMeta.volume from whatever value the LAST setVolume left it at. With rapid taps: tap 1: cmd reads start, sync check sees stale volume X < cap → +5, currentMeta.volume = X+5 (locally, optimistic) tap 2 (before tap-1's read returns): same path, sees X+5 < cap → +5, currentMeta.volume = X+10 tap 3: … The actual amixer state climbs, but the local-optimistic view stays under the cap until reads finally catch up. Net: cap exceeded. Fix: 1. Promisify the amixer-read so each setVolume call awaits the ACTUAL hardware volume before deciding. 2. Add a serial queue (`_volumeOpQueue`) so concurrent setVolume invocations chain rather than overlap. Each chained op reads, checks, writes atomically with respect to other queued ops. 3. Bound the local update with Math.min/Math.max so even if amixer misreports, currentMeta stays in [0, maxVolume]. 4. Catch failures inside the chained .then so one bad op (e.g. amixer read failed during boot) doesn't poison the queue. --- src/backend-player/src/spotify-control.js | 71 +++++++++++++++-------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/backend-player/src/spotify-control.js b/src/backend-player/src/spotify-control.js index 732da894..47aa3fb6 100644 --- a/src/backend-player/src/spotify-control.js +++ b/src/backend-player/src/spotify-control.js @@ -837,6 +837,22 @@ function cmdCall(cmd) { }) } +// Serialise setVolume calls so two rapid taps from the touchscreen +// can't both read the same stale currentMeta.volume and double-increment +// past maxVolume. The previous code fired exec(cmdVolume) WITHOUT +// awaiting the callback and then immediately compared against the +// not-yet-updated currentMeta.volume — for fast taps the comparison +// always saw the value from before any of the in-flight operations, +// so the maxVolume cap (Hörschutz) was bypassable. Bug class: TOCTOU. +let _volumeOpQueue = Promise.resolve() +const _execAsync = (cmd) => + new Promise((resolve, reject) => { + require('node:child_process').exec(cmd, (e, stdout, stderr) => { + if (e) reject(e) + else resolve({ stdout, stderr }) + }) + }) + /*gets available devices, searches for the active one and returns its volume*/ async function setVolume(volume) { const volumeUp = '/usr/bin/amixer sset Master 5%+' @@ -844,33 +860,42 @@ async function setVolume(volume) { const volumeMax = `/usr/bin/amixer sset Master ${muPiBoxConfig.mupibox.maxVolume}%` const cmdVolume = "/usr/bin/amixer sget Master | grep 'Right:'" - const exec = require('node:child_process').exec - exec(cmdVolume, (e, stdout, _stderr) => { - if (e instanceof Error) { - console.error(nowDate.toLocaleString() + e) - throw e + // Chain onto the queue so concurrent invocations run strictly serially. + // Each invocation reads the ACTUAL current volume from amixer first, + // checks the cap, then writes — no stale-comparison window. + _volumeOpQueue = _volumeOpQueue.then(async () => { + let actualVolume + try { + const { stdout } = await _execAsync(cmdVolume) + actualVolume = Number.parseInt(stdout.split('[')[1].split('%')[0], 10) + } catch (e) { + log.warn(`${nowDate.toLocaleString()}: [setVolume] amixer read failed, skipping op:`, e?.message || e) + return } - currentMeta.volume = Number.parseInt(stdout.split('[')[1].split('%')[0], 10) - //console.log('stdout', stdout); - //console.log('stderr', stderr); - }) - - if (volume) { - if (currentMeta.volume < muPiBoxConfig.mupibox.maxVolume) { - await cmdCall(volumeUp) - currentMeta.volume = Number.parseInt(currentMeta.volume, 10) + 5 - } else { - currentMeta.volume = muPiBoxConfig.mupibox.maxVolume - await cmdCall(volumeMax) + if (Number.isNaN(actualVolume)) { + log.warn(`${nowDate.toLocaleString()}: [setVolume] amixer returned unparseable volume, skipping op`) + return } - } else { - await cmdCall(volumeDown) - if (currentMeta.volume > 0) { - currentMeta.volume = Number.parseInt(currentMeta.volume, 10) - 5 + currentMeta.volume = actualVolume + + if (volume) { + if (actualVolume < muPiBoxConfig.mupibox.maxVolume) { + await cmdCall(volumeUp) + currentMeta.volume = Math.min(actualVolume + 5, muPiBoxConfig.mupibox.maxVolume) + } else { + currentMeta.volume = muPiBoxConfig.mupibox.maxVolume + await cmdCall(volumeMax) + } } else { - currentMeta.volume = 0 + await cmdCall(volumeDown) + currentMeta.volume = Math.max(actualVolume - 5, 0) } - } + }).catch((err) => { + // Don't let one failed op poison the queue for subsequent ops. + log.warn(`${nowDate.toLocaleString()}: [setVolume] op failed:`, err?.message || err) + }) + + return _volumeOpQueue } async function transferPlayback(id) { From 8b0d61e33de220c3e00cb4159a1686b9d01690bd Mon Sep 17 00:00:00 2001 From: Waldemar Berg Date: Thu, 7 May 2026 17:49:04 +0200 Subject: [PATCH 3/6] backend-player: harden handleSpotifyError against non-API errors err.body was dereferenced unconditionally even though .error?.status had optional chaining; for any non-API error (DNS failure, connection drop, JSON parse error in the SDK, token-refresh HTTP failure) the handler itself crashed with TypeError, taking the whole backend-player process down via pm2 unhandledException. Add optional chaining on err and body. --- src/backend-player/src/spotify-control.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend-player/src/spotify-control.js b/src/backend-player/src/spotify-control.js index 47aa3fb6..9f459b4b 100644 --- a/src/backend-player/src/spotify-control.js +++ b/src/backend-player/src/spotify-control.js @@ -261,7 +261,7 @@ function setAccessToken(token) { /*called in all error cases*/ /*token expired and no_device error are handled explicitly*/ function handleSpotifyError(err, from) { - if (err.body.error?.status === 401) { + if (err?.body?.error?.status === 401) { log.debug(`${nowDate.toLocaleString()}: access token expired, refreshing...`) log.debug(`${nowDate.toLocaleString()}: Error from: ${from}`) counter.counterrorAccessToken++ @@ -271,7 +271,7 @@ function handleSpotifyError(err, from) { if (currentMeta.activeSpotifyId !== '0') { refreshToken() } - } else if (err.body.error?.status === 400) { + } else if (err?.body?.error?.status === 400) { log.debug(`${nowDate.toLocaleString()}: invalid id`) log.debug(`${nowDate.toLocaleString()}: Error from: ${from}`) log.debug(`${nowDate.toLocaleString()}: ${err}`) @@ -282,7 +282,7 @@ function handleSpotifyError(err, from) { if (currentMeta.activeSpotifyId !== '0') { setActiveDevice() } - } else if (err.body.error?.status === 429) { + } else if (err?.body?.error?.status === 429) { log.debug(`${nowDate.toLocaleString()}: To many requests on th spotify web api`) log.debug(`${nowDate.toLocaleString()}: Error from: ${from}`) log.debug(`${nowDate.toLocaleString()}: ${err}`) From c842c7659762d78f9f28738a2ec47087fd17a4b2 Mon Sep 17 00:00:00 2001 From: Waldemar Berg Date: Fri, 8 May 2026 23:49:22 +0200 Subject: [PATCH 4/6] fix: prevent overlapping startup.wav playback in chromium-autostart.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit chromium-autostart.sh fires from two paths that race: 1) restart_kiosk.sh after the admin "Restart services" click 2) dietpi-login auto-respawn on tty2 once the previous chromium browser process dies Without a guard each invocation runs its own `mplayer ${START_SOUND} &` in the background. Live monitoring during a single user click showed 2 chromium-autostart.sh instances and 3 mplayer-startup.wav instances running concurrently — the user reported the boot sound layered on top of itself. pkill any prior mplayer that's still playing the same wav before launching a fresh one. The `${START_SOUND}` path (the wav) is unique enough that the regex never matches the backend-player's slave-mode mplayer (which is invoked without a media-file path on its argv). --- scripts/chromium-autostart.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/chromium-autostart.sh b/scripts/chromium-autostart.sh index 7e699a51..626472e1 100644 --- a/scripts/chromium-autostart.sh +++ b/scripts/chromium-autostart.sh @@ -79,6 +79,14 @@ START_SOUND=$(/usr/bin/jq -r .mupibox.startSound ${CONFIG}) START_VOLUME=$(/usr/bin/jq -r .mupibox.startVolume ${CONFIG}) AUDIO_DEVICE=$(/usr/bin/jq -r .mupibox.audioDevice ${CONFIG}) /usr/bin/pactl set-sink-volume @DEFAULT_SINK@ ${START_VOLUME}% +# Kill any in-flight startup-sound playback before launching a fresh one. +# chromium-autostart.sh runs from two paths that can fire in quick +# succession: (1) restart_kiosk.sh after the admin "Restart services" +# click, and (2) dietpi-login auto-respawn on tty2 once chromium dies. +# Without this pkill both invocations spawn their own mplayer & overlay +# the welcome wav. Match by the wav path so the regex never collides +# with mplayer's slave-mode instance held by the backend-player. +pkill -f "mplayer.*${START_SOUND}" 2>/dev/null /usr/bin/mplayer -volume 100 ${START_SOUND} & pgrep -f "chromium-browser" | while read -r pid; do # Setze die Priorität für jeden Prozess neu From a4341f88a950e5c41b78fd045e79d2948d4a2530 Mon Sep 17 00:00:00 2001 From: Waldemar Berg Date: Sat, 16 May 2026 15:10:55 +0200 Subject: [PATCH 5/6] fix(hat): auto-reconnect SMBus on Remote I/O error (M6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BQ25792 driver previously raised I2CError on any OSError from the underlying smbus2 call. On a Pi the typical OSError is errno 121 "Remote I/O error" which occurs when an I2C transaction gets interrupted mid-byte (transient bus glitch, interleaved transaction). After such an error the bus is in an indeterminate state — subsequent reads also fail until the SMBus handle is re-opened. Without the auto-reconnect a single Remote-I/O-error knocks the mupihat daemon into a stale state: the periodic_json_dump cycle keeps logging errors, /tmp/mupihat.json goes stale until the watchdog eventually resets the service. Add a new _reopen_bus() helper that closes-and-reopens the smbus2.SMBus handle. In safe_execute(), catch OSError specifically (not the broader Exception catch), call _reopen_bus(), and retry the operation exactly once. If the retry also fails the original I2CError escalation path runs as before. The retry rebinds the bound method to the new bus handle so the original `self.bq.foo` reference doesn't dangle on the closed handle. Bound-method detection via hasattr(func, '__self__'). Other Exceptions (non-OSError) keep the original behaviour: log and re-raise as I2CError. --- scripts/mupihat/mupihat_bq25792.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/scripts/mupihat/mupihat_bq25792.py b/scripts/mupihat/mupihat_bq25792.py index ab1182f2..df5fa898 100644 --- a/scripts/mupihat/mupihat_bq25792.py +++ b/scripts/mupihat/mupihat_bq25792.py @@ -5307,15 +5307,43 @@ def get_PART_REVISION_string(self): # class methods + def _reopen_bus(self): + """ + Close and re-open the SMBus handle. Used after a Remote I/O error + to clear a hung bus state on the BQ25792 (errno 121 leaves the bus + in an indeterminate state on the Pi's i2c-bcm2835 driver). + """ + try: + self.bq.close() + except Exception: + pass + self.bq = smbus2.SMBus(self.i2c_device) + def safe_execute(self, func, *args, **kwargs): """ Executes a function safely, catching exceptions and handling errors. + On the first OSError (typically errno 121 "Remote I/O error" caused + by an interleaved I2C transaction or transient bus glitch), reopen + the SMBus handle and retry once. A second failure escalates to + I2CError so the caller can choose whether to skip the cycle. If `_exit_on_error` is True, the program will exit on error. """ try: return func(*args, **kwargs) + except OSError as e: + logging.warning(f"OSError in {func.__name__}: {str(e)} — reopening I2C bus and retrying once") + self._reopen_bus() + try: + # Re-bind the bound method to the new bus handle if needed + if hasattr(func, '__self__') and func.__self__ is not None: + func = getattr(self.bq, func.__name__) + return func(*args, **kwargs) + except Exception as e2: + logging.error(f"Error in {func.__name__} after bus reopen: {str(e2)}") + if self._exit_on_error: + sys.exit(1) + raise I2CError(f"Failed to execute {func.__name__}") from e2 except Exception as e: - #sys.stderr.write(f"Error in {func.__name__}: {str(e)}\n") logging.error(f"Error in {func.__name__}: {str(e)}") if self._exit_on_error: sys.exit(1) From 86d077b59e848e9438ca74b673b5c0872a590d2c Mon Sep 17 00:00:00 2001 From: Waldemar Berg Date: Fri, 15 May 2026 16:09:29 +0200 Subject: [PATCH 6/6] fix(hat): I2C bus mutex + auto-reconnect on Remote I/O error (AR5-5/M6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AR5-5: mupihat.py runs the BQ25792 driver in three threads — the periodic_json_dump daemon plus two Flask request workers (`/` and `/api/registers`). Without serialisation they can interleave I2C transactions on the same smbus2.SMBus handle, producing the "[Errno 121] Remote I/O error" bursts visible in the mupi_hat journal (observed live during charging on 2026-05-15). The Pi's i2c-bcm2835 driver doesn't fully recover from a half-completed sequence, so even benign collisions leave the bus in a state where read_all_register returns garbage or fails outright. Fix: module-level threading.Lock in mupihat.py wrapping every hat.* call. periodic_json_dump now acquires the lock in three short phases around the watchdog reset, the bulk register read, and the to_json / log_register_values follow-up reads — sleeps stay outside the lock so request workers can fit between phases (no pathological 5-second waits). The JSON file write also moved outside the lock: it doesn't touch I2C and can take 5-30 ms on SD. M6: companion auto-reconnect in mupihat_bq25792.py safe_execute(). When a single I2C transaction throws OSError (errno 121 is the common case), the wrapper now closes the SMBus handle, re-opens it with smbus2.SMBus(self.i2c_device), and retries the operation once before escalating to I2CError. This clears the hung bus state that previously persisted until the next cycle (or until mupihat-daemon restart in worst-case watchdog scenarios). The retry rebinds the bound method to the new bus handle so the original `self.bq.foo` reference doesn't dangle on the closed handle. Behaviour preserved: timing identical (sleep(0.1) + sleep(1) + sleep(3.9) per cycle), all original error logging kept, no API surface change. Only the locking and the OSError-recovery path are new. --- scripts/mupihat/mupihat.py | 48 +++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/scripts/mupihat/mupihat.py b/scripts/mupihat/mupihat.py index 07d6e568..69c4a985 100644 --- a/scripts/mupihat/mupihat.py +++ b/scripts/mupihat/mupihat.py @@ -35,13 +35,19 @@ import argparse from datetime import datetime from flask import Flask, render_template, jsonify -from threading import Thread +from threading import Thread, Lock from mupihat_bq25792 import bq25792 app = Flask(__name__) # Global variables hat = None +# AR5-5: Flask is threaded by default — without serialising access to the +# shared BQ25792 driver, the periodic_json_dump background thread and the +# Flask request workers can interleave their I2C transactions on the same +# smbus2.SMBus handle, producing "Remote I/O error" (errno 121) on the +# Pi's i2c bus. All hat.* calls below run under this lock. +i2c_lock = Lock() log_flag = False json_flag = False json_file = "/tmp/mupihat.json" @@ -84,7 +90,9 @@ def log_register_values(): def index(): """Flask route to display register values.""" try: - return render_template("index.html", registers=hat.to_json_registers()) + with i2c_lock: + registers = hat.to_json_registers() + return render_template("index.html", registers=registers) except Exception as e: return f"Error reading registers: {str(e)}", 500 @@ -92,8 +100,10 @@ def index(): @app.route("/api/registers") def api_registers(): """Flask API endpoint to return register values as JSON.""" - try: - return jsonify(hat.to_json_registers()) + try: + with i2c_lock: + registers = hat.to_json_registers() + return jsonify(registers) except Exception as e: return jsonify({"error": str(e)}), 500 @@ -102,19 +112,31 @@ def periodic_json_dump(): """Periodically writes the register values to a JSON file.""" global json_flag, json_file while True: - hat.watchdog_reset() - time.sleep(0.1) # Allow time for the watchdog reset - hat.read_all_register() - time.sleep(1) # Allow time for the registers to be updated - if json_flag: + # AR5-5: serialise I2C work via i2c_lock so Flask request workers + # don't interleave bus transactions with this thread. Sleeps stay + # OUTSIDE the lock so request workers can fit between phases. + snapshot = None + try: + with i2c_lock: + hat.watchdog_reset() + time.sleep(0.1) # Allow time for the watchdog reset + with i2c_lock: + hat.read_all_register() + time.sleep(1) # Allow time for the registers to be updated + with i2c_lock: + if json_flag: + snapshot = hat.to_json() + if log_flag: + log_register_values() + except Exception as e: + logging.error("periodic_json_dump I2C cycle failed: %s", str(e)) + if snapshot is not None: try: with open(json_file, "w") as outfile: - json.dump(hat.to_json(), outfile, indent=4) + json.dump(snapshot, outfile, indent=4) except Exception as e: logging.error("Failed to write JSON dump: %s", str(e)) - if log_flag: - log_register_values() - time.sleep(3.9) # Run every 4 seconds + time.sleep(3.9) # Run every ~4 seconds def parse_arguments():