From 82a13f1e39c41932c0b6da367fd81846b2c69f52 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sat, 16 May 2026 23:13:18 +0900 Subject: [PATCH 1/9] AtomS3R: enable Echo Base audio, tune face renderer, add bridge/sidecar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hardware-validated AtomS3R frontend work: - firmware: AtomS3R has no internal speaker; use Atomic Echo Base (cfg.internal_spk=false, external_speaker.atomic_echo=true), set speaker volume 130 - face_renderer: success face uses normal eyes + brown raised "\/" brows + closed-mouth smile arc (口パク preserved while speaking); blink is a dark downward-convex eyelid arc with randomized 3/4/5 s scheduling (1-in-10 quick 0.3 s re-blink); Thinking sweeps the pupils left/right - headroom_transport: Failed background persists until the next state event (like Permission) instead of auto-reverting after 8 s - scripts: default restart-operator-stack-in-place.sh to FACE_AUDIO_TARGET=both so the PC->Atom bridge receives tts_audio; add atoms3r-http-bridge.mjs and the stackchan-minimal sidecar Local commit only; not pushed. ExecPlan PLANS_48 updated on disk (.agent/ is gitignored by repo design, so it is not in this commit). 
Co-Authored-By: Claude Opus 4.7 --- firmware/atoms3r-headroom/.gitignore | 3 + firmware/atoms3r-headroom/README.md | 72 +++ .../include/headroom_config.example.h | 19 + .../include/headroom_config.h | 7 + firmware/atoms3r-headroom/partitions.csv | 6 + firmware/atoms3r-headroom/platformio.ini | 21 + .../atoms3r-headroom/src/face_renderer.cpp | 302 +++++++++++++ firmware/atoms3r-headroom/src/face_renderer.h | 50 +++ .../atoms3r-headroom/src/headroom_audio.cpp | 282 ++++++++++++ .../atoms3r-headroom/src/headroom_audio.h | 40 ++ .../src/headroom_ingress_server.cpp | 299 +++++++++++++ .../src/headroom_ingress_server.h | 44 ++ .../src/headroom_settings.cpp | 170 +++++++ .../atoms3r-headroom/src/headroom_settings.h | 46 ++ .../src/headroom_setup_portal.cpp | 209 +++++++++ .../src/headroom_setup_portal.h | 31 ++ .../src/headroom_transport.cpp | 280 ++++++++++++ .../atoms3r-headroom/src/headroom_transport.h | 35 ++ firmware/atoms3r-headroom/src/main.cpp | 206 +++++++++ integrations/stackchan-minimal/README.md | 98 +++++ .../stackchan-minimal/stackchan.env.example | 43 ++ .../stackchan_asr_adapter.py | 259 +++++++++++ .../stackchan_tts_adapter.py | 243 ++++++++++ scripts/atoms3r-http-bridge.mjs | 415 ++++++++++++++++++ scripts/restart-operator-stack-in-place.sh | 14 +- scripts/run-stackchan-sidecar.sh | 208 +++++++++ 26 files changed, 3399 insertions(+), 3 deletions(-) create mode 100644 firmware/atoms3r-headroom/.gitignore create mode 100644 firmware/atoms3r-headroom/README.md create mode 100644 firmware/atoms3r-headroom/include/headroom_config.example.h create mode 100644 firmware/atoms3r-headroom/include/headroom_config.h create mode 100644 firmware/atoms3r-headroom/partitions.csv create mode 100644 firmware/atoms3r-headroom/platformio.ini create mode 100644 firmware/atoms3r-headroom/src/face_renderer.cpp create mode 100644 firmware/atoms3r-headroom/src/face_renderer.h create mode 100644 firmware/atoms3r-headroom/src/headroom_audio.cpp create mode 100644 
firmware/atoms3r-headroom/src/headroom_audio.h create mode 100644 firmware/atoms3r-headroom/src/headroom_ingress_server.cpp create mode 100644 firmware/atoms3r-headroom/src/headroom_ingress_server.h create mode 100644 firmware/atoms3r-headroom/src/headroom_settings.cpp create mode 100644 firmware/atoms3r-headroom/src/headroom_settings.h create mode 100644 firmware/atoms3r-headroom/src/headroom_setup_portal.cpp create mode 100644 firmware/atoms3r-headroom/src/headroom_setup_portal.h create mode 100644 firmware/atoms3r-headroom/src/headroom_transport.cpp create mode 100644 firmware/atoms3r-headroom/src/headroom_transport.h create mode 100644 firmware/atoms3r-headroom/src/main.cpp create mode 100644 integrations/stackchan-minimal/README.md create mode 100644 integrations/stackchan-minimal/stackchan.env.example create mode 100755 integrations/stackchan-minimal/stackchan_asr_adapter.py create mode 100755 integrations/stackchan-minimal/stackchan_tts_adapter.py create mode 100755 scripts/atoms3r-http-bridge.mjs create mode 100755 scripts/run-stackchan-sidecar.sh diff --git a/firmware/atoms3r-headroom/.gitignore b/firmware/atoms3r-headroom/.gitignore new file mode 100644 index 0000000..96a195e --- /dev/null +++ b/firmware/atoms3r-headroom/.gitignore @@ -0,0 +1,3 @@ +.pio/ +.vscode/ +include/headroom_config.local.h diff --git a/firmware/atoms3r-headroom/README.md b/firmware/atoms3r-headroom/README.md new file mode 100644 index 0000000..562a80a --- /dev/null +++ b/firmware/atoms3r-headroom/README.md @@ -0,0 +1,72 @@ +# Real Minimum Headroom AtomS3R Firmware + +This PlatformIO project is the AtomS3R hardware frontend for minimum-headroom. + +Milestone 1 initializes the M5Stack AtomS3R display, draws a 128x128 parametric +face, and cycles expressions with the Atom button. Milestone 2 adds saved +settings and a setup access point. Later milestones add the WebSocket face +mirror and TTS audio playback; microphone input is not implemented yet. 
+ +## Build + +```bash +cd firmware/atoms3r-headroom +pio run +``` + +## Flash + +Put the AtomS3R in download mode if needed, then run: + +```bash +pio run -t upload +pio device monitor +``` + +Expected serial output: + +```text +Real Minimum Headroom AtomS3R starting +display ready +demo face mode +``` + +Press the Atom button to cycle through neutral, thinking, speaking, listening, +permission, success, and failed expressions. + +If Wi-Fi is not configured or cannot connect, the Atom starts a setup access +point such as `RMH-SETUP-1A2B` and shows the SSID plus `192.168.4.1` on the +display. Connect to that AP and open: + +```text +http://192.168.4.1/ +``` + +The setup page saves Wi-Fi, face app URLs, auth token, device id, display +priority agent id, input target agent id, face rotation, placement pose, and +upper-side orientation to ESP32 NVS/Preferences. + +When Wi-Fi connects successfully, the firmware opens the configured WebSocket +URL and mirrors these minimum-headroom payloads: + +- `event`: changes expression for command start, success, failure, permission, + retry, and idle states. +- `tts_state`: shows queued/speaking/error/idle state. +- `tts_mouth`: drives mouth openness from the payload's `open` value. + +If `MH_FACE_AUTH_TOKEN` is enabled on the PC, set the same token in the setup +page. The firmware appends it as `auth_token` on the WebSocket URL for the +same-LAN first implementation. + +The default display priority agent is `__operator__`. TTS/status payloads from +other agents are still accepted, but recent operator payloads get a short +priority window so helper speech does not immediately overwrite the physical +operator face. Future Atom input payloads should target `__operator__` by +default and must not directly target helper panes unless explicitly configured. + +## Local Settings + +The checked-in `include/headroom_config.example.h` contains safe placeholders. 
+For development-only defaults, create `include/headroom_config.local.h`; it is +ignored by git. Settings entered through the Atom-hosted setup portal (Wi-Fi, +server URLs, auth token, orientation) are saved to NVS, as described above. diff --git a/firmware/atoms3r-headroom/include/headroom_config.example.h b/firmware/atoms3r-headroom/include/headroom_config.example.h new file mode 100644 index 0000000..af583ed --- /dev/null +++ b/firmware/atoms3r-headroom/include/headroom_config.example.h @@ -0,0 +1,19 @@ +#pragma once + +#define HEADROOM_WIFI_SSID "your-wifi" +#define HEADROOM_WIFI_PASSWORD "your-password" +#define HEADROOM_FACE_HTTP_BASE "http://192.168.1.10:8765" +#define HEADROOM_FACE_WS_URL "ws://192.168.1.10:8765/ws" +#define HEADROOM_FACE_AUTH_TOKEN "" +#define HEADROOM_DEVICE_ID "atom-headroom-1" +#define HEADROOM_DISPLAY_AGENT_ID "__operator__" +#define HEADROOM_INPUT_TARGET_AGENT_ID "__operator__" +#define HEADROOM_MAX_BASE64_TTS_SECONDS 10 +#define HEADROOM_MAX_HTTP_TTS_BYTES 1200000 + +// Valid face rotations are 0, 90, 180, and 270 degrees. +#define HEADROOM_FACE_ROTATION_DEGREES 0 + +// Initial supported placement poses are "screen_up" and "side_up". 
+#define HEADROOM_PLACEMENT_POSE "screen_up" +#define HEADROOM_UP_SIDE_DEGREES 0 diff --git a/firmware/atoms3r-headroom/include/headroom_config.h b/firmware/atoms3r-headroom/include/headroom_config.h new file mode 100644 index 0000000..095177d --- /dev/null +++ b/firmware/atoms3r-headroom/include/headroom_config.h @@ -0,0 +1,7 @@ +#pragma once + +#if __has_include("headroom_config.local.h") +#include "headroom_config.local.h" +#else +#include "headroom_config.example.h" +#endif diff --git a/firmware/atoms3r-headroom/partitions.csv b/firmware/atoms3r-headroom/partitions.csv new file mode 100644 index 0000000..31455d2 --- /dev/null +++ b/firmware/atoms3r-headroom/partitions.csv @@ -0,0 +1,6 @@ +# Name, Type, SubType, Offset, Size, Flags +nvs, data, nvs, 0x9000, 0x5000, +otadata, data, ota, 0xe000, 0x2000, +app0, app, ota_0, 0x10000, 0x330000, +app1, app, ota_1, 0x340000, 0x330000, +spiffs, data, spiffs, 0x670000, 0x190000, diff --git a/firmware/atoms3r-headroom/platformio.ini b/firmware/atoms3r-headroom/platformio.ini new file mode 100644 index 0000000..5765760 --- /dev/null +++ b/firmware/atoms3r-headroom/platformio.ini @@ -0,0 +1,21 @@ +[env:m5stack-atoms3r] +platform = espressif32@6.7.0 +board = esp32-s3-devkitc-1 +framework = arduino +board_build.arduino.memory_type = qio_opi +board_build.partitions = partitions.csv +monitor_speed = 115200 +upload_speed = 115200 + +build_flags = + -DESP32S3 + -DBOARD_HAS_PSRAM + -mfix-esp32-psram-cache-issue + -DCORE_DEBUG_LEVEL=3 + -DARDUINO_USB_CDC_ON_BOOT=1 + -DARDUINO_USB_MODE=1 + +lib_deps = + M5Unified=https://github.com/m5stack/M5Unified + bblanchon/ArduinoJson@^7.4.2 + links2004/WebSockets@^2.6.1 diff --git a/firmware/atoms3r-headroom/src/face_renderer.cpp b/firmware/atoms3r-headroom/src/face_renderer.cpp new file mode 100644 index 0000000..d1d0ae3 --- /dev/null +++ b/firmware/atoms3r-headroom/src/face_renderer.cpp @@ -0,0 +1,302 @@ +#include "face_renderer.h" + +#include +#include +#include + +namespace { + +float 
clampFloat(float value, float minValue, float maxValue) { + return std::max(minValue, std::min(value, maxValue)); +} + +int displayRotationForDegrees(int degrees) { + int normalized = ((degrees % 360) + 360) % 360; + switch (normalized) { + case 90: + return 1; + case 180: + return 2; + case 270: + return 3; + default: + return 0; + } +} + +void drawThickLine(M5Canvas& canvas, int x0, int y0, int x1, int y1, uint16_t color) { + for (int offset = -2; offset <= 2; ++offset) { + canvas.drawLine(x0, y0 + offset, x1, y1 + offset, color); + } +} + +} // namespace + +void HeadroomFaceRenderer::scheduleNextBlink(uint32_t nowMs) { + // Random gap of 3 / 4 / 5 s, but 1 in 10 times a quick 0.3 s re-blink. + uint32_t gap; + if ((esp_random() % 10u) == 0u) { + gap = 300; + } else { + static const uint32_t gaps[3] = {3000, 4000, 5000}; + gap = gaps[esp_random() % 3u]; + } + nextBlinkAtMs_ = nowMs + gap; +} + +float HeadroomFaceRenderer::blinkOpenAmount(uint32_t nowMs) { + if (!blinkSeeded_) { + blinkSeeded_ = true; + scheduleNextBlink(nowMs); + } + if (nowMs < nextBlinkAtMs_) { + return 1.0f; + } + uint32_t phase = nowMs - nextBlinkAtMs_; + if (phase >= 160) { + scheduleNextBlink(nowMs); + return 1.0f; + } + if (phase < 60) { + return 1.0f - static_cast(phase) / 60.0f; + } + if (phase < 100) { + return 0.0f; + } + return static_cast(phase - 100) / 60.0f; +} + +void HeadroomFaceRenderer::begin(uint16_t width, uint16_t height, int rotationDegrees) { + width_ = width; + height_ = height; + canvas_.setColorDepth(16); + canvas_.createSprite(width_, height_); + setRotationDegrees(rotationDegrees); + canvas_.setTextDatum(middle_center); + canvas_.setTextSize(1); +} + +void HeadroomFaceRenderer::setRotationDegrees(int rotationDegrees) { + rotationDegrees_ = rotationDegrees; + M5.Display.setRotation(displayRotationForDegrees(rotationDegrees_)); +} + +void HeadroomFaceRenderer::draw(const HeadroomFaceState& state) { + canvas_.startWrite(); + canvas_.fillScreen(backgroundFor(state)); + 
drawHeadBase(state); + drawBrows(state); + drawEyes(state); + drawMouth(state); + + if (!state.connected) { + canvas_.drawCircle(width_ - 10, 10, 3, canvas_.color565(105, 112, 120)); + } else { + canvas_.fillCircle(width_ - 10, 10, 3, TFT_GREEN); + } + canvas_.endWrite(); + canvas_.pushSprite(0, 0); +} + +void HeadroomFaceRenderer::drawHeadBase(const HeadroomFaceState& state) { + uint16_t skin = canvas_.color565(255, 201, 150); + uint16_t cheek = canvas_.color565(255, 184, 132); + uint16_t hair = canvas_.color565(178, 112, 56); + uint16_t hairShadow = canvas_.color565(132, 78, 40); + uint16_t outline = canvas_.color565(168, 105, 63); + + canvas_.fillRoundRect(13, 14, 102, 100, 31, outline); + canvas_.fillRoundRect(16, 16, 96, 96, 29, skin); + canvas_.drawRoundRect(16, 16, 96, 96, 29, outline); + canvas_.fillRoundRect(22, 16, 84, 22, 13, hairShadow); + canvas_.fillRoundRect(20, 15, 88, 20, 12, hair); + canvas_.fillCircle(29, 32, 11, hair); + canvas_.fillCircle(45, 24, 14, hair); + canvas_.fillCircle(64, 21, 16, hair); + canvas_.fillCircle(84, 24, 14, hair); + canvas_.fillCircle(100, 32, 11, hair); + canvas_.fillTriangle(29, 34, 42, 35, 34, 43, hair); + canvas_.fillTriangle(56, 31, 69, 31, 62, 42, hair); + canvas_.fillTriangle(85, 34, 99, 34, 91, 43, hair); + canvas_.fillEllipse(37, 82, 10, 6, cheek); + canvas_.fillEllipse(91, 82, 10, 6, cheek); +} + +void HeadroomFaceRenderer::drawBrows(const HeadroomFaceState& state) { + uint16_t color = canvas_.color565(93, 55, 31); + int leftY = 45; + int rightY = 45; + int slant = 0; + + switch (state.expression) { + case HeadroomExpression::Permission: + color = canvas_.color565(112, 72, 28); + leftY = 42; + rightY = 42; + slant = 5; + break; + case HeadroomExpression::Failed: + color = canvas_.color565(110, 42, 31); + slant = -6; + break; + case HeadroomExpression::Success: + // Raised happy "\/" brows (inverse of への字), lifted slightly higher. + // Same brown as the normal brows for consistency. 
+ leftY = 40; + rightY = 40; + slant = -5; + break; + case HeadroomExpression::Thinking: + color = canvas_.color565(71, 63, 41); + leftY = 45; + rightY = 45; + slant = 5; + break; + case HeadroomExpression::Listening: + color = canvas_.color565(102, 61, 30); + leftY = 45; + rightY = 45; + break; + default: + break; + } + + drawThickLine(canvas_, 32, leftY + slant, 53, leftY - slant, color); + drawThickLine(canvas_, 75, rightY - slant, 96, rightY + slant, color); +} + +void HeadroomFaceRenderer::drawClosedEyeArc(int centerX, int eyeCenterY, uint16_t color) { + // Downward-convex "∪" eyelid arc (a dark lash line), not a white sliver. + const int radius = 15; + const int arcCenterY = eyeCenterY - 13; // bottom of the arc sits at the eye center + canvas_.fillArc(centerX, arcCenterY, radius, radius - 4, 25.0f, 155.0f, color); +} + +void HeadroomFaceRenderer::drawEyes(const HeadroomFaceState& state) { + int pupilOffsetX = static_cast(roundf(clampFloat(state.gazeX, -1.0f, 1.0f) * 3.0f)); + int pupilOffsetY = static_cast(roundf(clampFloat(state.gazeY, -1.0f, 1.0f) * 2.0f)); + + // Thinking: sweep the eyes slowly left/right like the PC/mobile face. + if (state.expression == HeadroomExpression::Thinking) { + float t = static_cast(millis()) / 1000.0f; + pupilOffsetX = static_cast(roundf(sinf(t * 1.6f) * 4.0f)); + pupilOffsetY = 0; + } + + int eyeY = 54; + int pupilY = 64; + const int leftCenterX = 42; + const int rightCenterX = 85; + + int eyeHeight = 20; + if (state.expression == HeadroomExpression::Thinking) { + eyeHeight = 17; + } + + float blink = blinkOpenAmount(millis()); + uint16_t lidColor = canvas_.color565(70, 42, 26); + + // Closed (blink): draw the dark ∪ eyelid arc instead of a white bar. 
+ if (blink <= 0.30f) { + drawClosedEyeArc(leftCenterX, pupilY, lidColor); + drawClosedEyeArc(rightCenterX, pupilY, lidColor); + return; + } + + eyeHeight = std::max(6, static_cast(roundf(static_cast(eyeHeight) * blink))); + canvas_.fillRoundRect(25, eyeY, 35, eyeHeight, 8, TFT_WHITE); + canvas_.fillRoundRect(68, eyeY, 35, eyeHeight, 8, TFT_WHITE); + + // Mid-blink: whites only, no pupils until the eye is open enough. + if (eyeHeight < 12) { + return; + } + + uint16_t pupilColor = TFT_BLACK; + if (state.expression == HeadroomExpression::Permission) { + pupilColor = TFT_NAVY; + } + canvas_.fillCircle(leftCenterX + pupilOffsetX, pupilY + pupilOffsetY, 6, pupilColor); + canvas_.fillCircle(rightCenterX + pupilOffsetX, pupilY + pupilOffsetY, 6, pupilColor); +} + +void HeadroomFaceRenderer::drawMouth(const HeadroomFaceState& state) { + float open = clampFloat(state.mouthOpen, 0.0f, 1.0f); + uint16_t mouthColor = canvas_.color565(116, 32, 28); + uint16_t mouthInnerColor = canvas_.color565(72, 20, 24); + + switch (state.expression) { + case HeadroomExpression::Failed: + drawThickLine(canvas_, 50, 94, 64, 86, mouthColor); + drawThickLine(canvas_, 64, 86, 78, 94, mouthColor); + return; + case HeadroomExpression::Permission: + canvas_.drawRoundRect(51, 84, 27, 18, 7, mouthColor); + canvas_.drawRoundRect(52, 85, 25, 16, 6, mouthColor); + return; + case HeadroomExpression::Listening: + canvas_.drawCircle(64, 92, 10, mouthColor); + canvas_.drawCircle(64, 92, 11, mouthColor); + return; + case HeadroomExpression::Success: + // While speaking, keep the talking mouth animation (口パク). + // When the mouth is closed and status is success, show the smile arc. 
+ if (open <= 0.12f) { + canvas_.fillArc(64, 82, 18, 13, 20.0f, 160.0f, mouthColor); + return; + } + break; + default: + break; + } + + int mouthHeight = 3 + static_cast(roundf(open * 24.0f)); + int mouthWidth = 34 + static_cast(roundf(open * 12.0f)); + int x = (width_ - mouthWidth) / 2; + + if (mouthHeight <= 5) { + canvas_.drawRoundRect(x, 88, mouthWidth, 5, 3, mouthColor); + canvas_.drawRoundRect(x, 89, mouthWidth, 5, 3, mouthColor); + } else { + canvas_.fillEllipse(width_ / 2, 91, mouthWidth / 2, mouthHeight / 2, mouthColor); + canvas_.fillEllipse(width_ / 2, 91, std::max(4, mouthWidth / 3), std::max(2, mouthHeight / 3), mouthInnerColor); + } +} + +uint16_t HeadroomFaceRenderer::backgroundFor(const HeadroomFaceState& state) const { + switch (state.expression) { + case HeadroomExpression::Listening: + return canvas_.color565(72, 44, 24); + case HeadroomExpression::Speaking: + return canvas_.color565(20, 62, 78); + case HeadroomExpression::Permission: + return canvas_.color565(82, 48, 24); + case HeadroomExpression::Success: + return canvas_.color565(22, 76, 58); + case HeadroomExpression::Failed: + return canvas_.color565(82, 34, 34); + case HeadroomExpression::Thinking: + return canvas_.color565(19, 48, 67); + default: + return canvas_.color565(13, 26, 38); + } +} + +uint16_t HeadroomFaceRenderer::accentFor(const HeadroomFaceState& state) const { + switch (state.expression) { + case HeadroomExpression::Listening: + return TFT_ORANGE; + case HeadroomExpression::Speaking: + return TFT_SKYBLUE; + case HeadroomExpression::Permission: + return TFT_YELLOW; + case HeadroomExpression::Success: + return TFT_GREEN; + case HeadroomExpression::Failed: + return TFT_RED; + case HeadroomExpression::Thinking: + return TFT_CYAN; + default: + return state.accentColor; + } +} diff --git a/firmware/atoms3r-headroom/src/face_renderer.h b/firmware/atoms3r-headroom/src/face_renderer.h new file mode 100644 index 0000000..ffccb89 --- /dev/null +++ 
b/firmware/atoms3r-headroom/src/face_renderer.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + +enum class HeadroomExpression { + Neutral, + Listening, + Speaking, + Permission, + Success, + Failed, + Thinking, +}; + +struct HeadroomFaceState { + HeadroomExpression expression = HeadroomExpression::Neutral; + float mouthOpen = 0.0f; + float gazeX = 0.0f; + float gazeY = 0.0f; + uint16_t accentColor = TFT_SKYBLUE; + bool connected = false; +}; + +class HeadroomFaceRenderer { +public: + void begin(uint16_t width = 128, uint16_t height = 128, int rotationDegrees = 0); + void setRotationDegrees(int rotationDegrees); + void draw(const HeadroomFaceState& state); + +private: + M5Canvas canvas_{&M5.Display}; + uint16_t width_ = 128; + uint16_t height_ = 128; + int rotationDegrees_ = 0; + + // Randomized blink scheduling (lifelike, not a fixed loop). + bool blinkSeeded_ = false; + uint32_t nextBlinkAtMs_ = 0; + void scheduleNextBlink(uint32_t nowMs); + float blinkOpenAmount(uint32_t nowMs); + + void drawHeadBase(const HeadroomFaceState& state); + void drawBrows(const HeadroomFaceState& state); + void drawEyes(const HeadroomFaceState& state); + void drawClosedEyeArc(int centerX, int eyeCenterY, uint16_t color); + void drawMouth(const HeadroomFaceState& state); + uint16_t backgroundFor(const HeadroomFaceState& state) const; + uint16_t accentFor(const HeadroomFaceState& state) const; +}; diff --git a/firmware/atoms3r-headroom/src/headroom_audio.cpp b/firmware/atoms3r-headroom/src/headroom_audio.cpp new file mode 100644 index 0000000..5d4d2a4 --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_audio.cpp @@ -0,0 +1,282 @@ +#include "headroom_audio.h" + +#include +#include +#include +#include +#include + +namespace { + +constexpr size_t kWavHeaderProbeBytes = 96; + +uint32_t readLe32(const uint8_t* data) { + return static_cast(data[0]) | (static_cast(data[1]) << 8) | (static_cast(data[2]) << 16) | + (static_cast(data[3]) << 24); +} + +uint16_t readLe16(const 
uint8_t* data) { + return static_cast(data[0]) | (static_cast(data[1]) << 8); +} + +bool startsWithHttp(const String& url) { + return url.startsWith("http://") || url.startsWith("https://"); +} + +} // namespace + +void HeadroomAudio::begin(const HeadroomSettingsData& settings) { + httpBase_ = settings.faceHttpBase; + authToken_ = settings.authToken; + maxBase64Seconds_ = max(1, min(15, settings.maxBase64TtsSeconds)); + maxHttpBytes_ = max(100000, settings.maxHttpTtsBytes); + M5.Speaker.setVolume(130); + M5.Speaker.begin(); +} + +void HeadroomAudio::loop() { + releaseActive(); +} + +void HeadroomAudio::stop() { + M5.Speaker.stop(); + releaseActive(); +} + +bool HeadroomAudio::busy() const { + return M5.Speaker.isPlaying(); +} + +HeadroomAudioResult HeadroomAudio::playBase64Wav(const char* audioBase64, size_t base64Length, int sampleRateHint) { + if (!audioBase64 || base64Length == 0) { + return HeadroomAudioResult::Ignored; + } + + size_t decodedCapacity = ((base64Length * 3) / 4) + 8; + size_t roughLimit = static_cast(maxBase64Seconds_) * static_cast(sampleRateHint > 0 ? 
sampleRateHint : 24000) * 2 + 128; + if (decodedCapacity > roughLimit) { + Serial.printf("tts_audio too large before decode base64=%u decoded~=%u limit=%u\n", static_cast(base64Length), + static_cast(decodedCapacity), static_cast(roughLimit)); + return HeadroomAudioResult::TooLarge; + } + + uint8_t* wav = static_cast(ps_malloc(decodedCapacity)); + if (!wav) { + wav = static_cast(malloc(decodedCapacity)); + } + if (!wav) { + return HeadroomAudioResult::DecodeFailed; + } + + size_t decodedLength = 0; + int rc = mbedtls_base64_decode(wav, decodedCapacity, &decodedLength, reinterpret_cast(audioBase64), base64Length); + if (rc != 0 || decodedLength == 0) { + free(wav); + Serial.printf("base64 decode failed rc=%d\n", rc); + return HeadroomAudioResult::DecodeFailed; + } + + int sampleRate = 0; + size_t dataBytes = 0; + uint16_t bits = 0; + uint16_t channels = 0; + if (!inspectWav(wav, decodedLength, &sampleRate, &dataBytes, &bits, &channels)) { + free(wav); + return HeadroomAudioResult::Unsupported; + } + if (bits != 16 || channels != 1) { + Serial.printf("unsupported wav format bits=%u channels=%u\n", bits, channels); + free(wav); + return HeadroomAudioResult::Unsupported; + } + size_t maxDataBytes = static_cast(maxBase64Seconds_) * static_cast(sampleRate) * 2; + if (dataBytes > maxDataBytes) { + free(wav); + return HeadroomAudioResult::TooLarge; + } + + return playOwnedWav(wav, decodedLength, true); +} + +HeadroomAudioResult HeadroomAudio::playHttpWavRef(const String& url) { + String fullUrl = absoluteUrl(url); + if (!startsWithHttp(fullUrl)) { + return HeadroomAudioResult::Unsupported; + } + + WiFiClient client; + HTTPClient http; + if (!http.begin(client, fullUrl)) { + return HeadroomAudioResult::HttpFailed; + } + if (authToken_.length() > 0) { + http.addHeader("Authorization", String("Bearer ") + authToken_); + } + + int status = http.GET(); + if (status != HTTP_CODE_OK) { + Serial.printf("tts_audio_ref http status=%d\n", status); + http.end(); + return 
HeadroomAudioResult::HttpFailed; + } + + int contentLength = http.getSize(); + if (contentLength <= 0 || contentLength > maxHttpBytes_) { + Serial.printf("tts_audio_ref size rejected length=%d limit=%d\n", contentLength, maxHttpBytes_); + http.end(); + return HeadroomAudioResult::TooLarge; + } + + uint8_t* wav = static_cast(ps_malloc(contentLength)); + if (!wav) { + wav = static_cast(malloc(contentLength)); + } + if (!wav) { + http.end(); + return HeadroomAudioResult::DecodeFailed; + } + + WiFiClient* stream = http.getStreamPtr(); + size_t offset = 0; + while (http.connected() && offset < static_cast(contentLength)) { + int available = stream->available(); + if (available <= 0) { + delay(1); + continue; + } + int remaining = contentLength - static_cast(offset); + int readLen = stream->readBytes(wav + offset, min(available, remaining)); + if (readLen <= 0) { + break; + } + offset += static_cast(readLen); + } + http.end(); + + if (offset != static_cast(contentLength)) { + free(wav); + Serial.printf("tts_audio_ref incomplete read got=%u expected=%u\n", static_cast(offset), static_cast(contentLength)); + return HeadroomAudioResult::HttpFailed; + } + + int sampleRate = 0; + size_t dataBytes = 0; + uint16_t bits = 0; + uint16_t channels = 0; + if (!inspectWav(wav, offset, &sampleRate, &dataBytes, &bits, &channels) || bits != 16 || channels != 1) { + free(wav); + return HeadroomAudioResult::Unsupported; + } + + return playOwnedWav(wav, offset, true); +} + +HeadroomAudioResult HeadroomAudio::playWavBytes(const uint8_t* wav, size_t length) { + if (!wav || length == 0) { + return HeadroomAudioResult::Ignored; + } + if (length > static_cast(maxHttpBytes_)) { + return HeadroomAudioResult::TooLarge; + } + + int sampleRate = 0; + size_t dataBytes = 0; + uint16_t bits = 0; + uint16_t channels = 0; + if (!inspectWav(wav, length, &sampleRate, &dataBytes, &bits, &channels) || bits != 16 || channels != 1) { + return HeadroomAudioResult::Unsupported; + } + + uint8_t* owned = 
static_cast<uint8_t*>(ps_malloc(length)); + if (!owned) { + owned = static_cast<uint8_t*>(malloc(length)); + } + if (!owned) { + return HeadroomAudioResult::DecodeFailed; + } + memcpy(owned, wav, length); + return playOwnedWav(owned, length, true); +} + +void HeadroomAudio::releaseActive() { + if (!activeWav_) { + return; + } + if (!M5.Speaker.isPlaying()) { + free(activeWav_); + activeWav_ = nullptr; + activeWavLength_ = 0; + } +} + +HeadroomAudioResult HeadroomAudio::playOwnedWav(uint8_t* wav, size_t length, bool takeOwnership) { + releaseActive(); + M5.Speaker.stop(); + if (activeWav_) { + free(activeWav_); + activeWav_ = nullptr; + activeWavLength_ = 0; + } + + bool ok = M5.Speaker.playWav(wav, length, 1, -1, true); + if (!ok) { + if (takeOwnership) { + free(wav); + } + return HeadroomAudioResult::PlaybackFailed; + } + + if (takeOwnership) { + activeWav_ = wav; + activeWavLength_ = length; + } + return HeadroomAudioResult::Ok; +} + +bool HeadroomAudio::inspectWav(const uint8_t* wav, size_t length, int* sampleRate, size_t* dataBytes, uint16_t* bitsPerSample, uint16_t* channels) { + if (!wav || length < 44 || memcmp(wav, "RIFF", 4) != 0 || memcmp(wav + 8, "WAVE", 4) != 0) { + return false; + } + + bool sawFmt = false; + bool sawData = false; + size_t offset = 12; + while (offset + 8 <= min(length, kWavHeaderProbeBytes)) { + const uint8_t* chunk = wav + offset; + uint32_t chunkSize = readLe32(chunk + 4); + size_t chunkStart = offset + 8; + if (chunkSize > length - chunkStart) { // subtract, don't add: chunkStart <= length here, and the sum can wrap a 32-bit size_t + return false; + } + + if (memcmp(chunk, "fmt ", 4) == 0 && chunkSize >= 16) { + uint16_t audioFormat = readLe16(wav + chunkStart); + *channels = readLe16(wav + chunkStart + 2); + *sampleRate = static_cast<int>(readLe32(wav + chunkStart + 4)); + *bitsPerSample = readLe16(wav + chunkStart + 14); + sawFmt = audioFormat == 1; + } else if (memcmp(chunk, "data", 4) == 0) { + *dataBytes = chunkSize; + sawData = true; + break; + } + + offset = chunkStart + chunkSize + (chunkSize % 2); + } + + return sawFmt && sawData && 
*sampleRate > 0 && *dataBytes > 0; +} + +String HeadroomAudio::absoluteUrl(const String& url) const { + if (startsWithHttp(url)) { + return url; + } + if (!url.startsWith("/")) { + return url; + } + String base = httpBase_; + if (base.endsWith("/")) { + base.remove(base.length() - 1); + } + return base + url; +} diff --git a/firmware/atoms3r-headroom/src/headroom_audio.h b/firmware/atoms3r-headroom/src/headroom_audio.h new file mode 100644 index 0000000..ffb983e --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_audio.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +#include "headroom_settings.h" + +enum class HeadroomAudioResult { + Ok, + Ignored, + TooLarge, + DecodeFailed, + HttpFailed, + Unsupported, + PlaybackFailed, +}; + +class HeadroomAudio { +public: + void begin(const HeadroomSettingsData& settings); + void loop(); + void stop(); + bool busy() const; + + HeadroomAudioResult playBase64Wav(const char* audioBase64, size_t base64Length, int sampleRateHint); + HeadroomAudioResult playHttpWavRef(const String& url); + HeadroomAudioResult playWavBytes(const uint8_t* wav, size_t length); + +private: + String httpBase_; + String authToken_; + int maxBase64Seconds_ = 10; + int maxHttpBytes_ = 1200000; + uint8_t* activeWav_ = nullptr; + size_t activeWavLength_ = 0; + + void releaseActive(); + HeadroomAudioResult playOwnedWav(uint8_t* wav, size_t length, bool takeOwnership); + bool inspectWav(const uint8_t* wav, size_t length, int* sampleRate, size_t* dataBytes, uint16_t* bitsPerSample, uint16_t* channels); + String absoluteUrl(const String& url) const; +}; diff --git a/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp b/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp new file mode 100644 index 0000000..427fc72 --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp @@ -0,0 +1,299 @@ +#include "headroom_ingress_server.h" + +#include + +namespace { + +constexpr size_t kMinimumPayloadBytes = 32768; +constexpr size_t 
kMaximumPayloadBytes = 1100 * 1024; + +bool timingSafeStringEquals(const String& left, const String& right) { + if (left.length() != right.length()) { + return false; + } + uint8_t diff = 0; + for (size_t i = 0; i < left.length(); ++i) { + diff |= static_cast(left.charAt(i)) ^ static_cast(right.charAt(i)); + } + return diff == 0; +} + +String bearerToken(const String& authorization) { + if (!authorization.startsWith("Bearer ")) { + return String(); + } + String token = authorization.substring(7); + token.trim(); + return token; +} + +size_t estimatePayloadLimit(const HeadroomSettingsData& settings) { + size_t decodedBytes = static_cast(max(1, min(15, settings.maxBase64TtsSeconds))) * 24000 * 2 + 128; + size_t jsonBytes = ((decodedBytes + 2) / 3) * 4 + 16384; + return min(kMaximumPayloadBytes, max(kMinimumPayloadBytes, jsonBytes)); +} + +String jsonEscape(const String& value) { + String escaped; + escaped.reserve(value.length() + 8); + for (size_t i = 0; i < value.length(); ++i) { + char c = value.charAt(i); + switch (c) { + case '"': + escaped += F("\\\""); + break; + case '\\': + escaped += F("\\\\"); + break; + case '\n': + escaped += F("\\n"); + break; + case '\r': + escaped += F("\\r"); + break; + case '\t': + escaped += F("\\t"); + break; + default: + if (static_cast(c) < 0x20) { + escaped += ' '; + } else { + escaped += c; + } + break; + } + } + return escaped; +} + +} // namespace + +void HeadroomIngressServer::begin(const HeadroomSettingsData& settings, HeadroomTransport& transport, HeadroomAudio& audio, + HeadroomFaceState& faceState) { + transport_ = &transport; + audio_ = &audio; + faceState_ = &faceState; + authToken_ = settings.authToken; + deviceId_ = settings.deviceId; + maxPayloadBytes_ = estimatePayloadLimit(settings); + + const char* headerKeys[] = {"Authorization", "X-Headroom-Auth"}; + server_.collectHeaders(headerKeys, 2); + server_.on("/health", HTTP_GET, [this]() { handleHealth(); }); + server_.on("/api/headroom/payload", HTTP_OPTIONS, 
[this]() { handleOptions(); }); + server_.on("/api/headroom/payload", HTTP_POST, [this]() { handlePayload(); }); + server_.on("/api/headroom/audio", HTTP_OPTIONS, [this]() { handleOptions(); }); + server_.on("/api/headroom/audio", HTTP_POST, [this]() { handleAudio(); }, [this]() { handleAudioRaw(); }); + server_.onNotFound([this]() { handleNotFound(); }); + server_.begin(); + active_ = true; + + Serial.printf("ingress listening http://%s/ max_payload=%u\n", WiFi.localIP().toString().c_str(), + static_cast(maxPayloadBytes_)); +} + +void HeadroomIngressServer::loop() { + if (!active_) { + return; + } + server_.handleClient(); +} + +bool HeadroomIngressServer::active() const { + return active_; +} + +bool HeadroomIngressServer::recentlyActive(uint32_t windowMs) const { + return lastPayloadMs_ != 0 && millis() - lastPayloadMs_ <= windowMs; +} + +void HeadroomIngressServer::handleHealth() { + String body = F("{\"ok\":true,\"service\":\"atoms3r-headroom\",\"device_id\":\""); + body += jsonEscape(deviceId_); + body += F("\",\"ip\":\""); + body += WiFi.localIP().toString(); + body += F("\",\"ingress\":true}"); + sendJson(200, body); +} + +void HeadroomIngressServer::handlePayload() { + if (!isAuthorized()) { + sendJson(401, F("{\"ok\":false,\"error\":\"unauthorized\"}")); + return; + } + if (!transport_) { + sendJson(503, F("{\"ok\":false,\"error\":\"transport_not_ready\"}")); + return; + } + + String body = server_.arg("plain"); + if (body.length() == 0) { + sendJson(400, F("{\"ok\":false,\"error\":\"empty_body\"}")); + return; + } + if (body.length() > maxPayloadBytes_) { + sendJson(413, F("{\"ok\":false,\"error\":\"payload_too_large\"}")); + return; + } + + bool ok = transport_->handleJsonPayload(reinterpret_cast(body.c_str()), body.length()); + if (!ok) { + sendJson(400, F("{\"ok\":false,\"error\":\"invalid_json\"}")); + return; + } + + lastPayloadMs_ = millis(); + sendJson(202, F("{\"ok\":true}")); +} + +void HeadroomIngressServer::handleAudioRaw() { + HTTPRaw& raw = 
server_.raw(); + + if (raw.status == RAW_START) { + releaseAudioRawBuffer(); + audioRawUnauthorized_ = false; + audioRawTooLarge_ = false; + audioRawFailed_ = false; + + if (!isAuthorized()) { + audioRawUnauthorized_ = true; + return; + } + + int contentLength = server_.clientContentLength(); + if (contentLength <= 0) { + audioRawFailed_ = true; + return; + } + if (static_cast(contentLength) > maxPayloadBytes_) { + audioRawTooLarge_ = true; + return; + } + + audioRawCapacity_ = static_cast(contentLength); + audioRawBuffer_ = static_cast(ps_malloc(audioRawCapacity_)); + if (!audioRawBuffer_) { + audioRawBuffer_ = static_cast(malloc(audioRawCapacity_)); + } + if (!audioRawBuffer_) { + audioRawCapacity_ = 0; + audioRawFailed_ = true; + } + return; + } + + if (raw.status == RAW_WRITE) { + if (audioRawUnauthorized_ || audioRawTooLarge_ || audioRawFailed_) { + return; + } + if (!audioRawBuffer_ || audioRawLength_ + raw.currentSize > audioRawCapacity_) { + audioRawTooLarge_ = true; + releaseAudioRawBuffer(); + return; + } + memcpy(audioRawBuffer_ + audioRawLength_, raw.buf, raw.currentSize); + audioRawLength_ += raw.currentSize; + return; + } + + if (raw.status == RAW_ABORTED) { + audioRawFailed_ = true; + releaseAudioRawBuffer(); + } +} + +void HeadroomIngressServer::handleAudio() { + if (!isAuthorized()) { + sendJson(401, F("{\"ok\":false,\"error\":\"unauthorized\"}")); + return; + } + if (!audio_ || !faceState_) { + sendJson(503, F("{\"ok\":false,\"error\":\"audio_not_ready\"}")); + return; + } + + if (audioRawUnauthorized_) { + sendJson(401, F("{\"ok\":false,\"error\":\"unauthorized\"}")); + return; + } + if (audioRawTooLarge_) { + releaseAudioRawBuffer(); + sendJson(413, F("{\"ok\":false,\"error\":\"payload_too_large\"}")); + return; + } + if (audioRawFailed_) { + releaseAudioRawBuffer(); + sendJson(400, F("{\"ok\":false,\"error\":\"audio_body_failed\"}")); + return; + } + if (!audioRawBuffer_ || audioRawLength_ == 0) { + sendJson(400, 
F("{\"ok\":false,\"error\":\"empty_body\"}")); + return; + } + + HeadroomAudioResult result = audio_->playWavBytes(audioRawBuffer_, audioRawLength_); + size_t audioBytes = audioRawLength_; + releaseAudioRawBuffer(); + if (result != HeadroomAudioResult::Ok) { + Serial.printf("ingress audio failed result=%d bytes=%u\n", static_cast(result), static_cast(audioBytes)); + faceState_->expression = HeadroomExpression::Failed; + String body = F("{\"ok\":false,\"error\":\"audio_rejected\",\"result\":"); + body += static_cast(result); + body += F("}"); + sendJson(400, body); + return; + } + + faceState_->expression = HeadroomExpression::Speaking; + faceState_->mouthOpen = max(faceState_->mouthOpen, 0.28f); + lastPayloadMs_ = millis(); + sendJson(202, F("{\"ok\":true}")); +} + +void HeadroomIngressServer::handleOptions() { + server_.sendHeader("Access-Control-Allow-Origin", "*"); + server_.sendHeader("Access-Control-Allow-Headers", "content-type,authorization,x-headroom-auth,x-utterance-id,x-generation"); + server_.sendHeader("Access-Control-Allow-Methods", "POST,OPTIONS"); + server_.send(204, "text/plain", ""); +} + +void HeadroomIngressServer::handleNotFound() { + sendJson(404, F("{\"ok\":false,\"error\":\"not_found\"}")); +} + +bool HeadroomIngressServer::isAuthorized() { + if (authToken_.length() == 0) { + return true; + } + + String queryToken = server_.arg("auth_token"); + if (queryToken.length() == 0) { + queryToken = server_.arg("token"); + } + if (queryToken.length() > 0 && timingSafeStringEquals(queryToken, authToken_)) { + return true; + } + + String headerToken = server_.header("X-Headroom-Auth"); + if (headerToken.length() > 0 && timingSafeStringEquals(headerToken, authToken_)) { + return true; + } + + String bearer = bearerToken(server_.header("Authorization")); + return bearer.length() > 0 && timingSafeStringEquals(bearer, authToken_); +} + +void HeadroomIngressServer::releaseAudioRawBuffer() { + if (audioRawBuffer_) { + free(audioRawBuffer_); + } + 
audioRawBuffer_ = nullptr; + audioRawLength_ = 0; + audioRawCapacity_ = 0; +} + +void HeadroomIngressServer::sendJson(int statusCode, const String& body) { + server_.sendHeader("Cache-Control", "no-store"); + server_.sendHeader("Access-Control-Allow-Origin", "*"); + server_.send(statusCode, "application/json; charset=utf-8", body); +} diff --git a/firmware/atoms3r-headroom/src/headroom_ingress_server.h b/firmware/atoms3r-headroom/src/headroom_ingress_server.h new file mode 100644 index 0000000..87fdb5e --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_ingress_server.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#include "face_renderer.h" +#include "headroom_audio.h" +#include "headroom_settings.h" +#include "headroom_transport.h" + +class HeadroomIngressServer { +public: + void begin(const HeadroomSettingsData& settings, HeadroomTransport& transport, HeadroomAudio& audio, HeadroomFaceState& faceState); + void loop(); + bool active() const; + bool recentlyActive(uint32_t windowMs) const; + +private: + WebServer server_{80}; + HeadroomTransport* transport_ = nullptr; + HeadroomAudio* audio_ = nullptr; + HeadroomFaceState* faceState_ = nullptr; + String authToken_; + String deviceId_; + bool active_ = false; + size_t maxPayloadBytes_ = 720000; + uint32_t lastPayloadMs_ = 0; + uint8_t* audioRawBuffer_ = nullptr; + size_t audioRawLength_ = 0; + size_t audioRawCapacity_ = 0; + bool audioRawUnauthorized_ = false; + bool audioRawTooLarge_ = false; + bool audioRawFailed_ = false; + + void handleHealth(); + void handlePayload(); + void handleAudioRaw(); + void handleAudio(); + void handleOptions(); + void handleNotFound(); + bool isAuthorized(); + void releaseAudioRawBuffer(); + void sendJson(int statusCode, const String& body); +}; diff --git a/firmware/atoms3r-headroom/src/headroom_settings.cpp b/firmware/atoms3r-headroom/src/headroom_settings.cpp new file mode 100644 index 0000000..76f7ae7 --- /dev/null +++ 
b/firmware/atoms3r-headroom/src/headroom_settings.cpp
@@ -0,0 +1,211 @@
+#include "headroom_settings.h"
+
+#include <Preferences.h>
+
+#include "headroom_config.h"
+
+namespace {
+
+// NVS namespace that holds every persisted setting.
+constexpr const char* kNamespace = "rmh";
+
+// An empty SSID or the example placeholder counts as "Wi-Fi not configured".
+bool isPlaceholderWifi(const String& ssid) {
+  return ssid.length() == 0 || ssid == "your-wifi";
+}
+
+// Read helpers that return `fallback` when the key has never been stored.
+String readString(Preferences& prefs, const char* key, const String& fallback) {
+  if (!prefs.isKey(key)) {
+    return fallback;
+  }
+  return prefs.getString(key, fallback);
+}
+
+int readInt(Preferences& prefs, const char* key, int fallback) {
+  if (!prefs.isKey(key)) {
+    return fallback;
+  }
+  return prefs.getInt(key, fallback);
+}
+
+// True when `url` contains `host` as a complete address, i.e. not preceded by a
+// digit or dot and not followed by a digit. A plain substring test would also
+// match longer addresses such as 192.168.1.105 when looking for 192.168.1.10.
+bool mentionsHost(const String& url, const char* host) {
+  const String needle(host);
+  int from = 0;
+  while (true) {
+    const int at = url.indexOf(needle, from);
+    if (at < 0) {
+      return false;
+    }
+    const int end = at + static_cast<int>(needle.length());
+    const char before = at > 0 ? url.charAt(at - 1) : '\0';
+    const char after = end < static_cast<int>(url.length()) ? url.charAt(end) : '\0';
+    const bool extendsLeft = before == '.' || (before >= '0' && before <= '9');
+    const bool extendsRight = after >= '0' && after <= '9';
+    if (!extendsLeft && !extendsRight) {
+      return true;
+    }
+    from = at + 1;
+  }
+}
+
+// True when `url` points at one of the two hard-coded legacy hosts.
+bool usesLegacyHost(const String& url) {
+  return mentionsHost(url, "192.168.1.10") || mentionsHost(url, "192.168.1.34");
+}
+
+}  // namespace
+
+// Loads compile-time defaults, then overlays whatever is stored in NVS.
+void HeadroomSettings::begin() {
+  loadCompileDefaults();
+  loadNvsOverrides();
+}
+
+const HeadroomSettingsData& HeadroomSettings::data() const {
+  return data_;
+}
+
+HeadroomSettingsData HeadroomSettings::editable() const {
+  return data_;
+}
+
+// Normalizes the rotation fields, writes every setting to NVS and adopts the
+// result as the live configuration. Returns false if NVS cannot be opened.
+bool HeadroomSettings::save(const HeadroomSettingsData& next) {
+  HeadroomSettingsData normalized = next;
+  normalized.faceRotationDegrees = normalizeRotation(normalized.faceRotationDegrees);
+  normalized.upSideDegrees = normalizeRotation(normalized.upSideDegrees);
+
+  Preferences prefs;
+  if (!prefs.begin(kNamespace, false)) {
+    return false;
+  }
+
+  prefs.putString("ssid", normalized.wifiSsid);
+  prefs.putString("wifi_pw", normalized.wifiPassword);
+  prefs.putString("http_base", normalized.faceHttpBase);
+  prefs.putString("ws_url", normalized.faceWsUrl);
+  prefs.putString("auth", normalized.authToken);
+  prefs.putString("device_id", normalized.deviceId);
+  prefs.putString("display_id", normalized.displayAgentId);
+  prefs.putString("input_id", normalized.inputTargetAgentId);
+  prefs.putInt("max_b64_sec", normalized.maxBase64TtsSeconds);
+  prefs.putInt("max_http_b", normalized.maxHttpTtsBytes);
+  prefs.putInt("rotation", normalized.faceRotationDegrees);
+  prefs.putString("pose", placementPoseName(normalized.placementPose));
+  prefs.putInt("up_side", normalized.upSideDegrees);
+  prefs.end();
+
+  data_ = normalized;
+  loadedFromNvs_ = true;
+  return true;
+}
+
+bool HeadroomSettings::hasUsableWifi() const {
+  return !isPlaceholderWifi(data_.wifiSsid);
+}
+
+bool HeadroomSettings::hasSavedSettings() const {
+  return loadedFromNvs_;
+}
+
+bool HeadroomSettings::isValidRotation(int degrees) {
+  return degrees == 0 || degrees == 90 || degrees == 180 || degrees == 270;
+}
+
+// Snaps any angle (negatives included) to the nearest of 0/90/180/270; ties round up.
+int HeadroomSettings::normalizeRotation(int degrees) {
+  int normalized = ((degrees % 360) + 360) % 360;
+  if (normalized < 45 || normalized >= 315) {
+    return 0;
+  }
+  if (normalized < 135) {
+    return 90;
+  }
+  if (normalized < 225) {
+    return 180;
+  }
+  return 270;
+}
+
+// "side_up" and its alias "screen_forward" select SideUp; anything else ScreenUp.
+HeadroomPlacementPose HeadroomSettings::parsePlacementPose(const String& value) {
+  if (value == "side_up" || value == "screen_forward") {
+    return HeadroomPlacementPose::SideUp;
+  }
+  return HeadroomPlacementPose::ScreenUp;
+}
+
+const char* HeadroomSettings::placementPoseName(HeadroomPlacementPose pose) {
+  switch (pose) {
+    case HeadroomPlacementPose::SideUp:
+      return "side_up";
+    case HeadroomPlacementPose::ScreenUp:
+    default:
+      return "screen_up";
+  }
+}
+
+// Seeds data_ from the HEADROOM_* macros (presumably defined in headroom_config.h).
+void HeadroomSettings::loadCompileDefaults() {
+  data_.wifiSsid = HEADROOM_WIFI_SSID;
+  data_.wifiPassword = HEADROOM_WIFI_PASSWORD;
+  data_.faceHttpBase = HEADROOM_FACE_HTTP_BASE;
+  data_.faceWsUrl = HEADROOM_FACE_WS_URL;
+  data_.authToken = HEADROOM_FACE_AUTH_TOKEN;
+  data_.deviceId = HEADROOM_DEVICE_ID;
+  data_.displayAgentId = HEADROOM_DISPLAY_AGENT_ID;
+  data_.inputTargetAgentId = HEADROOM_INPUT_TARGET_AGENT_ID;
+  data_.maxBase64TtsSeconds = HEADROOM_MAX_BASE64_TTS_SECONDS;
+  data_.maxHttpTtsBytes = HEADROOM_MAX_HTTP_TTS_BYTES;
+  data_.faceRotationDegrees = normalizeRotation(HEADROOM_FACE_ROTATION_DEGREES);
+  data_.placementPose = parsePlacementPose(HEADROOM_PLACEMENT_POSE);
+  data_.upSideDegrees = normalizeRotation(HEADROOM_UP_SIDE_DEGREES);
+}
+
+// Overlays values stored in NVS on top of the compile-time defaults. Keys that
+// were never stored keep their default; loadedFromNvs_ records whether any of
+// device_id, ssid or ws_url exists.
+void HeadroomSettings::loadNvsOverrides() {
+  String compileAuthToken = data_.authToken;
+  String compileHttpBase = data_.faceHttpBase;
+  String compileWsUrl = data_.faceWsUrl;
+
+  Preferences prefs;
+  if (!prefs.begin(kNamespace, true)) {
+    loadedFromNvs_ = false;
+    return;
+  }
+
+  loadedFromNvs_ = prefs.isKey("device_id") || prefs.isKey("ssid") || prefs.isKey("ws_url");
+  data_.wifiSsid = readString(prefs, "ssid", data_.wifiSsid);
+  data_.wifiPassword = readString(prefs, "wifi_pw", data_.wifiPassword);
+  data_.faceHttpBase = readString(prefs, "http_base", data_.faceHttpBase);
+  data_.faceWsUrl = readString(prefs, "ws_url", data_.faceWsUrl);
+  // NOTE(review): saved URLs that point at these two hosts are replaced by the
+  // compile-time defaults - presumably a migration away from earlier development
+  // addresses; confirm the list is still wanted.
+  if (usesLegacyHost(data_.faceHttpBase) && compileHttpBase.length() > 0) {
+    data_.faceHttpBase = compileHttpBase;
+  }
+  if (usesLegacyHost(data_.faceWsUrl) && compileWsUrl.length() > 0) {
+    data_.faceWsUrl = compileWsUrl;
+  }
+  data_.authToken = readString(prefs, "auth", data_.authToken);
+  // An empty stored token never overrides a non-empty compile-time token.
+  if (data_.authToken.length() == 0 && compileAuthToken.length() > 0) {
+    data_.authToken = compileAuthToken;
+  }
+  data_.deviceId = readString(prefs, "device_id", data_.deviceId);
+  data_.displayAgentId = readString(prefs, "display_id", data_.displayAgentId);
+  data_.inputTargetAgentId = readString(prefs, "input_id", data_.inputTargetAgentId);
+  data_.maxBase64TtsSeconds = readInt(prefs, "max_b64_sec", data_.maxBase64TtsSeconds);
+  data_.maxHttpTtsBytes = readInt(prefs, "max_http_b", data_.maxHttpTtsBytes);
+  data_.faceRotationDegrees = normalizeRotation(readInt(prefs, "rotation", data_.faceRotationDegrees));
+  data_.placementPose = parsePlacementPose(readString(prefs, "pose", placementPoseName(data_.placementPose)));
+  data_.upSideDegrees = normalizeRotation(readInt(prefs, "up_side", data_.upSideDegrees));
+  prefs.end();
+}
diff --git a/firmware/atoms3r-headroom/src/headroom_settings.h b/firmware/atoms3r-headroom/src/headroom_settings.h
new file mode 100644
index 0000000..fcbca28
--- /dev/null
+++ b/firmware/atoms3r-headroom/src/headroom_settings.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <Arduino.h>
+
+// Physical placement of the device; persisted as "screen_up" / "side_up".
+enum class HeadroomPlacementPose {
+  ScreenUp,
+  SideUp,
+};
+
+// All user-configurable settings. The member initializers below are fallbacks;
+// HeadroomSettings overwrites them from compile-time macros and then from NVS.
+struct HeadroomSettingsData {
+  String wifiSsid;
+  String wifiPassword;
+  String faceHttpBase;
+  String faceWsUrl;
+  String authToken;
+  String deviceId;
+  String displayAgentId;
+  String inputTargetAgentId;
+  int maxBase64TtsSeconds = 10;
+  int maxHttpTtsBytes = 1200000;
+  int faceRotationDegrees = 0;
+  HeadroomPlacementPose placementPose = HeadroomPlacementPose::ScreenUp;
+  int upSideDegrees = 0;
+};
+
+// Owns the live HeadroomSettingsData and its persistence in NVS.
+class HeadroomSettings {
+public:
+  void begin();
+  const HeadroomSettingsData& data() const;
+  HeadroomSettingsData editable() const;
+  bool save(const HeadroomSettingsData& next);
+  bool hasUsableWifi() const;
+  bool hasSavedSettings() const;
+
+  static bool isValidRotation(int degrees);
+  static int normalizeRotation(int degrees);
+  static HeadroomPlacementPose parsePlacementPose(const String& value);
+  static const char* placementPoseName(HeadroomPlacementPose pose);
+
+private:
+  HeadroomSettingsData data_;
+  bool loadedFromNvs_ = false;
+
+  void loadCompileDefaults();
+  void loadNvsOverrides();
+};
diff --git a/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp b/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp
new file mode 100644
index 0000000..f01b0ad
--- /dev/null
+++ b/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp
@@ -0,0 +1,215 @@
+#include "headroom_setup_portal.h"
+
+namespace {
+
+constexpr byte kDnsPort = 53;
+
+// Escapes text for insertion into HTML element content or a quoted attribute
+// value: & < > " and ' are replaced by character references, everything else is
+// copied unchanged.
+String htmlEscape(const String& input) {
+  String escaped;
+  escaped.reserve(input.length() + 8);
+  for (size_t i = 0; i < input.length(); ++i) {
+    char c = input.charAt(i);
+    switch (c) {
+      case '&':
+        escaped += F("&amp;");
+        break;
+      case '<':
+        escaped += F("&lt;");
+        break;
+      case '>':
+        escaped += F("&gt;");
+        break;
+      case '"':
+        escaped += F("&quot;");
+        break;
+      case '\'':
+        escaped += F("&#39;");
+        break;
+      default:
+        escaped += c;
+        break;
+    }
+  }
+  return escaped;
+}
+
+String selectedIf(bool selected) { + return selected ? F(" selected") : String(); +} + +int requestInt(WebServer& server, const char* name, int fallback) { + if (!server.hasArg(name)) { + return fallback; + } + return server.arg(name).toInt(); +} + +} // namespace + +HeadroomSetupPortal::HeadroomSetupPortal(HeadroomSettings& settings) + : settings_(settings), server_(80) {} + +bool HeadroomSetupPortal::begin() { + uint64_t mac = ESP.getEfuseMac(); + char suffix[5]; + snprintf(suffix, sizeof(suffix), "%04X", static_cast(mac & 0xFFFF)); + ssid_ = String("RMH-SETUP-") + suffix; + + WiFi.mode(WIFI_AP); + if (!WiFi.softAP(ssid_.c_str())) { + return false; + } + + IPAddress apIp = WiFi.softAPIP(); + dns_.start(kDnsPort, "*", apIp); + + server_.on("/", HTTP_GET, [this]() { handleRoot(); }); + server_.on("/save", HTTP_POST, [this]() { handleSave(); }); + server_.onNotFound([this]() { handleNotFound(); }); + server_.begin(); + active_ = true; + return true; +} + +void HeadroomSetupPortal::handleClient() { + if (!active_) { + return; + } + dns_.processNextRequest(); + server_.handleClient(); +} + +bool HeadroomSetupPortal::active() const { + return active_; +} + +const String& HeadroomSetupPortal::ssid() const { + return ssid_; +} + +IPAddress HeadroomSetupPortal::ip() const { + return WiFi.softAPIP(); +} + +void HeadroomSetupPortal::handleRoot() { + server_.send(200, "text/html; charset=utf-8", renderPage(String())); +} + +void HeadroomSetupPortal::handleSave() { + HeadroomSettingsData next = settingsFromRequest(); + if (!settings_.save(next)) { + server_.send(500, "text/html; charset=utf-8", renderPage("Save failed. Check serial logs.")); + return; + } + server_.send(200, "text/html; charset=utf-8", renderPage("Saved. 
Restart the Atom to use the new settings.")); +} + +void HeadroomSetupPortal::handleNotFound() { + server_.sendHeader("Location", "/", true); + server_.send(302, "text/plain", ""); +} + +String HeadroomSetupPortal::renderPage(const String& message) { + const HeadroomSettingsData& data = settings_.data(); + String pose = HeadroomSettings::placementPoseName(data.placementPose); + String html; + html.reserve(6800); + html += F(""); + html += F("RMH Atom Setup
"); + html += F("

RMH Atom Setup

"); + if (message.length() > 0) { + html += F("

"); + html += htmlEscape(message); + html += F("

"); + } + html += F("
"); + html += F(""); + html += F(""); + html += F(""); + html += F(""); + html += F(""); + html += F(""); + html += F(""); + html += F(""); + html += F("
"); + html += F("
"); + html += F("
"); + html += F("
"); + return html; +} + +HeadroomSettingsData HeadroomSetupPortal::settingsFromRequest() { + HeadroomSettingsData next = settings_.editable(); + next.wifiSsid = server_.arg("ssid"); + next.wifiPassword = server_.arg("wifi_pw"); + next.faceHttpBase = server_.arg("http_base"); + next.faceWsUrl = server_.arg("ws_url"); + next.authToken = server_.arg("auth"); + next.deviceId = server_.arg("device_id"); + next.displayAgentId = server_.arg("display_id"); + next.inputTargetAgentId = server_.arg("input_id"); + next.maxBase64TtsSeconds = requestInt(server_, "max_b64_sec", next.maxBase64TtsSeconds); + next.maxHttpTtsBytes = requestInt(server_, "max_http_b", next.maxHttpTtsBytes); + next.faceRotationDegrees = HeadroomSettings::normalizeRotation(requestInt(server_, "rotation", next.faceRotationDegrees)); + next.placementPose = HeadroomSettings::parsePlacementPose(server_.arg("pose")); + next.upSideDegrees = HeadroomSettings::normalizeRotation(requestInt(server_, "up_side", next.upSideDegrees)); + return next; +} diff --git a/firmware/atoms3r-headroom/src/headroom_setup_portal.h b/firmware/atoms3r-headroom/src/headroom_setup_portal.h new file mode 100644 index 0000000..3bf4a01 --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_setup_portal.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include + +#include "headroom_settings.h" + +class HeadroomSetupPortal { +public: + explicit HeadroomSetupPortal(HeadroomSettings& settings); + + bool begin(); + void handleClient(); + bool active() const; + const String& ssid() const; + IPAddress ip() const; + +private: + HeadroomSettings& settings_; + WebServer server_; + DNSServer dns_; + String ssid_; + bool active_ = false; + + void handleRoot(); + void handleSave(); + void handleNotFound(); + String renderPage(const String& message); + HeadroomSettingsData settingsFromRequest(); +}; diff --git a/firmware/atoms3r-headroom/src/headroom_transport.cpp b/firmware/atoms3r-headroom/src/headroom_transport.cpp new file mode 100644 
index 0000000..9c363f3 --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_transport.cpp @@ -0,0 +1,280 @@ +#include "headroom_transport.h" + +#include + +namespace { + +struct ParsedWsUrl { + String scheme; + String host; + uint16_t port = 80; + String path = "/ws"; +}; + +String urlEncode(const String& value) { + String encoded; + encoded.reserve(value.length()); + const char* hex = "0123456789ABCDEF"; + for (size_t i = 0; i < value.length(); ++i) { + uint8_t c = static_cast(value.charAt(i)); + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || + c == '.' || c == '~') { + encoded += static_cast(c); + } else { + encoded += '%'; + encoded += hex[(c >> 4) & 0x0f]; + encoded += hex[c & 0x0f]; + } + } + return encoded; +} + +ParsedWsUrl parseWsUrl(const String& rawUrl) { + ParsedWsUrl parsed; + String url = rawUrl; + int schemeEnd = url.indexOf("://"); + if (schemeEnd >= 0) { + parsed.scheme = url.substring(0, schemeEnd); + url = url.substring(schemeEnd + 3); + } else { + parsed.scheme = "ws"; + } + + int pathStart = url.indexOf('/'); + String authority = pathStart >= 0 ? url.substring(0, pathStart) : url; + parsed.path = pathStart >= 0 ? url.substring(pathStart) : "/ws"; + + int portStart = authority.lastIndexOf(':'); + if (portStart > 0) { + parsed.host = authority.substring(0, portStart); + int port = authority.substring(portStart + 1).toInt(); + parsed.port = port > 0 ? static_cast(port) : 80; + } else { + parsed.host = authority; + parsed.port = parsed.scheme == "wss" ? 443 : 80; + } + + if (parsed.path.length() == 0) { + parsed.path = "/ws"; + } + return parsed; +} + +String appendQueryToken(String path, const String& token) { + if (token.length() == 0) { + return path; + } + path += path.indexOf('?') >= 0 ? 
'&' : '?'; + path += "auth_token="; + path += urlEncode(token); + return path; +} + +String stringField(JsonDocument& doc, const char* key) { + const char* value = doc[key] | ""; + return String(value); +} + +} // namespace + +void HeadroomTransport::begin(const HeadroomSettingsData& settings, HeadroomFaceState& faceState, HeadroomAudio& audio) { + faceState_ = &faceState; + audio_ = &audio; + deviceId_ = settings.deviceId; + displayAgentId_ = settings.displayAgentId; + + ParsedWsUrl url = parseWsUrl(settings.faceWsUrl); + String path = appendQueryToken(url.path, settings.authToken); + Serial.printf("ws connecting host=%s port=%u path=%s\n", url.host.c_str(), url.port, path.c_str()); + + ws_.onEvent([this](WStype_t type, uint8_t* payload, size_t length) { onWsEvent(type, payload, length); }); + ws_.setReconnectInterval(3000); + ws_.enableHeartbeat(15000, 3000, 2); + + if (url.scheme == "wss") { + Serial.println("wss is not implemented yet; use ws:// on same LAN for now"); + setExpression(HeadroomExpression::Failed); + return; + } + + ws_.begin(url.host.c_str(), url.port, path.c_str()); +} + +void HeadroomTransport::loop() { + ws_.loop(); + updateExpressionTimeout(millis()); +} + +bool HeadroomTransport::connected() const { + return connected_; +} + +void HeadroomTransport::onWsEvent(WStype_t type, uint8_t* payload, size_t length) { + switch (type) { + case WStype_CONNECTED: + connected_ = true; + if (faceState_) { + faceState_->connected = true; + } + Serial.println("ws connected"); + setExpression(HeadroomExpression::Neutral); + break; + case WStype_DISCONNECTED: + connected_ = false; + if (faceState_) { + faceState_->connected = false; + } + Serial.println("ws disconnected"); + break; + case WStype_TEXT: + handleJsonPayload(payload, length); + break; + case WStype_ERROR: + Serial.println("ws error"); + setExpression(HeadroomExpression::Failed); + break; + default: + break; + } +} + +bool HeadroomTransport::handleJsonPayload(const uint8_t* payload, size_t length) 
{ + JsonDocument doc; + DeserializationError error = deserializeJson(doc, payload, length); + if (error) { + Serial.printf("json parse failed: %s\n", error.c_str()); + return false; + } + + String type = stringField(doc, "type"); + String agentId = stringField(doc, "agent_id"); + if (!shouldApplyPayload(agentId, type, millis())) { + return true; + } + + if (type == "event") { + handleEventPayload(stringField(doc, "name")); + } else if (type == "tts_state") { + handleTtsStatePayload(stringField(doc, "phase")); + } else if (type == "tts_mouth") { + if (!faceState_) { + return true; + } + float open = doc["open"] | 0.0f; + faceState_->mouthOpen = constrain(open, 0.0f, 1.0f); + if (faceState_->mouthOpen > 0.04f) { + setExpression(HeadroomExpression::Speaking); + } + } else if (type == "tts_audio" || type == "tts_audio_ref") { + handleAudioPayload(doc, type); + } + return true; +} + +void HeadroomTransport::handleAudioPayload(JsonDocument& doc, const String& type) { + if (!audio_) { + return; + } + + HeadroomAudioResult result = HeadroomAudioResult::Ignored; + if (type == "tts_audio") { + const char* audioBase64 = doc["audio_base64"] | ""; + size_t length = strlen(audioBase64); + int sampleRate = doc["sample_rate"] | 24000; + result = audio_->playBase64Wav(audioBase64, length, sampleRate); + } else { + String url = stringField(doc, "url"); + result = audio_->playHttpWavRef(url); + } + + if (result == HeadroomAudioResult::Ok) { + setExpression(HeadroomExpression::Speaking); + return; + } + if (result != HeadroomAudioResult::Ignored) { + Serial.printf("audio playback failed result=%d type=%s\n", static_cast(result), type.c_str()); + setExpression(HeadroomExpression::Failed); + } +} + +bool HeadroomTransport::shouldApplyPayload(const String& agentId, const String& type, uint32_t nowMs) { + if (displayAgentId_.length() == 0 || agentId.length() == 0) { + return true; + } + + if (agentId == displayAgentId_) { + if (type == "tts_mouth" || type == "tts_state" || type == 
"event") { + priorityDisplayUntilMs_ = nowMs + 2500; + } + return true; + } + + if (nowMs < priorityDisplayUntilMs_) { + return false; + } + return true; +} + +void HeadroomTransport::handleEventPayload(const String& name) { + if (name == "cmd_started" || name == "retrying") { + setExpression(HeadroomExpression::Thinking); + } else if (name == "permission_required") { + setExpression(HeadroomExpression::Permission); + } else if (name == "cmd_failed" || name == "tests_failed") { + setExpression(HeadroomExpression::Failed); + } else if (name == "cmd_succeeded" || name == "tests_passed") { + setExpression(HeadroomExpression::Success); + } else if (name == "idle" || name == "idle_after_response") { + if (faceState_) { + faceState_->mouthOpen = 0.0f; + } + setExpression(HeadroomExpression::Neutral); + } +} + +void HeadroomTransport::handleTtsStatePayload(const String& phase) { + if (phase == "queued" || phase == "synth_start") { + setExpression(HeadroomExpression::Thinking); + } else if (phase == "play_start") { + setExpression(HeadroomExpression::Speaking); + } else if (phase == "play_stop") { + if (faceState_) { + faceState_->mouthOpen = 0.0f; + } + setExpression(HeadroomExpression::Neutral); + } else if (phase == "dropped" || phase == "worker_error") { + setExpression(HeadroomExpression::Failed); + } +} + +void HeadroomTransport::updateExpressionTimeout(uint32_t nowMs) { + if (!faceState_ || lastExpressionMs_ == 0) { + return; + } + + if (faceState_->expression == HeadroomExpression::Speaking) { + if ((!audio_ || !audio_->busy()) && faceState_->mouthOpen <= 0.04f && nowMs - lastExpressionMs_ > 2500) { + setExpression(HeadroomExpression::Neutral); + } + return; + } + + // Failed (like Permission) is intentionally NOT auto-reverted: the red + // background must persist until the next state event (idle / cmd_* / + // tts_state) so a transient error is not visually lost after 8 s. 
+ if (faceState_->expression == HeadroomExpression::Thinking || + faceState_->expression == HeadroomExpression::Success) { + if (nowMs - lastExpressionMs_ > 8000) { + faceState_->mouthOpen = 0.0f; + setExpression(HeadroomExpression::Neutral); + } + } +} + +void HeadroomTransport::setExpression(HeadroomExpression expression) { + if (!faceState_) { + return; + } + faceState_->expression = expression; + lastExpressionMs_ = millis(); +} diff --git a/firmware/atoms3r-headroom/src/headroom_transport.h b/firmware/atoms3r-headroom/src/headroom_transport.h new file mode 100644 index 0000000..273c347 --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_transport.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +#include "face_renderer.h" +#include "headroom_audio.h" +#include "headroom_settings.h" + +class HeadroomTransport { +public: + void begin(const HeadroomSettingsData& settings, HeadroomFaceState& faceState, HeadroomAudio& audio); + void loop(); + bool connected() const; + bool handleJsonPayload(const uint8_t* payload, size_t length); + +private: + WebSocketsClient ws_; + HeadroomFaceState* faceState_ = nullptr; + HeadroomAudio* audio_ = nullptr; + bool connected_ = false; + String deviceId_; + String displayAgentId_; + uint32_t priorityDisplayUntilMs_ = 0; + uint32_t lastExpressionMs_ = 0; + + void onWsEvent(WStype_t type, uint8_t* payload, size_t length); + void handleAudioPayload(JsonDocument& doc, const String& type); + bool shouldApplyPayload(const String& agentId, const String& type, uint32_t nowMs); + void handleEventPayload(const String& name); + void handleTtsStatePayload(const String& phase); + void updateExpressionTimeout(uint32_t nowMs); + void setExpression(HeadroomExpression expression); +}; diff --git a/firmware/atoms3r-headroom/src/main.cpp b/firmware/atoms3r-headroom/src/main.cpp new file mode 100644 index 0000000..d9f8b39 --- /dev/null +++ b/firmware/atoms3r-headroom/src/main.cpp @@ -0,0 +1,206 @@ +#include +#include + 
+#include "face_renderer.h" +#include "headroom_audio.h" +#include "headroom_ingress_server.h" +#include "headroom_settings.h" +#include "headroom_setup_portal.h" +#include "headroom_transport.h" + +namespace { + +HeadroomSettings settings; +HeadroomSetupPortal setupPortal(settings); +HeadroomAudio audio; +HeadroomTransport transport; +HeadroomIngressServer ingressServer; +HeadroomFaceRenderer renderer; +HeadroomFaceState faceState; + +constexpr HeadroomExpression kExpressions[] = { + HeadroomExpression::Neutral, + HeadroomExpression::Thinking, + HeadroomExpression::Speaking, + HeadroomExpression::Listening, + HeadroomExpression::Permission, + HeadroomExpression::Success, + HeadroomExpression::Failed, +}; + +size_t expressionIndex = 0; +uint32_t lastFrameMs = 0; +uint32_t startedMs = 0; +bool setupMode = false; +bool wifiConnected = false; +bool forcedSetupMode = false; + +void applyCurrentExpression() { + faceState.expression = kExpressions[expressionIndex]; + faceState.connected = wifiConnected; + if (faceState.expression != HeadroomExpression::Speaking) { + faceState.mouthOpen = 0.0f; + } +} + +void updateDemoMotion(uint32_t nowMs) { + if (wifiConnected && !setupMode) { + return; + } + + float phase = static_cast<float>((nowMs - startedMs) % 1400) / 1400.0f; + float wave = (sinf(phase * 2.0f * PI) + 1.0f) * 0.5f; + + if (faceState.expression == HeadroomExpression::Speaking) { + faceState.mouthOpen = 0.12f + wave * 0.82f; + } else if (faceState.expression == HeadroomExpression::Thinking) { + faceState.gazeX = sinf(phase * 2.0f * PI) * 0.75f; + faceState.gazeY = cosf(phase * 2.0f * PI) * 0.35f; + } else { + faceState.gazeX = 0.0f; + faceState.gazeY = 0.0f; + } +} + +bool connectWifi(const HeadroomSettingsData& data, uint32_t timeoutMs) { + if (!settings.hasUsableWifi()) { + Serial.println("wifi missing; starting setup portal"); + return false; + } + + WiFi.mode(WIFI_STA); + WiFi.begin(data.wifiSsid.c_str(), data.wifiPassword.c_str()); + Serial.printf("wifi connecting 
ssid=%s\n", data.wifiSsid.c_str()); + + uint32_t started = millis(); + while (WiFi.status() != WL_CONNECTED && millis() - started < timeoutMs) { + M5.update(); + delay(100); + } + + if (WiFi.status() != WL_CONNECTED) { + Serial.println("wifi connect failed; starting setup portal"); + WiFi.disconnect(true); + return false; + } + + Serial.printf("wifi connected ip=%s\n", WiFi.localIP().toString().c_str()); + return true; +} + +bool shouldForceSetupPortal(uint32_t holdMs) { + uint32_t started = millis(); + bool sawPressed = false; + while (millis() - started < holdMs) { + M5.update(); + if (M5.BtnA.isPressed()) { + sawPressed = true; + faceState.expression = HeadroomExpression::Permission; + faceState.mouthOpen = 0.25f; + } else if (sawPressed) { + return false; + } + renderer.draw(faceState); + delay(20); + } + return sawPressed; +} + +void startSetupPortal() { + setupMode = setupPortal.begin(); + wifiConnected = false; + expressionIndex = 4; // kExpressions[4] is HeadroomExpression::Permission + applyCurrentExpression(); + if (setupMode) { + Serial.printf("setup portal ssid=%s ip=%s\n", setupPortal.ssid().c_str(), setupPortal.ip().toString().c_str()); + } else { + Serial.println("setup portal failed to start"); + } +} + +void drawSetupOverlay() { + if (!setupMode) { + return; + } + M5.Display.setTextDatum(top_left); + M5.Display.setTextColor(TFT_WHITE, TFT_BLACK); + M5.Display.setTextSize(1); + M5.Display.fillRect(7, 104, 114, 22, TFT_BLACK); + M5.Display.setCursor(9, 106); + M5.Display.print(setupPortal.ssid()); + M5.Display.setCursor(9, 116); + M5.Display.print(setupPortal.ip()); +} + +} // namespace + +void setup() { + auto cfg = M5.config(); + cfg.serial_baudrate = 115200; + cfg.output_power = true; + cfg.internal_imu = false; + cfg.internal_mic = false; + // AtomS3R has no internal speaker. Audio output requires the external + // Atomic Echo Base (ES8311). Without this, M5.Speaker.playWav() returns + // true but produces no sound and the mouth sticks half-open. 
+ cfg.internal_spk = false; + cfg.external_speaker.atomic_echo = true; + M5.begin(cfg); + + Serial.println("Real Minimum Headroom AtomS3R starting"); + Serial.println("display ready"); + Serial.println("demo face mode"); + + settings.begin(); + const HeadroomSettingsData& data = settings.data(); + Serial.printf("device_id=%s saved_settings=%s\n", data.deviceId.c_str(), settings.hasSavedSettings() ? "yes" : "no"); + + startedMs = millis(); + renderer.begin(128, 128, data.faceRotationDegrees); + audio.begin(data); + forcedSetupMode = shouldForceSetupPortal(2000); + if (forcedSetupMode) { + Serial.println("button held at boot; forcing setup portal"); + } + + wifiConnected = forcedSetupMode ? false : connectWifi(data, 8000); + if (!wifiConnected) { + startSetupPortal(); + } else { + faceState.expression = HeadroomExpression::Thinking; + faceState.connected = true; + transport.begin(data, faceState, audio); + ingressServer.begin(data, transport, audio, faceState); + } + + if (!wifiConnected) { + applyCurrentExpression(); + } + renderer.draw(faceState); + drawSetupOverlay(); +} + +void loop() { + M5.update(); + setupPortal.handleClient(); + audio.loop(); + if (!setupMode && wifiConnected) { + ingressServer.loop(); + transport.loop(); + faceState.connected = transport.connected() || ingressServer.recentlyActive(10000); + } + uint32_t nowMs = millis(); + + if ((!wifiConnected || setupMode) && M5.BtnA.wasPressed()) { + expressionIndex = (expressionIndex + 1) % (sizeof(kExpressions) / sizeof(kExpressions[0])); + applyCurrentExpression(); + Serial.printf("expression index=%u\n", static_cast<unsigned>(expressionIndex)); + } + + if (nowMs - lastFrameMs >= 33) { // redraw at most once every 33 ms (~30 fps) + updateDemoMotion(nowMs); + renderer.draw(faceState); + drawSetupOverlay(); + lastFrameMs = nowMs; + } +} diff --git a/integrations/stackchan-minimal/README.md b/integrations/stackchan-minimal/README.md new file mode 100644 index 0000000..c75bb8b --- /dev/null +++ b/integrations/stackchan-minimal/README.md @@ -0,0 +1,98 @@ +# 
StackChan Minimal sidecar + +This directory contains sidecar services for using StackChan Minimal with this repository's local speech stack. + +It does not start the minimum-headroom operator UI. It starts only the pieces StackChan Minimal expects: + +- a whisper.cpp-compatible STT adapter on port `8081`, backed by `asr-worker` Parakeet JA/EN +- a piper/VOICEVOX-shaped TTS adapter on port `5000`, backed by Kokoro ONNX +- an optional `llama-server` OpenAI-compatible LLM endpoint on port `8080` + +## Quick start + +The launcher auto-detects this local Qwen GGUF when it exists: + + /home/amari1/models/unsloth/Qwen3.6-35B-A3B/Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf + +You can also set a local GGUF model path explicitly, then run: + + export STACKCHAN_LLM_MODEL_PATH=/home/amari1/models/unsloth/Qwen3.6-35B-A3B/Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf + ./scripts/run-stackchan-sidecar.sh + +To use Nemotron instead, point `STACKCHAN_LLM_MODEL_PATH` at the Nemotron GGUF file. The launcher does not hard-code a model because local filenames and quantization choices vary. + +## StackChan Minimal settings + +Use the host IP printed by `run-stackchan-sidecar.sh`. + +- STT server IP: printed host IP +- STT server port: `8081` +- STT path: `/inference` +- TTS server IP: printed host IP +- TTS server port: `5000` +- LLM base URL: `http://<host-ip>:8080/v1` + +The LLM endpoint is OpenAI-compatible through llama.cpp. Use the model name expected by your StackChan Minimal build; llama.cpp accepts chat completions at `/v1/chat/completions`. + +The llama-server defaults use the safer 32GB VRAM starting point from english-trainer's `.env.example`: + + LLAMA_CTX_SIZE=8192 + LLAMA_PARALLEL=1 + LLAMA_GPU_LAYERS=-1 + LLAMA_FLASH_ATTN=on + LLAMA_JINJA=1 + LLAMA_REASONING=off + +On a 32GB VRAM GPU this is the intended starting point for the Qwen3.6 35B Q4_K_XL model plus Parakeet JA on CUDA. 
The english-trainer README also records `12288` as a single-user Nemotron Cascade operating point; try `LLAMA_CTX_SIZE=12288` only after confirming the Qwen sidecar is stable at `8192`. + +The local llama.cpp build reports `--jinja` as enabled by default and `--flash-attn` as `auto` by default. The sidecar still passes them explicitly so the Qwen chat template path and attention mode are visible in the launch command. Set `LLAMA_FLASH_ATTN=auto` if a future llama.cpp build or model combination has trouble with forced Flash Attention. + +`LLAMA_REASONING=off` is intentional for StackChan Minimal. Qwen thinking chunks can be returned as `reasoning_content`; StackChan Minimal expects normal `content` chunks for display and TTS, so reasoning output can leave the spoken response empty. + +## Environment + +Copy or source `stackchan.env.example` for the most common knobs. + +The defaults bind the adapter and llama-server ports to `0.0.0.0` so an M5Stack device on the same trusted LAN can reach them. Do not expose these ports to an untrusted network. + +If you already have services running: + + STACKCHAN_START_ASR_WORKER=0 ./scripts/run-stackchan-sidecar.sh + STACKCHAN_START_LLM=0 ./scripts/run-stackchan-sidecar.sh + +## Parakeet GPU mode + +The sidecar starts minimum-headroom `asr-worker` with CUDA by default: + + ASR_DEVICE=cuda + STACKCHAN_ASR_DEVICE=cuda + ASR_SINGLE_MODEL_CACHE=true + ASR_PRELOAD_MODELS=false + ASR_MODEL_JA=nvidia/parakeet-tdt_ctc-0.6b-ja + +This mirrors the safer english-trainer GPU posture: keep only one ASR model resident at a time and avoid preloading EN/JA together. That keeps VRAM pressure lower while still running the Japanese Parakeet model on GPU. + +If your login shell already exports `ASR_DEVICE=cpu`, set `STACKCHAN_ASR_DEVICE=cuda` or unset `ASR_DEVICE` before launching. The launcher prints the effective ASR device at startup. 
+ +## Adapter endpoints + +ASR adapter: + +- `GET /health` +- `POST /inference` +- `POST /v1/audio/transcriptions` + +The request may be whisper.cpp-style multipart form data with a file field, raw audio bytes, or JSON containing `audioBase64` and `mimeType`. + +TTS adapter: + +- `GET /health` +- `GET /voices` +- `GET /tts_live.wav?text=...` +- `POST /synthesize` +- `POST /tts` +- `POST /api/tts` +- `POST /audio_query` +- `POST /synthesis` + +The piper-like endpoints return `audio/wav`. The VOICEVOX-like endpoints are minimal compatibility endpoints for clients that call `audio_query` and then `synthesis`. diff --git a/integrations/stackchan-minimal/stackchan.env.example b/integrations/stackchan-minimal/stackchan.env.example new file mode 100644 index 0000000..3cfc1a4 --- /dev/null +++ b/integrations/stackchan-minimal/stackchan.env.example @@ -0,0 +1,43 @@ +# Source this file with auto-export (set -a; . integrations/stackchan-minimal/stackchan.env.example; set +a) before running ./scripts/run-stackchan-sidecar.sh; a plain `source` leaves these assignments unexported. + +# Required when STACKCHAN_START_LLM=1. +STACKCHAN_LLM_MODEL_PATH=/home/amari1/models/unsloth/Qwen3.6-35B-A3B/Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf + +# Alternative model example: +# STACKCHAN_LLM_MODEL_PATH=/path/to/nemotron.gguf + +# Bind adapter ports to the LAN so AtomS3R can connect. +STACKCHAN_ASR_ADAPTER_HOST=0.0.0.0 +STACKCHAN_ASR_ADAPTER_PORT=8081 +STACKCHAN_TTS_ADAPTER_HOST=0.0.0.0 +STACKCHAN_TTS_ADAPTER_PORT=5000 + +# Start minimum-headroom Parakeet asr-worker for the ASR adapter. +STACKCHAN_START_ASR_WORKER=1 +STACKCHAN_ASR_WORKER_HOST=127.0.0.1 +STACKCHAN_ASR_WORKER_PORT=8091 +ASR_DEVICE=cuda +# Prefer this StackChan-specific knob if your shell already exports ASR_DEVICE. +STACKCHAN_ASR_DEVICE=cuda +ASR_SINGLE_MODEL_CACHE=true +ASR_PRELOAD_MODELS=false +ASR_MODEL_JA=nvidia/parakeet-tdt_ctc-0.6b-ja +ASR_MODEL_EN=nvidia/parakeet-tdt-0.6b-v2 +ASR_MODEL_FAST=nvidia/parakeet-tdt-0.6b-v2 + +# Start llama.cpp server for StackChan chat. 
+STACKCHAN_START_LLM=1 +LLAMA_SERVER_BIN= +LLAMA_HOST=0.0.0.0 +LLAMA_PORT=8080 +LLAMA_CTX_SIZE=8192 +LLAMA_PARALLEL=1 +LLAMA_GPU_LAYERS=-1 +LLAMA_FLASH_ATTN=on +LLAMA_JINJA=1 +LLAMA_REASONING=off +LLAMA_THREADS= +LLAMA_EXTRA_ARGS= + +# Kokoro voice name. Numeric StackChan speaker ids use this default. +STACKCHAN_KOKORO_VOICE=af_heart diff --git a/integrations/stackchan-minimal/stackchan_asr_adapter.py b/integrations/stackchan-minimal/stackchan_asr_adapter.py new file mode 100755 index 0000000..903d1d1 --- /dev/null +++ b/integrations/stackchan-minimal/stackchan_asr_adapter.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import base64 +import json +import mimetypes +import sys +import urllib.error +import urllib.parse +import urllib.request +from dataclasses import dataclass +from email.parser import BytesParser +from email.policy import default +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from typing import Any + + +@dataclass(frozen=True) +class AudioRequest: + audio: bytes + mime_type: str + language: str + filename: str | None = None + + +def normalize_language(value: str | None, fallback: str = 'ja') -> str: + normalized = (value or '').strip().lower().replace('_', '-') + if normalized.startswith('en'): + return 'en' + if normalized.startswith('ja'): + return 'ja' + return fallback + + +def parse_content_type(header: str | None) -> tuple[str, dict[str, str]]: + if not header: + return 'application/octet-stream', {} + parts = [part.strip() for part in header.split(';')] + media_type = parts[0].lower() if parts and parts[0] else 'application/octet-stream' + params: dict[str, str] = {} + for part in parts[1:]: + if '=' not in part: + continue + key, value = part.split('=', 1) + params[key.strip().lower()] = value.strip().strip('"') + return media_type, params + + +def guess_mime_type(filename: str | None, fallback: str) -> str: + if fallback and fallback != 'application/octet-stream': + 
return fallback + if filename: + guessed, _ = mimetypes.guess_type(filename) + if guessed: + return guessed + return 'application/octet-stream' + + +def parse_audio_request( + body: bytes, + content_type: str | None, + query: dict[str, list[str]], + default_language: str, +) -> AudioRequest: + media_type, _ = parse_content_type(content_type) + language = normalize_language(first_value(query, 'language') or first_value(query, 'lang'), default_language) + + if media_type.startswith('multipart/form-data'): + return parse_multipart_request(body, content_type or media_type, language) + + if media_type == 'application/json': + payload = json.loads(body.decode('utf-8')) + if not isinstance(payload, dict): + raise ValueError('JSON body must be an object') + raw_audio = payload.get('audioBase64') or payload.get('audio_base64') or payload.get('audio') + if not isinstance(raw_audio, str) or not raw_audio.strip(): + raise ValueError('JSON body must include audioBase64') + language = normalize_language(str(payload.get('language') or payload.get('lang') or ''), language) + mime_type = str(payload.get('mimeType') or payload.get('mime_type') or 'application/octet-stream') + return AudioRequest(audio=base64.b64decode(raw_audio), mime_type=mime_type, language=language) + + if media_type == 'application/x-www-form-urlencoded': + values = urllib.parse.parse_qs(body.decode('utf-8'), keep_blank_values=True) + raw_audio = first_value(values, 'audioBase64') or first_value(values, 'audio_base64') or first_value(values, 'audio') + if not raw_audio: + raise ValueError('form body must include audioBase64') + language = normalize_language(first_value(values, 'language') or first_value(values, 'lang'), language) + mime_type = first_value(values, 'mimeType') or first_value(values, 'mime_type') or 'application/octet-stream' + return AudioRequest(audio=base64.b64decode(raw_audio), mime_type=mime_type, language=language) + + if not body: + raise ValueError('request body is empty') + return 
AudioRequest(audio=body, mime_type=media_type, language=language) + + +def parse_multipart_request(body: bytes, content_type: str, language: str) -> AudioRequest: + header = f'Content-Type: {content_type}\r\nMIME-Version: 1.0\r\n\r\n'.encode('utf-8') + message = BytesParser(policy=default).parsebytes(header + body) + audio: bytes | None = None + mime_type = 'application/octet-stream' + filename: str | None = None + + for part in message.iter_parts(): + name = part.get_param('name', header='content-disposition') + if name in {'language', 'lang'}: + value = part.get_payload(decode=True).decode('utf-8', errors='ignore') + language = normalize_language(value, language) + continue + if name in {'file', 'audio', 'audio_file', 'upload'} or audio is None: + payload = part.get_payload(decode=True) + if payload: + audio = payload + filename = part.get_filename() + mime_type = guess_mime_type(filename, part.get_content_type()) + + if audio is None: + raise ValueError('multipart body does not include audio') + return AudioRequest(audio=audio, mime_type=mime_type, language=language, filename=filename) + + +def first_value(values: dict[str, list[str]], key: str) -> str | None: + items = values.get(key) + if not items: + return None + return items[0] + + +def forward_to_asr(audio_request: AudioRequest, asr_base_url: str, timeout: float) -> dict[str, Any]: + base = asr_base_url.rstrip('/') + endpoint = f'{base}/v1/asr/{audio_request.language}' + payload = { + 'audioBase64': base64.b64encode(audio_request.audio).decode('ascii'), + 'mimeType': audio_request.mime_type, + } + request = urllib.request.Request( + endpoint, + data=json.dumps(payload).encode('utf-8'), + headers={'content-type': 'application/json'}, + method='POST', + ) + with urllib.request.urlopen(request, timeout=timeout) as response: + raw = response.read() + parsed = json.loads(raw.decode('utf-8')) + if not isinstance(parsed, dict): + raise ValueError('ASR worker returned non-object JSON') + return parsed + + +def 
whisper_response(asr_response: dict[str, Any]) -> dict[str, Any]: + text = str(asr_response.get('text') or '').strip() + language = str(asr_response.get('language') or 'unknown') + return { + 'text': text, + 'language': language, + 'segments': [], + 'minimum_headroom': asr_response, + } + + +class StackChanAsrHandler(BaseHTTPRequestHandler): + server_version = 'StackChanAsrAdapter/0.1' + + def do_GET(self) -> None: + parsed = urllib.parse.urlparse(self.path) + if parsed.path in {'/health', '/'}: + self.write_json(200, {'ok': True, 'service': 'stackchan-asr-adapter'}) + return + self.write_json(404, {'ok': False, 'error': 'not_found'}) + + def do_POST(self) -> None: + parsed = urllib.parse.urlparse(self.path) + if parsed.path not in {'/inference', '/v1/audio/transcriptions', '/transcribe', '/asr'}: + self.write_json(404, {'ok': False, 'error': 'not_found'}) + return + + length = int(self.headers.get('content-length') or '0') + body = self.rfile.read(length) + query = urllib.parse.parse_qs(parsed.query, keep_blank_values=True) + try: + audio_request = parse_audio_request(body, self.headers.get('content-type'), query, self.server.default_language) + upstream = forward_to_asr(audio_request, self.server.asr_base_url, self.server.upstream_timeout) + self.write_json(200, whisper_response(upstream)) + except urllib.error.HTTPError as error: + detail = error.read().decode('utf-8', errors='replace') + self.write_json(502, {'ok': False, 'error': 'asr_upstream_error', 'status': error.code, 'detail': detail[:500]}) + except Exception as error: + self.write_json(400, {'ok': False, 'error': 'asr_adapter_error', 'detail': str(error)}) + + def log_message(self, fmt: str, *args: Any) -> None: + sys.stderr.write('[stackchan-asr] ' + fmt % args + '\n') + + def write_json(self, status: int, payload: dict[str, Any]) -> None: + body = json.dumps(payload, ensure_ascii=False).encode('utf-8') + self.send_response(status) + self.send_header('content-type', 'application/json; charset=utf-8') 
+ self.send_header('cache-control', 'no-store') + self.send_header('content-length', str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class StackChanAsrServer(ThreadingHTTPServer): + def __init__(self, address: tuple[str, int], asr_base_url: str, default_language: str, upstream_timeout: float) -> None: + super().__init__(address, StackChanAsrHandler) + self.asr_base_url = asr_base_url + self.default_language = default_language + self.upstream_timeout = upstream_timeout + + +def run_self_test() -> None: + boundary = 'test-boundary' + body = ( + f'--{boundary}\r\n' + 'Content-Disposition: form-data; name="language"\r\n\r\n' + 'ja\r\n' + f'--{boundary}\r\n' + 'Content-Disposition: form-data; name="file"; filename="sample.webm"\r\n' + 'Content-Type: audio/webm\r\n\r\n' + ).encode('utf-8') + b'0123456789abcdef' + f'\r\n--{boundary}--\r\n'.encode('utf-8') + parsed = parse_audio_request(body, f'multipart/form-data; boundary={boundary}', {}, 'ja') + assert parsed.language == 'ja' + assert parsed.mime_type == 'audio/webm' + assert parsed.audio == b'0123456789abcdef' + print('ok') + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description='StackChan Minimal whisper.cpp-compatible ASR adapter for minimum-headroom Parakeet ASR') + parser.add_argument('--host', default='0.0.0.0') + parser.add_argument('--port', type=int, default=8081) + parser.add_argument('--asr-base-url', default='http://127.0.0.1:8091') + parser.add_argument('--language', default='ja') + parser.add_argument('--upstream-timeout', type=float, default=30.0) + parser.add_argument('--self-test', action='store_true') + args = parser.parse_args(argv) + + if args.self_test: + run_self_test() + return 0 + + server = StackChanAsrServer( + (args.host, args.port), + asr_base_url=args.asr_base_url, + default_language=normalize_language(args.language), + upstream_timeout=args.upstream_timeout, + ) + print(f'[stackchan-asr] listening on 
http://{args.host}:{args.port}, upstream={args.asr_base_url}', flush=True) + try: + server.serve_forever() + except KeyboardInterrupt: + return 130 + finally: + server.server_close() + return 0 + + +if __name__ == '__main__': + raise SystemExit(main()) diff --git a/integrations/stackchan-minimal/stackchan_tts_adapter.py b/integrations/stackchan-minimal/stackchan_tts_adapter.py new file mode 100755 index 0000000..9929c23 --- /dev/null +++ b/integrations/stackchan-minimal/stackchan_tts_adapter.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import base64 +import json +import os +import sys +import urllib.parse +from dataclasses import dataclass +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any + + +@dataclass(frozen=True) +class TtsRequest: + text: str + voice: str | None = None + + +def parse_content_type(header: str | None) -> str: + if not header: + return 'application/octet-stream' + return header.split(';', 1)[0].strip().lower() + + +def first_value(values: dict[str, list[str]], key: str) -> str | None: + items = values.get(key) + if not items: + return None + return items[0] + + +def extract_text_from_payload(payload: Any) -> str | None: + if isinstance(payload, dict): + for key in ('text', 'input', 'sentence', 'query', 'message'): + value = payload.get(key) + if isinstance(value, str) and value.strip(): + return value + if isinstance(payload.get('audio_query'), dict): + return extract_text_from_payload(payload['audio_query']) + return None + + +def parse_tts_request(body: bytes, content_type: str | None, query: dict[str, list[str]]) -> TtsRequest: + text = first_value(query, 'text') or first_value(query, 'input') or first_value(query, 'sentence') + voice = first_value(query, 'voice') or first_value(query, 'speaker') or first_value(query, 'character') + media_type = parse_content_type(content_type) + + if text: + return 
TtsRequest(text=text.strip(), voice=voice) + + if media_type == 'application/json': + payload = json.loads(body.decode('utf-8') if body else '{}') + text = extract_text_from_payload(payload) + if isinstance(payload, dict): + voice = voice or string_or_none(payload.get('voice')) or string_or_none(payload.get('speaker')) + if text: + return TtsRequest(text=text.strip(), voice=voice) + + if media_type == 'application/x-www-form-urlencoded': + values = urllib.parse.parse_qs(body.decode('utf-8'), keep_blank_values=True) + text = first_value(values, 'text') or first_value(values, 'input') or first_value(values, 'sentence') + voice = voice or first_value(values, 'voice') or first_value(values, 'speaker') + if text: + return TtsRequest(text=text.strip(), voice=voice) + + raw = body.decode('utf-8', errors='ignore').strip() + if raw: + return TtsRequest(text=raw, voice=voice) + + raise ValueError('TTS request does not include text') + + +def string_or_none(value: Any) -> str | None: + if isinstance(value, str) and value.strip(): + return value.strip() + if isinstance(value, int): + return str(value) + return None + + +class KokoroSynthesizer: + def __init__(self, repo_root: Path, default_voice: str) -> None: + self.repo_root = repo_root + self.default_voice = default_voice + self._engine = None + + def _load(self) -> Any: + if self._engine is not None: + return self._engine + + tts_src = self.repo_root / 'tts-worker' / 'src' + if str(tts_src) not in sys.path: + sys.path.insert(0, str(tts_src)) + + os.environ.setdefault('MH_KOKORO_MODEL', str(self.repo_root / 'assets' / 'kokoro' / 'kokoro-v1.0.onnx')) + os.environ.setdefault('MH_KOKORO_VOICES', str(self.repo_root / 'assets' / 'kokoro' / 'voices-v1.0.bin')) + + from tts_worker.kokoro_engine import KokoroEngine, resolve_model_paths + + self._engine = KokoroEngine(model_paths=resolve_model_paths(), voice=self.default_voice) + return self._engine + + def synthesize_wav(self, request: TtsRequest) -> bytes: + engine = self._load() 
+ audio, sample_rate = engine.synthesize_text(request.text, voice_override=self.voice_override(request.voice)) + from tts_worker.playback import encode_wav_base64 + + return base64.b64decode(encode_wav_base64(audio, sample_rate)) + + def voice_override(self, requested: str | None) -> str | None: + if not requested: + return None + requested = requested.strip() + # StackChan may pass numeric VOICEVOX speaker ids or character ids such as ja-02. + # Keep the Kokoro default unless the caller explicitly passes a Kokoro voice name. + if requested.isdigit() or requested.startswith('ja-') or requested.startswith('en-'): + return None + return requested + + +class StackChanTtsHandler(BaseHTTPRequestHandler): + server_version = 'StackChanTtsAdapter/0.1' + + def do_GET(self) -> None: + parsed = urllib.parse.urlparse(self.path) + query = urllib.parse.parse_qs(parsed.query, keep_blank_values=True) + if parsed.path in {'/health', '/version'}: + self.write_json(200, {'ok': True, 'service': 'stackchan-tts-adapter', 'engine': 'kokoro-onnx'}) + return + if parsed.path in {'/speakers', '/voices'}: + self.write_json(200, [{'name': self.server.default_voice, 'speaker_uuid': 'kokoro', 'styles': [{'name': 'default', 'id': 0}]}]) + return + if parsed.path in {'/tts_live.wav', '/synthesize', '/tts', '/api/tts', '/'} and first_value(query, 'text'): + self.handle_synthesis(b'', 'text/plain', query) + return + self.write_json(404, {'ok': False, 'error': 'not_found'}) + + def do_POST(self) -> None: + parsed = urllib.parse.urlparse(self.path) + query = urllib.parse.parse_qs(parsed.query, keep_blank_values=True) + length = int(self.headers.get('content-length') or '0') + body = self.rfile.read(length) + + if parsed.path == '/audio_query': + text = first_value(query, 'text') + if not text: + try: + text = parse_tts_request(body, self.headers.get('content-type'), query).text + except Exception: + text = '' + self.write_json(200, self.voicevox_audio_query(text or '')) + return + + if parsed.path in 
{'/synthesis', '/synthesize', '/tts', '/api/tts', '/'}: + self.handle_synthesis(body, self.headers.get('content-type'), query) + return + + self.write_json(404, {'ok': False, 'error': 'not_found'}) + + def handle_synthesis(self, body: bytes, content_type: str | None, query: dict[str, list[str]]) -> None: + try: + request = parse_tts_request(body, content_type, query) + wav = self.server.synthesizer.synthesize_wav(request) + self.send_response(200) + self.send_header('content-type', 'audio/wav') + self.send_header('cache-control', 'no-store') + self.send_header('content-length', str(len(wav))) + self.end_headers() + self.wfile.write(wav) + except Exception as error: + self.write_json(400, {'ok': False, 'error': 'tts_adapter_error', 'detail': str(error)}) + + def voicevox_audio_query(self, text: str) -> dict[str, Any]: + return { + 'accent_phrases': [], + 'speedScale': 1.0, + 'pitchScale': 0.0, + 'intonationScale': 1.0, + 'volumeScale': 1.0, + 'prePhonemeLength': 0.1, + 'postPhonemeLength': 0.1, + 'outputSamplingRate': 24000, + 'outputStereo': False, + 'kana': '', + 'text': text, + } + + def log_message(self, fmt: str, *args: Any) -> None: + sys.stderr.write('[stackchan-tts] ' + fmt % args + '\n') + + def write_json(self, status: int, payload: Any) -> None: + body = json.dumps(payload, ensure_ascii=False).encode('utf-8') + self.send_response(status) + self.send_header('content-type', 'application/json; charset=utf-8') + self.send_header('cache-control', 'no-store') + self.send_header('content-length', str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class StackChanTtsServer(ThreadingHTTPServer): + def __init__(self, address: tuple[str, int], repo_root: Path, default_voice: str) -> None: + super().__init__(address, StackChanTtsHandler) + self.default_voice = default_voice + self.synthesizer = KokoroSynthesizer(repo_root=repo_root, default_voice=default_voice) + + +def run_self_test() -> None: + assert parse_tts_request(b'{"text":"hello"}', 
'application/json', {}).text == 'hello' + assert parse_tts_request(b'text=hello', 'application/x-www-form-urlencoded', {}).text == 'hello' + assert parse_tts_request(b'hello', 'text/plain', {}).text == 'hello' + print('ok') + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description='StackChan Minimal piper/VOICEVOX-compatible TTS adapter for minimum-headroom Kokoro') + parser.add_argument('--host', default='0.0.0.0') + parser.add_argument('--port', type=int, default=5000) + parser.add_argument('--voice', default='af_heart') + parser.add_argument('--repo-root', default=str(Path(__file__).resolve().parents[2])) + parser.add_argument('--self-test', action='store_true') + args = parser.parse_args(argv) + + if args.self_test: + run_self_test() + return 0 + + server = StackChanTtsServer((args.host, args.port), repo_root=Path(args.repo_root).resolve(), default_voice=args.voice) + print(f'[stackchan-tts] listening on http://{args.host}:{args.port}, voice={args.voice}', flush=True) + try: + server.serve_forever() + except KeyboardInterrupt: + return 130 + finally: + server.server_close() + return 0 + + +if __name__ == '__main__': + raise SystemExit(main()) diff --git a/scripts/atoms3r-http-bridge.mjs b/scripts/atoms3r-http-bridge.mjs new file mode 100755 index 0000000..58dfa74 --- /dev/null +++ b/scripts/atoms3r-http-bridge.mjs @@ -0,0 +1,415 @@ +#!/usr/bin/env node + +import { readFileSync } from 'node:fs'; +import { join } from 'node:path'; + +const DEFAULT_FACE_WS_URL = 'ws://127.0.0.1:8765/ws'; +const DEFAULT_ATOM_URL = 'http://192.168.1.33'; +const DEFAULT_MAX_PAYLOAD_BYTES = 1_050_000; +const DEFAULT_MOUTH_INTERVAL_MS = 80; + +const faceWsUrlInput = process.env.FACE_WS_URL ?? process.env.MH_FACE_WS_URL ?? DEFAULT_FACE_WS_URL; +const atomBaseUrl = normalizeBaseUrl(process.env.ATOM_HEADROOM_URL ?? DEFAULT_ATOM_URL); +const localConfigToken = readLocalHeadroomToken(); +const faceAuthToken = process.env.MH_FACE_AUTH_TOKEN ?? 
tokenFromUrl(faceWsUrlInput) ?? localConfigToken ?? ''; +const atomAuthToken = process.env.ATOM_HEADROOM_AUTH_TOKEN ?? faceAuthToken; +const maxPayloadBytes = positiveInt(process.env.ATOM_HEADROOM_MAX_PAYLOAD_BYTES, DEFAULT_MAX_PAYLOAD_BYTES); +const mouthIntervalMs = positiveInt(process.env.ATOM_HEADROOM_MOUTH_INTERVAL_MS, DEFAULT_MOUTH_INTERVAL_MS); +const forwardAudio = process.env.ATOM_HEADROOM_FORWARD_AUDIO !== '0'; +const fetchAudioRef = process.env.ATOM_HEADROOM_FETCH_AUDIO_REF === '1'; + +const faceWsUrl = withAuthQuery(faceWsUrlInput, faceAuthToken); +const faceHttpBase = httpBaseFromWsUrl(faceWsUrl); +const atomPayloadUrl = new URL('/api/headroom/payload', atomBaseUrl).toString(); +const atomAudioUrl = new URL('/api/headroom/audio', atomBaseUrl).toString(); +const atomHealthUrl = new URL('/health', atomBaseUrl).toString(); +const relayTypes = new Set(['event', 'tts_state', 'tts_mouth', 'tts_audio']); + +let ws = null; +let reconnectTimer = null; +let postChain = Promise.resolve(); +let lastMouthForwardedAt = 0; +let lastMouthOpen = null; + +if (typeof WebSocket !== 'function') { + console.error('[atoms3r-bridge] Node.js global WebSocket is unavailable. 
Use Node 22+.'); + process.exit(1); +} + +console.log( + `[atoms3r-bridge] face_ws=${redactUrl(faceWsUrl)} atom=${redactUrl(atomBaseUrl)} max_payload=${maxPayloadBytes} forward_audio=${forwardAudio} fetch_audio_ref=${fetchAudioRef}` +); + +await checkAtomHealth(); +connect(); + +process.on('SIGINT', () => { + if (reconnectTimer) { + clearTimeout(reconnectTimer); + } + if (ws) { + ws.close(); + } + process.exit(0); +}); + +function connect() { + ws = new WebSocket(faceWsUrl); + + ws.addEventListener('open', () => { + console.log('[atoms3r-bridge] connected to face-app websocket'); + }); + + ws.addEventListener('message', (event) => { + handleWsMessage(event.data).catch((error) => { + console.error(`[atoms3r-bridge] message handling failed: ${error.message}`); + }); + }); + + ws.addEventListener('error', (event) => { + const message = event?.message ?? 'websocket error'; + console.error(`[atoms3r-bridge] ${message}`); + }); + + ws.addEventListener('close', () => { + console.error('[atoms3r-bridge] face-app websocket closed; reconnecting'); + reconnectTimer = setTimeout(connect, 1000); + }); +} + +async function handleWsMessage(data) { + const text = await dataToString(data); + let payload = null; + try { + payload = JSON.parse(text); + } catch { + return; + } + if (!payload || typeof payload.type !== 'string') { + return; + } + + if (payload.type === 'tts_audio_ref') { + console.log( + `[atoms3r-bridge] received tts_audio_ref bytes=${Number.isInteger(payload.byte_length) ? payload.byte_length : 'unknown'}` + ); + await forwardAudioRef(payload); + return; + } + + if (!relayTypes.has(payload.type)) { + return; + } + + if (payload.type === 'tts_mouth' && !shouldForwardMouth(payload)) { + return; + } + + if (payload.type === 'tts_state') { + console.log(`[atoms3r-bridge] received tts_state phase=${payload.phase ?? 
'unknown'}`); + } else if (payload.type === 'tts_audio') { + await forwardDirectAudio(payload); + return; + } + + enqueuePost(payload, payload.type); +} + +async function forwardDirectAudio(payload) { + if (!forwardAudio) { + return; + } + const audioBase64 = typeof payload.audio_base64 === 'string' ? payload.audio_base64 : ''; + console.log(`[atoms3r-bridge] received tts_audio base64=${audioBase64.length}`); + if (audioBase64.trim() === '') { + return; + } + + const audio = Buffer.from(audioBase64, 'base64'); + if (audio.length === 0) { + console.error('[atoms3r-bridge] skipping tts_audio; decoded audio is empty'); + return; + } + if (audio.length + 4096 > maxPayloadBytes) { + console.error(`[atoms3r-bridge] skipping tts_audio; decoded payload is too large (${audio.length} bytes)`); + return; + } + + enqueueAudioPost(audio, payload); +} + +async function forwardAudioRef(payload) { + if (!forwardAudio) { + return; + } + if (!fetchAudioRef) { + console.log('[atoms3r-bridge] skipping tts_audio_ref fetch; waiting for direct tts_audio'); + return; + } + if (typeof payload.url !== 'string' || payload.url.trim() === '') { + return; + } + + const advertisedLength = Number.isInteger(payload.byte_length) ? payload.byte_length : null; + if (advertisedLength !== null && estimatedAudioPayloadBytes(advertisedLength) > maxPayloadBytes) { + console.error(`[atoms3r-bridge] skipping audio ref; advertised payload is too large (${advertisedLength} bytes)`); + return; + } + + const audioUrl = new URL(payload.url, faceHttpBase).toString(); + const headers = {}; + if (faceAuthToken) { + headers.Authorization = `Bearer ${faceAuthToken}`; + } + + const response = await fetch(audioUrl, { + headers, + signal: AbortSignal.timeout(8000) + }); + if (!response.ok) { + console.error(`[atoms3r-bridge] audio fetch failed status=${response.status}`); + return; + } + + const contentLength = Number.parseInt(response.headers.get('content-length') ?? 
'', 10); + if (Number.isInteger(contentLength) && contentLength > 0 && estimatedAudioPayloadBytes(contentLength) > maxPayloadBytes) { + console.error(`[atoms3r-bridge] skipping audio ref; fetched payload is too large (${contentLength} bytes)`); + return; + } + + const audio = Buffer.from(await response.arrayBuffer()); + if (estimatedAudioPayloadBytes(audio.length) > maxPayloadBytes) { + console.error(`[atoms3r-bridge] skipping audio ref; fetched payload is too large (${audio.length} bytes)`); + return; + } + + const forwarded = { + ...payload, + type: 'tts_audio', + mime_type: typeof payload.mime_type === 'string' ? payload.mime_type : 'audio/wav', + audio_base64: audio.toString('base64'), + byte_length: audio.length, + ts: Date.now() + }; + delete forwarded.url; + delete forwarded.expires_at; + + enqueuePost(forwarded, 'tts_audio_ref'); +} + +function shouldForwardMouth(payload) { + const open = Number(payload.open); + if (!Number.isFinite(open)) { + return false; + } + const now = Date.now(); + const isClosed = open <= 0.04; + if (!isClosed && now - lastMouthForwardedAt < mouthIntervalMs) { + return false; + } + if (lastMouthOpen !== null && Math.abs(open - lastMouthOpen) < 0.02 && now - lastMouthForwardedAt < 250) { + return false; + } + lastMouthForwardedAt = now; + lastMouthOpen = open; + return true; +} + +function enqueuePost(payload, sourceType) { + const body = JSON.stringify(payload); + const byteLength = Buffer.byteLength(body); + if (byteLength > maxPayloadBytes) { + console.error(`[atoms3r-bridge] skipping ${sourceType}; payload too large (${byteLength} bytes)`); + return; + } + + postChain = postChain + .catch(() => {}) + .then(async () => { + try { + await postPayload(body, sourceType, byteLength); + } catch (error) { + console.error(`[atoms3r-bridge] ${error.message}`); + } + }); +} + +function enqueueAudioPost(audio, payload) { + postChain = postChain + .catch(() => {}) + .then(async () => { + try { + await postAudio(audio, payload); + } catch 
(error) { + console.error(`[atoms3r-bridge] ${error.message}`); + } + }); +} + +async function postPayload(body, sourceType, byteLength) { + const headers = { + 'content-type': 'application/json' + }; + if (atomAuthToken) { + headers['x-headroom-auth'] = atomAuthToken; + } + + const response = await fetch(atomPayloadUrl, { + method: 'POST', + headers, + body, + signal: AbortSignal.timeout(8000) + }); + if (!response.ok) { + const responseText = await response.text().catch(() => ''); + throw new Error(`Atom POST failed type=${sourceType} status=${response.status} body=${responseText.slice(0, 120)}`); + } + + if (sourceType !== 'tts_mouth') { + console.log(`[atoms3r-bridge] forwarded ${sourceType} (${byteLength} bytes)`); + } +} + +async function postAudio(audio, payload) { + const headers = { + 'content-type': typeof payload.mime_type === 'string' && payload.mime_type.trim() !== '' ? payload.mime_type : 'audio/wav' + }; + if (atomAuthToken) { + headers['x-headroom-auth'] = atomAuthToken; + } + if (typeof payload.utterance_id === 'string') { + headers['x-utterance-id'] = payload.utterance_id; + } + if (Number.isInteger(payload.generation)) { + headers['x-generation'] = String(payload.generation); + } + + const response = await fetch(atomAudioUrl, { + method: 'POST', + headers, + body: audio, + signal: AbortSignal.timeout(8000) + }); + if (!response.ok) { + const responseText = await response.text().catch(() => ''); + throw new Error(`Atom audio POST failed status=${response.status} body=${responseText.slice(0, 120)}`); + } + console.log(`[atoms3r-bridge] forwarded tts_audio wav (${audio.length} bytes)`); +} + +async function checkAtomHealth() { + const headers = {}; + if (atomAuthToken) { + headers['x-headroom-auth'] = atomAuthToken; + } + try { + const response = await fetch(atomHealthUrl, { + headers, + signal: AbortSignal.timeout(3000) + }); + if (!response.ok) { + console.error(`[atoms3r-bridge] Atom health status=${response.status}; continuing`); + return; + } + 
console.log('[atoms3r-bridge] Atom health ok'); + } catch (error) { + console.error(`[atoms3r-bridge] Atom health check failed: ${error.message}; continuing`); + } +} + +async function dataToString(data) { + if (typeof data === 'string') { + return data; + } + if (Buffer.isBuffer(data)) { + return data.toString('utf8'); + } + if (data instanceof ArrayBuffer) { + return Buffer.from(data).toString('utf8'); + } + if (ArrayBuffer.isView(data)) { + return Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString('utf8'); + } + if (data && typeof data.text === 'function') { + return data.text(); + } + return String(data ?? ''); +} + +function positiveInt(value, fallback) { + const parsed = Number.parseInt(value ?? '', 10); + return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback; +} + +function estimatedAudioPayloadBytes(audioBytes) { + return Math.ceil((audioBytes * 4) / 3) + 16384; +} + +function normalizeBaseUrl(value) { + const parsed = new URL(value); + parsed.pathname = parsed.pathname === '/' ? '/' : parsed.pathname.replace(/\/+$/, '/'); + parsed.search = ''; + parsed.hash = ''; + return parsed.toString(); +} + +function tokenFromUrl(value) { + try { + const parsed = new URL(value); + return parsed.searchParams.get('auth_token') ?? 
parsed.searchParams.get('token'); + } catch { + return null; + } +} + +function readLocalHeadroomToken() { + const candidates = [ + join(process.cwd(), 'firmware/atoms3r-headroom/include/headroom_config.local.h'), + join(new URL('..', import.meta.url).pathname, 'firmware/atoms3r-headroom/include/headroom_config.local.h') + ]; + for (const file of candidates) { + try { + const text = readFileSync(file, 'utf8'); + const match = text.match(/^\s*#define\s+HEADROOM_AUTH_TOKEN\s+"([^"]+)"/m); + if (match?.[1]) { + return match[1]; + } + } catch {} + } + return null; +} + +function withAuthQuery(value, token) { + if (!token) { + return value; + } + const parsed = new URL(value); + if (!parsed.searchParams.has('auth_token') && !parsed.searchParams.has('token')) { + parsed.searchParams.set('auth_token', token); + } + return parsed.toString(); +} + +function httpBaseFromWsUrl(value) { + const parsed = new URL(value); + parsed.protocol = parsed.protocol === 'wss:' ? 'https:' : 'http:'; + parsed.pathname = '/'; + parsed.search = ''; + parsed.hash = ''; + return parsed.toString(); +} + +function redactUrl(value) { + try { + const parsed = new URL(value); + if (parsed.searchParams.has('auth_token')) { + parsed.searchParams.set('auth_token', ''); + } + if (parsed.searchParams.has('token')) { + parsed.searchParams.set('token', ''); + } + return parsed.toString(); + } catch { + return value; + } +} diff --git a/scripts/restart-operator-stack-in-place.sh b/scripts/restart-operator-stack-in-place.sh index d8d490c..114781b 100755 --- a/scripts/restart-operator-stack-in-place.sh +++ b/scripts/restart-operator-stack-in-place.sh @@ -10,7 +10,7 @@ STACK_CMD="./scripts/run-operator-stack.sh" PROFILE_NAME="default" STACK_CMD_SET=0 FACE_UI_MODE="" -FACE_AUDIO_TARGET="" +FACE_AUDIO_TARGET="${MH_FACE_AUDIO_TARGET:-both}" ASR_BASE_URL="" OPERATOR_FACE_AGENT_ID="${MH_OPERATOR_FACE_AGENT_ID:-__operator__}" OPERATOR_FACE_AGENT_LABEL="${MH_OPERATOR_FACE_AGENT_LABEL:-Operator}" @@ -182,11 +182,19 @@ 
stack_pane="$(tmux display-message -p -t "${SESSION_NAME}:${WINDOW_NAME}.1" '#{p agent_cwd="$(tmux display-message -p -t "${SESSION_NAME}:${WINDOW_NAME}.0" '#{pane_current_path}' 2>/dev/null || true)" agent_repo_root="$(derive_agent_repo_root "$agent_cwd")" -if [[ -z "$agent_pane" || -z "$stack_pane" || -z "$agent_cwd" ]]; then - echo "[restart-operator-stack] expected panes .0 (agent) and .1 (stack) in ${SESSION_NAME}:${WINDOW_NAME}" >&2 +if [[ -z "$agent_pane" || -z "$agent_cwd" ]]; then + echo "[restart-operator-stack] expected pane .0 (agent) in ${SESSION_NAME}:${WINDOW_NAME}" >&2 exit 2 fi +if [[ -z "$stack_pane" ]]; then + stack_pane="$(tmux split-window -h -t "$agent_pane" -c "$agent_cwd" -P -F '#{pane_id}' 2>/dev/null || true)" + if [[ -z "$stack_pane" ]]; then + echo "[restart-operator-stack] failed to create missing stack pane in ${SESSION_NAME}:${WINDOW_NAME}" >&2 + exit 2 + fi +fi + stack_launch="env" append_env() { local key="$1" diff --git a/scripts/run-stackchan-sidecar.sh b/scripts/run-stackchan-sidecar.sh new file mode 100755 index 0000000..95c1036 --- /dev/null +++ b/scripts/run-stackchan-sidecar.sh @@ -0,0 +1,208 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +cd "$ROOT_DIR" + +: "${STACKCHAN_START_ASR_WORKER:=1}" +: "${STACKCHAN_START_LLM:=1}" +: "${STACKCHAN_ASR_WORKER_HOST:=127.0.0.1}" +: "${STACKCHAN_ASR_WORKER_PORT:=8091}" +: "${STACKCHAN_ASR_ADAPTER_HOST:=0.0.0.0}" +: "${STACKCHAN_ASR_ADAPTER_PORT:=8081}" +: "${STACKCHAN_TTS_ADAPTER_HOST:=0.0.0.0}" +: "${STACKCHAN_TTS_ADAPTER_PORT:=5000}" +: "${STACKCHAN_KOKORO_VOICE:=af_heart}" +: "${STACKCHAN_ASR_DEVICE:=cuda}" +ASR_DEVICE="$STACKCHAN_ASR_DEVICE" +: "${ASR_SINGLE_MODEL_CACHE:=true}" +: "${ASR_PRELOAD_MODELS:=false}" +: "${ASR_MODEL_JA:=nvidia/parakeet-tdt_ctc-0.6b-ja}" +: "${ASR_MODEL_EN:=nvidia/parakeet-tdt-0.6b-v2}" +: "${ASR_MODEL_FAST:=nvidia/parakeet-tdt-0.6b-v2}" +: "${LLAMA_HOST:=0.0.0.0}" +: "${LLAMA_PORT:=8080}" +: "${LLAMA_CTX_SIZE:=8192}" +: "${LLAMA_PARALLEL:=1}" +: "${LLAMA_GPU_LAYERS:=-1}" +: "${LLAMA_FLASH_ATTN:=on}" +: "${LLAMA_JINJA:=1}" +: "${LLAMA_REASONING:=off}" +: "${LLAMA_THREADS:=}" +: "${LLAMA_EXTRA_ARGS:=}" + +DEFAULT_QWEN_MODEL="$HOME/models/unsloth/Qwen3.6-35B-A3B/Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf" +if [[ -z "${STACKCHAN_LLM_MODEL_PATH:-}" && -f "$DEFAULT_QWEN_MODEL" ]]; then + STACKCHAN_LLM_MODEL_PATH="$DEFAULT_QWEN_MODEL" +fi + +declare -a PIDS=() +declare -A NAMES=() + +cleanup() { + for pid in "${PIDS[@]:-}"; do + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" >/dev/null 2>&1 || true + fi + done +} + +trap cleanup EXIT INT TERM + +start_proc() { + local name="$1" + shift + "$@" & + local pid=$! 
+ PIDS+=("$pid") + NAMES["$pid"]="$name" + echo "[stackchan-sidecar] started ${name} (pid=${pid})" +} + +resolve_llama_server() { + if [[ -n "${LLAMA_SERVER_BIN:-}" ]]; then + printf '%s\n' "$LLAMA_SERVER_BIN" + return 0 + fi + + local candidates=( + "$ROOT_DIR/../llama.cpp/build/bin/llama-server" + "$ROOT_DIR/llama.cpp/build/bin/llama-server" + "$HOME/github/llama.cpp/build/bin/llama-server" + "$HOME/.local/llama.cpp/build/bin/llama-server" + ) + local candidate + for candidate in "${candidates[@]}"; do + if [[ -x "$candidate" ]]; then + printf '%s\n' "$candidate" + return 0 + fi + done + + if command -v llama-server >/dev/null 2>&1; then + command -v llama-server + return 0 + fi + + return 1 +} + +detect_host_ip() { + local ip + ip="$(hostname -I 2>/dev/null | awk '{print $1}')" + if [[ -n "$ip" ]]; then + printf '%s\n' "$ip" + else + printf '127.0.0.1\n' + fi +} + +LAN_HOST="$(detect_host_ip)" +ASR_BASE_URL="http://${STACKCHAN_ASR_WORKER_HOST}:${STACKCHAN_ASR_WORKER_PORT}" + +echo "[stackchan-sidecar] root=${ROOT_DIR}" +echo "[stackchan-sidecar] LAN host guess=${LAN_HOST}" +echo "[stackchan-sidecar] ASR device=${ASR_DEVICE}" + +if [[ "$STACKCHAN_START_ASR_WORKER" == "1" ]]; then + start_proc "asr-worker" \ + env ASR_HOST="$STACKCHAN_ASR_WORKER_HOST" ASR_PORT="$STACKCHAN_ASR_WORKER_PORT" \ + ASR_DEVICE="$ASR_DEVICE" \ + ASR_SINGLE_MODEL_CACHE="$ASR_SINGLE_MODEL_CACHE" \ + ASR_PRELOAD_MODELS="$ASR_PRELOAD_MODELS" \ + ASR_MODEL_JA="$ASR_MODEL_JA" \ + ASR_MODEL_EN="$ASR_MODEL_EN" \ + ASR_MODEL_FAST="$ASR_MODEL_FAST" \ + ./scripts/run-asr-worker.sh +else + echo "[stackchan-sidecar] skipping asr-worker startup (STACKCHAN_START_ASR_WORKER=0)" +fi + +start_proc "stackchan-asr-adapter" \ + python3 integrations/stackchan-minimal/stackchan_asr_adapter.py \ + --host "$STACKCHAN_ASR_ADAPTER_HOST" \ + --port "$STACKCHAN_ASR_ADAPTER_PORT" \ + --asr-base-url "$ASR_BASE_URL" \ + --language ja + +start_proc "stackchan-tts-adapter" \ + uv run --project tts-worker python 
"$ROOT_DIR/integrations/stackchan-minimal/stackchan_tts_adapter.py" \ + --host "$STACKCHAN_TTS_ADAPTER_HOST" \ + --port "$STACKCHAN_TTS_ADAPTER_PORT" \ + --voice "$STACKCHAN_KOKORO_VOICE" \ + --repo-root "$ROOT_DIR" + +if [[ "$STACKCHAN_START_LLM" == "1" ]]; then + if [[ -z "${STACKCHAN_LLM_MODEL_PATH:-}" ]]; then + echo "[stackchan-sidecar] STACKCHAN_LLM_MODEL_PATH is required when STACKCHAN_START_LLM=1" >&2 + exit 2 + fi + if [[ ! -f "$STACKCHAN_LLM_MODEL_PATH" ]]; then + echo "[stackchan-sidecar] model file not found: $STACKCHAN_LLM_MODEL_PATH" >&2 + exit 2 + fi + if ! LLAMA_BIN="$(resolve_llama_server)"; then + echo "[stackchan-sidecar] llama-server not found. Set LLAMA_SERVER_BIN=/path/to/llama-server." >&2 + exit 2 + fi + + declare -a llama_cmd=( + "$LLAMA_BIN" + -m "$STACKCHAN_LLM_MODEL_PATH" + --host "$LLAMA_HOST" + --port "$LLAMA_PORT" + -c "$LLAMA_CTX_SIZE" + --parallel "$LLAMA_PARALLEL" + --flash-attn "$LLAMA_FLASH_ATTN" + -ngl "$LLAMA_GPU_LAYERS" + --reasoning "$LLAMA_REASONING" + ) + if [[ "$LLAMA_JINJA" == "1" || "${LLAMA_JINJA,,}" == "true" || "${LLAMA_JINJA,,}" == "yes" || "${LLAMA_JINJA,,}" == "on" ]]; then + llama_cmd+=(--jinja) + fi + if [[ -n "$LLAMA_THREADS" ]]; then + llama_cmd+=(-t "$LLAMA_THREADS") + fi + if [[ -n "$LLAMA_EXTRA_ARGS" ]]; then + read -r -a extra_args <<< "$LLAMA_EXTRA_ARGS" + llama_cmd+=("${extra_args[@]}") + fi + + start_proc "llama-server" "${llama_cmd[@]}" +else + echo "[stackchan-sidecar] skipping llama-server startup (STACKCHAN_START_LLM=0)" +fi + +cat </dev/null; then + exited_pid="$pid" + break + fi + done + + if [[ -n "$exited_pid" ]]; then + echo "[stackchan-sidecar] ${NAMES[$exited_pid]:-service} exited (pid=${exited_pid}, code=${exit_code}). stopping others." 
+ break + fi +done + +exit "$exit_code" From 17661ec7930a2b601c991805d3b5f0c54b853fa0 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 08:51:34 +0900 Subject: [PATCH 2/9] AtomS3R: flatten closed-eye arc and recenter face MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Closed "∪" eyelid: larger radius (15->19) with a narrower sweep (25..155 -> 45..135) for a gentler, flatter curve, dropped a touch lower. - Shift the whole composed face down by kFaceOffsetY (4px) so the head looks vertically centered despite the visual weight of the hair. Co-Authored-By: Claude Opus 4.7 --- .../atoms3r-headroom/src/face_renderer.cpp | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/firmware/atoms3r-headroom/src/face_renderer.cpp b/firmware/atoms3r-headroom/src/face_renderer.cpp index d1d0ae3..42149a6 100644 --- a/firmware/atoms3r-headroom/src/face_renderer.cpp +++ b/firmware/atoms3r-headroom/src/face_renderer.cpp @@ -24,6 +24,12 @@ int displayRotationForDegrees(int degrees) { } } +// Shift the whole composed face downward by this many pixels. The head art is +// geometrically centered, but the hair adds visual weight up top, so nudging +// everything down makes the head look centered. The bottom rows clipped by the +// offset are background-only (head art ends well above the screen edge). 
+constexpr int kFaceOffsetY = 4; + void drawThickLine(M5Canvas& canvas, int x0, int y0, int x1, int y1, uint16_t color) { for (int offset = -2; offset <= 2; ++offset) { canvas.drawLine(x0, y0 + offset, x1, y1 + offset, color); @@ -82,8 +88,9 @@ void HeadroomFaceRenderer::setRotationDegrees(int rotationDegrees) { } void HeadroomFaceRenderer::draw(const HeadroomFaceState& state) { + uint16_t background = backgroundFor(state); canvas_.startWrite(); - canvas_.fillScreen(backgroundFor(state)); + canvas_.fillScreen(background); drawHeadBase(state); drawBrows(state); drawEyes(state); @@ -95,7 +102,11 @@ void HeadroomFaceRenderer::draw(const HeadroomFaceState& state) { canvas_.fillCircle(width_ - 10, 10, 3, TFT_GREEN); } canvas_.endWrite(); - canvas_.pushSprite(0, 0); + // Clear the strip exposed above the shifted sprite, then push it down. + if (kFaceOffsetY > 0) { + M5.Display.fillRect(0, 0, width_, kFaceOffsetY, background); + } + canvas_.pushSprite(0, kFaceOffsetY); } void HeadroomFaceRenderer::drawHeadBase(const HeadroomFaceState& state) { @@ -167,9 +178,11 @@ void HeadroomFaceRenderer::drawBrows(const HeadroomFaceState& state) { void HeadroomFaceRenderer::drawClosedEyeArc(int centerX, int eyeCenterY, uint16_t color) { // Downward-convex "∪" eyelid arc (a dark lash line), not a white sliver. - const int radius = 15; - const int arcCenterY = eyeCenterY - 13; // bottom of the arc sits at the eye center - canvas_.fillArc(centerX, arcCenterY, radius, radius - 4, 25.0f, 155.0f, color); + // Larger radius + narrower sweep = a gentler, flatter curve; the slightly + // smaller upward offset drops the whole arc a touch lower on the face. 
+ const int radius = 19; + const int arcCenterY = eyeCenterY - 16; // a little lower than before + canvas_.fillArc(centerX, arcCenterY, radius, radius - 4, 45.0f, 135.0f, color); } void HeadroomFaceRenderer::drawEyes(const HeadroomFaceState& state) { From 3c7d3b4a89854897654f72f4fe1da891f91693c9 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 08:53:47 +0900 Subject: [PATCH 3/9] face-app: serve TTS audio by HTTP reference for non-browser sinks - Add tts_audio_store: TTL-bounded server-side store of generated TTS audio, exposed over HTTP with a lightweight WS reference payload and WAV-duration parsing. - tts_controller: always stash audio and broadcast a reference; only broadcast the base64 body when browser audio is enabled (previously audio was dropped entirely when browser audio was off), so the AtomS3R/Echo Base bridge and Stack-chan sidecar can fetch it by URL. - index.js: wire the store (MH_TTS_AUDIO_REF_TTL_MS, default 60s) into the HTTP router and controller; clear it on worker stop. - package.json: add stackchan:run and atoms3r:bridge scripts. - .gitignore: ignore .venv-platformio/ and esp-web-tools-logs.txt. - Tests for the store (reference metadata, TTL expiry, WAV duration) and the controller reference path. 
Co-Authored-By: Claude Opus 4.7 --- .gitignore | 2 + face-app/dist/index.js | 8 + face-app/dist/tts_audio_store.js | 212 +++++++++++++++++++++++++ face-app/dist/tts_controller.js | 61 ++++--- package.json | 2 + test/face-app/tts_audio_store.test.mjs | 104 ++++++++++++ test/face-app/tts_controller.test.mjs | 92 +++++++++++ 7 files changed, 463 insertions(+), 18 deletions(-) create mode 100644 face-app/dist/tts_audio_store.js create mode 100644 test/face-app/tts_audio_store.test.mjs diff --git a/.gitignore b/.gitignore index 89edb28..3b2cf0a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,11 +2,13 @@ node_modules/ npm-debug.log* *.log +esp-web-tools-logs.txt # Python caches / virtualenv __pycache__/ .pytest_cache/ .venv/ +.venv-platformio/ .venv-vllm/ .venv-qwen-tts/ tts-worker/.venv/ diff --git a/face-app/dist/index.js b/face-app/dist/index.js index ae2e084..c3b912c 100644 --- a/face-app/dist/index.js +++ b/face-app/dist/index.js @@ -4,6 +4,7 @@ import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { startFaceWebSocketServer } from './ws_server.js'; import { createTtsController } from './tts_controller.js'; +import { createTtsAudioStore } from './tts_audio_store.js'; import { loadFaceAppConfig } from './config_loader.js'; import { resolveBrowserAudioMaxChannels } from './browser_audio_config.js'; import { createOperatorAsrProxy } from './operator_asr_proxy.js'; @@ -242,6 +243,9 @@ const operatorAsrProxy = createOperatorAsrProxy({ let operatorRealtimeAsrProxy = null; let ttsController = null; +const ttsAudioStore = createTtsAudioStore({ + ttlMs: Number.parseInt(process.env.MH_TTS_AUDIO_REF_TTL_MS ?? 
'60000', 10) +}); function normalizeSayPayload(payload) { const normalized = { ...payload }; @@ -417,6 +421,9 @@ const server = await startFaceWebSocketServer({ }); return true; } + if (ttsAudioStore.handleHttpRequest(request, response)) { + return true; + } return operatorAsrProxy.handleHttpRequest(request, response); }, log: console @@ -461,6 +468,7 @@ if (ttsEnabled) { broadcast(payload) { return server.broadcast(payload); }, + audioStore: ttsAudioStore, defaultTtlMs: faceConfig.tts.defaultTtlMs, autoInterruptAfterMs: faceConfig.tts.autoInterruptAfterMs, qwenBoundarySpeaker: process.env.MH_QWEN_TTS_BOUNDARY_SPEAKER ?? 'Ono_Anna', diff --git a/face-app/dist/tts_audio_store.js b/face-app/dist/tts_audio_store.js new file mode 100644 index 0000000..578e1f8 --- /dev/null +++ b/face-app/dist/tts_audio_store.js @@ -0,0 +1,212 @@ +import { randomUUID } from 'node:crypto'; + +function toNow(options) { + return typeof options.now === 'function' ? options.now : () => Date.now(); +} + +function normalizeTtlMs(value, fallbackMs = 60_000) { + if (!Number.isInteger(value)) { + return fallbackMs; + } + return Math.max(1, value); +} + +function normalizeMimeType(value) { + return typeof value === 'string' && value.trim() !== '' ? value.trim() : 'audio/wav'; +} + +function parseWavDurationMs(buffer, sampleRateFallback = null) { + if (!Buffer.isBuffer(buffer) || buffer.length < 44) { + return null; + } + if (buffer.toString('ascii', 0, 4) !== 'RIFF' || buffer.toString('ascii', 8, 12) !== 'WAVE') { + return null; + } + + let sampleRate = Number.isInteger(sampleRateFallback) && sampleRateFallback > 0 ? 
sampleRateFallback : null; + let byteRate = null; + let dataBytes = null; + let offset = 12; + + while (offset + 8 <= buffer.length) { + const chunkId = buffer.toString('ascii', offset, offset + 4); + const chunkSize = buffer.readUInt32LE(offset + 4); + const chunkStart = offset + 8; + if (chunkStart + chunkSize > buffer.length) { + break; + } + + if (chunkId === 'fmt ' && chunkSize >= 16) { + sampleRate = buffer.readUInt32LE(chunkStart + 4); + byteRate = buffer.readUInt32LE(chunkStart + 8); + } else if (chunkId === 'data') { + dataBytes = chunkSize; + break; + } + + offset = chunkStart + chunkSize + (chunkSize % 2); + } + + if (Number.isInteger(byteRate) && byteRate > 0 && Number.isInteger(dataBytes)) { + return Math.round((dataBytes / byteRate) * 1000); + } + if (Number.isInteger(sampleRate) && sampleRate > 0 && Number.isInteger(dataBytes)) { + return Math.round((dataBytes / (sampleRate * 2)) * 1000); + } + return null; +} + +function writeJson(response, statusCode, payload) { + response.writeHead(statusCode, { + 'content-type': 'application/json; charset=utf-8', + 'cache-control': 'no-store' + }); + response.end(JSON.stringify(payload)); +} + +export function createTtsAudioStore(options = {}) { + const now = toNow(options); + const ttlMs = normalizeTtlMs(options.ttlMs, 60_000); + const entries = new Map(); + + function prune(atMs = now()) { + for (const [id, entry] of entries) { + if (atMs > entry.expiresAt) { + entries.delete(id); + } + } + } + + function putAudio({ + audioBase64, + mimeType = 'audio/wav', + sampleRate = null, + sessionId = '-', + utteranceId = null, + generation = null, + messageId = null, + revision = null, + agentId = null, + agentLabel = null + }) { + if (typeof audioBase64 !== 'string' || audioBase64.trim() === '') { + return null; + } + prune(); + + const audio = Buffer.from(audioBase64, 'base64'); + const id = randomUUID(); + const createdAt = now(); + const entry = { + id, + audio, + mimeType: normalizeMimeType(mimeType), + sampleRate: 
Number.isInteger(sampleRate) ? sampleRate : null, + byteLength: audio.length, + durationMs: parseWavDurationMs(audio, sampleRate), + sessionId, + utteranceId, + generation, + messageId, + revision, + agentId, + agentLabel, + createdAt, + expiresAt: createdAt + ttlMs + }; + entries.set(id, entry); + return entry; + } + + function get(id) { + prune(); + const entry = entries.get(id); + if (!entry) { + return null; + } + if (now() > entry.expiresAt) { + entries.delete(id); + return null; + } + return entry; + } + + function deleteAudio(id) { + return entries.delete(id); + } + + function clear() { + entries.clear(); + } + + function toReferencePayload(entry, { basePath = '/api/tts/audio' } = {}) { + if (!entry) { + return null; + } + const url = `${basePath}/${entry.id}.wav`; + return { + v: 1, + type: 'tts_audio_ref', + session_id: entry.sessionId, + ...(entry.agentId ? { agent_id: entry.agentId } : {}), + ...(entry.agentLabel ? { agent_label: entry.agentLabel } : {}), + utterance_id: entry.utteranceId, + generation: entry.generation, + message_id: entry.messageId, + revision: entry.revision, + mime_type: entry.mimeType, + sample_rate: entry.sampleRate, + byte_length: entry.byteLength, + duration_ms: entry.durationMs, + expires_at: entry.expiresAt, + url, + ts: now() + }; + } + + function handleHttpRequest(request, response) { + const parsedUrl = new URL(request.url ?? 
'/', 'http://127.0.0.1'); + const match = parsedUrl.pathname.match(/^\/api\/tts\/audio\/([0-9a-fA-F-]+)\.wav$/); + if (!match) { + return false; + } + if (request.method !== 'GET' && request.method !== 'HEAD') { + writeJson(response, 405, { ok: false, error: 'method_not_allowed' }); + return true; + } + + const entry = get(match[1]); + if (!entry) { + writeJson(response, 410, { ok: false, error: 'audio_expired' }); + return true; + } + + response.writeHead(200, { + 'content-type': entry.mimeType, + 'content-length': String(entry.byteLength), + 'cache-control': 'no-store', + 'x-utterance-id': entry.utteranceId ?? '', + 'x-generation': Number.isInteger(entry.generation) ? String(entry.generation) : '' + }); + if (request.method === 'HEAD') { + response.end(); + return true; + } + response.end(entry.audio); + return true; + } + + return { + putAudio, + get, + deleteAudio, + prune, + clear, + toReferencePayload, + handleHttpRequest, + size() { + prune(); + return entries.size; + } + }; +} diff --git a/face-app/dist/tts_controller.js b/face-app/dist/tts_controller.js index 56ad9f1..b77aa7e 100644 --- a/face-app/dist/tts_controller.js +++ b/face-app/dist/tts_controller.js @@ -248,6 +248,7 @@ export function createTtsController(options = {}) { const log = toLogger(options.log ?? console); const now = typeof options.now === 'function' ? options.now : () => Date.now(); const broadcast = typeof options.broadcast === 'function' ? options.broadcast : () => false; + const audioStore = options.audioStore ?? null; const audioTarget = normalizeAudioTarget(options.audioTarget); const browserAudioEnabled = audioTarget === 'browser' || audioTarget === 'both'; const defaultTtlMs = Number.isInteger(options.defaultTtlMs) ? 
Math.max(1, options.defaultTtlMs) : 60_000; @@ -545,9 +546,6 @@ export function createTtsController(options = {}) { } if (message.type === 'audio') { - if (!browserAudioEnabled) { - return; - } if (!active || !Number.isInteger(message.generation) || message.generation !== active.generation) { return; } @@ -555,21 +553,45 @@ export function createTtsController(options = {}) { return; } - broadcast({ - v: 1, - type: 'tts_audio', - session_id: active.sessionId, - ...(active.agentId ? { agent_id: active.agentId } : {}), - ...(active.agentLabel ? { agent_label: active.agentLabel } : {}), - utterance_id: active.utteranceId, - generation: active.generation, - message_id: active.messageId, - revision: active.revision, - mime_type: typeof message.mime_type === 'string' ? message.mime_type : 'audio/wav', - audio_base64: message.audio_base64, - sample_rate: Number.isInteger(message.sample_rate) ? message.sample_rate : null, - ts: now() - }); + const mimeType = typeof message.mime_type === 'string' ? message.mime_type : 'audio/wav'; + const sampleRate = Number.isInteger(message.sample_rate) ? message.sample_rate : null; + + if (audioStore && typeof audioStore.putAudio === 'function' && typeof audioStore.toReferencePayload === 'function') { + const entry = audioStore.putAudio({ + audioBase64: message.audio_base64, + mimeType, + sampleRate, + sessionId: active.sessionId, + agentId: active.agentId, + agentLabel: active.agentLabel, + utteranceId: active.utteranceId, + generation: active.generation, + messageId: active.messageId, + revision: active.revision + }); + const refPayload = audioStore.toReferencePayload(entry); + if (refPayload) { + broadcast(refPayload); + } + } + + if (browserAudioEnabled) { + broadcast({ + v: 1, + type: 'tts_audio', + session_id: active.sessionId, + ...(active.agentId ? { agent_id: active.agentId } : {}), + ...(active.agentLabel ? 
{ agent_label: active.agentLabel } : {}), + utterance_id: active.utteranceId, + generation: active.generation, + message_id: active.messageId, + revision: active.revision, + mime_type: mimeType, + audio_base64: message.audio_base64, + sample_rate: sampleRate, + ts: now() + }); + } return; } @@ -774,6 +796,9 @@ export function createTtsController(options = {}) { } worker.stop(); + if (audioStore && typeof audioStore.clear === 'function') { + audioStore.clear(); + } } return { diff --git a/package.json b/package.json index fa6062e..75cb9f3 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,8 @@ "mcp-server:run": "./scripts/run-mcp-server.sh", "tts-worker:run": "./scripts/run-tts-worker.sh", "tts-worker:smoke": "./scripts/run-tts-worker.sh --smoke", + "stackchan:run": "./scripts/run-stackchan-sidecar.sh", + "atoms3r:bridge": "node scripts/atoms3r-http-bridge.mjs", "test": "node --test" } } diff --git a/test/face-app/tts_audio_store.test.mjs b/test/face-app/tts_audio_store.test.mjs new file mode 100644 index 0000000..1d0d76d --- /dev/null +++ b/test/face-app/tts_audio_store.test.mjs @@ -0,0 +1,104 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; +import { createTtsAudioStore } from '../../face-app/dist/tts_audio_store.js'; +import { startFaceWebSocketServer } from '../../face-app/dist/ws_server.js'; + +function wavBase64() { + const data = Buffer.from([0, 0, 1, 0]); + const header = Buffer.alloc(44); + header.write('RIFF', 0, 'ascii'); + header.writeUInt32LE(36 + data.length, 4); + header.write('WAVE', 8, 'ascii'); + header.write('fmt ', 12, 'ascii'); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(1, 22); + header.writeUInt32LE(24_000, 24); + header.writeUInt32LE(48_000, 28); + header.writeUInt16LE(2, 32); + header.writeUInt16LE(16, 34); + header.write('data', 36, 'ascii'); + header.writeUInt32LE(data.length, 40); + return Buffer.concat([header, data]).toString('base64'); +} + +test('tts audio 
store returns reference metadata and expires entries', () => { + let nowMs = 10_000; + const store = createTtsAudioStore({ now: () => nowMs, ttlMs: 1_000 }); + + const entry = store.putAudio({ + audioBase64: wavBase64(), + mimeType: 'audio/wav', + sampleRate: 24_000, + sessionId: 's1', + utteranceId: 'u1', + generation: 7, + messageId: 'm1', + revision: 123 + }); + assert.ok(entry); + + const ref = store.toReferencePayload(entry); + assert.equal(ref.type, 'tts_audio_ref'); + assert.equal(ref.session_id, 's1'); + assert.equal(ref.utterance_id, 'u1'); + assert.equal(ref.generation, 7); + assert.equal(ref.mime_type, 'audio/wav'); + assert.equal(ref.sample_rate, 24_000); + assert.equal(ref.byte_length, 48); + assert.equal(ref.duration_ms, 0); + assert.match(ref.url, /^\/api\/tts\/audio\/[0-9a-f-]+\.wav$/); + assert.equal(store.size(), 1); + + nowMs += 1_001; + assert.equal(store.get(entry.id), null); + assert.equal(store.size(), 0); +}); + +test('tts audio endpoint supports authenticated HEAD and GET', async (t) => { + const store = createTtsAudioStore({ ttlMs: 60_000 }); + const entry = store.putAudio({ + audioBase64: wavBase64(), + mimeType: 'audio/wav', + sampleRate: 24_000, + sessionId: 's1', + utteranceId: 'u1', + generation: 1 + }); + + const server = await startFaceWebSocketServer({ + host: '127.0.0.1', + port: 0, + path: '/ws', + authToken: 'secret-token', + requireOriginCheck: true, + relayPayloads: false, + onHttpRequest(request, response) { + return store.handleHttpRequest(request, response); + }, + log: { info: () => {}, error: () => {} } + }); + + t.after(async () => { + await server.stop(); + }); + + const url = `${server.httpUrl}api/tts/audio/${entry.id}.wav`; + const denied = await fetch(url); + assert.equal(denied.status, 401); + + const head = await fetch(url, { + method: 'HEAD', + headers: { authorization: 'Bearer secret-token' } + }); + assert.equal(head.status, 200); + assert.equal(head.headers.get('content-type'), 'audio/wav'); + 
assert.equal(head.headers.get('content-length'), '48'); + + const get = await fetch(url, { + headers: { authorization: 'Bearer secret-token' } + }); + assert.equal(get.status, 200); + assert.equal(get.headers.get('cache-control'), 'no-store'); + assert.equal(Buffer.from(await get.arrayBuffer()).length, 48); +}); diff --git a/test/face-app/tts_controller.test.mjs b/test/face-app/tts_controller.test.mjs index 5dc5169..4448c86 100644 --- a/test/face-app/tts_controller.test.mjs +++ b/test/face-app/tts_controller.test.mjs @@ -523,6 +523,98 @@ test('tts controller relays worker audio payload when browser audio is enabled', assert.equal(relayed.audio_base64, 'ZmFrZQ=='); }); +test('tts controller broadcasts audio reference when audio store is configured', async () => { + const worker = new FakeWorker(); + const broadcasts = []; + const stored = []; + const controller = createTtsController({ + worker, + now: () => 32_000, + audioTarget: 'browser', + audioStore: { + putAudio(payload) { + stored.push(payload); + return { + id: 'audio-1', + sessionId: payload.sessionId, + agentId: payload.agentId, + agentLabel: payload.agentLabel, + utteranceId: payload.utteranceId, + generation: payload.generation, + messageId: payload.messageId, + revision: payload.revision, + mimeType: payload.mimeType, + sampleRate: payload.sampleRate, + byteLength: 4, + durationMs: null, + expiresAt: 92_000 + }; + }, + toReferencePayload(entry) { + return { + v: 1, + type: 'tts_audio_ref', + session_id: entry.sessionId, + utterance_id: entry.utteranceId, + generation: entry.generation, + message_id: entry.messageId, + revision: entry.revision, + mime_type: entry.mimeType, + sample_rate: entry.sampleRate, + byte_length: entry.byteLength, + duration_ms: entry.durationMs, + expires_at: entry.expiresAt, + url: `/api/tts/audio/${entry.id}.wav`, + ts: 32_000 + }; + } + }, + gate: { check: () => ({ allow: true }) }, + broadcast(payload) { + broadcasts.push(payload); + return true; + }, + log: { info: () => {}, 
warn: () => {}, error: () => {} } + }); + + worker.emit('message', { type: 'ready', voice: 'af_heart', engine: 'kokoro', playback_backend: 'silent' }); + await controller.handleSayPayload({ + type: 'say', + session_id: 's1', + utterance_id: 'u1', + message_id: 'm-1', + revision: 123, + agent_id: '__operator__', + text: 'browser audio', + priority: 2, + policy: 'replace', + ttl_ms: 4_000, + ts: 32_000 + }); + + worker.emit('message', { + type: 'audio', + generation: 1, + mime_type: 'audio/wav', + sample_rate: 24_000, + audio_base64: 'ZmFrZQ==' + }); + + assert.equal(stored.length, 1); + assert.equal(stored[0].audioBase64, 'ZmFrZQ=='); + assert.equal(stored[0].agentId, '__operator__'); + + const ref = broadcasts.find((payload) => payload.type === 'tts_audio_ref'); + assert.ok(ref); + assert.equal(ref.url, '/api/tts/audio/audio-1.wav'); + assert.equal(ref.message_id, 'm-1'); + assert.equal(ref.sample_rate, 24_000); + + const base64 = broadcasts.find((payload) => payload.type === 'tts_audio'); + assert.ok(base64); + assert.equal(base64.audio_base64, 'ZmFrZQ=='); +}); + test('tts controller does not relay worker audio payload in local-only mode', async () => { const worker = new FakeWorker(); const broadcasts = []; From 2d26c1c1dfe3cc2e0906be0159f5236bad3311bf Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 10:06:01 +0900 Subject: [PATCH 4/9] face-app: chunk long TTS into sentence-bounded FIFO (Atom long-reply fix) Long agent replies exceeded the AtomS3R firmware's per-utterance base64/HTTP WAV cap, so the audio was dropped while the mouth kept animating from the independent tts_mouth stream ("mouth-only, no sound" on long local-LLM output). - Add segmentTtsText: JA/EN hard sentence boundaries, late comma soft split, greedy packing, default 120 chars (MH_TTS_CHUNK_MAX_CHARS). Text <= limit is returned verbatim (unchanged single-chunk path). 
- Replace the single `pending` slot with a real ordered FIFO queue; one logical utterance occupies it, a newer say flushes the remainder, interrupt/auto-interrupt/stop clear the whole queue. - Each chunk is its own worker `speak` with the parent generation and a #k/N utterance/message suffix, dispatched sequentially on play_stop, keeping every WAV under the Atom size cap. - Tests: segmentTtsText units + sequential-dispatch + interrupt-flush; full node --test suite green (333). Step 1 of 3 (Step 2: PTT-clear wiring; Step 3: firmware playback queue). Co-Authored-By: Claude Opus 4.7 --- face-app/dist/index.js | 75 ++++++++++++ face-app/dist/tts_controller.js | 161 ++++++++++++++++++++++---- test/face-app/tts_controller.test.mjs | 113 +++++++++++++++++- 3 files changed, 328 insertions(+), 21 deletions(-) diff --git a/face-app/dist/index.js b/face-app/dist/index.js index c3b912c..f4d9739 100644 --- a/face-app/dist/index.js +++ b/face-app/dist/index.js @@ -67,6 +67,32 @@ function writeJson(response, statusCode, payload) { response.end(JSON.stringify(payload)); } +async function readJsonRequestBody(request, { maxBytes = 32_768 } = {}) { + const chunks = []; + let byteLength = 0; + for await (const chunk of request) { + byteLength += chunk.length; + if (byteLength > maxBytes) { + const error = new Error('request_body_too_large'); + error.code = 'request_body_too_large'; + throw error; + } + chunks.push(chunk); + } + if (byteLength === 0) { + const error = new Error('empty_body'); + error.code = 'empty_body'; + throw error; + } + try { + return JSON.parse(Buffer.concat(chunks, byteLength).toString('utf8')); + } catch { + const error = new Error('invalid_json'); + error.code = 'invalid_json'; + throw error; + } +} + function normalizeOptionalString(value) { if (typeof value !== 'string') { return null; @@ -398,6 +424,54 @@ const server = await startFaceWebSocketServer({ }); return true; } + if (parsedUrl.pathname === '/api/operator/response') { + if (request.method !== 
'POST') { + writeJson(response, 405, { + ok: false, + error: 'method_not_allowed' + }); + return true; + } + let payload = null; + try { + payload = await readJsonRequestBody(request); + } catch (error) { + writeJson(response, error.code === 'request_body_too_large' ? 413 : 400, { + ok: false, + error: error.code ?? 'invalid_request_body' + }); + return true; + } + if (!payload || payload.type !== 'operator_response') { + writeJson(response, 400, { + ok: false, + error: 'invalid_operator_response' + }); + return true; + } + if (typeof payload.value !== 'string' || payload.value.trim() === '') { + writeJson(response, 400, { + ok: false, + error: 'empty_value' + }); + return true; + } + const normalized = { + ...payload, + v: payload.v ?? 1, + type: 'operator_response', + session_id: normalizeSessionId(payload), + response_kind: typeof payload.response_kind === 'string' ? payload.response_kind : 'text', + value: payload.value.trim(), + source: typeof payload.source === 'string' && payload.source.trim() !== '' ? payload.source.trim() : 'http', + ts: Date.now() + }; + server.broadcast(normalized); + writeJson(response, 202, { + ok: true + }); + return true; + } if (parsedUrl.pathname === '/api/operator/ui-config') { writeJson(response, 200, { ok: true, @@ -472,6 +546,7 @@ if (ttsEnabled) { defaultTtlMs: faceConfig.tts.defaultTtlMs, autoInterruptAfterMs: faceConfig.tts.autoInterruptAfterMs, qwenBoundarySpeaker: process.env.MH_QWEN_TTS_BOUNDARY_SPEAKER ?? 'Ono_Anna', + maxChunkChars: Number.parseInt(process.env.MH_TTS_CHUNK_MAX_CHARS ?? '120', 10), gateConfig: faceConfig.speechGate, workerCwd: repoRoot, workerEnv: { diff --git a/face-app/dist/tts_controller.js b/face-app/dist/tts_controller.js index b77aa7e..bc59a0a 100644 --- a/face-app/dist/tts_controller.js +++ b/face-app/dist/tts_controller.js @@ -57,6 +57,76 @@ function normalizePolicy(value) { return value === 'interrupt' ? 
'interrupt' : 'replace'; } +// Split a long utterance into ordered, sentence-bounded chunks so each +// synthesized WAV stays small enough for memory-constrained sinks (the +// AtomS3R firmware drops a single oversized base64/HTTP WAV while the +// mouth keeps animating from the independent tts_mouth stream). Short +// text (<= maxChars) is returned verbatim as a single chunk so existing +// single-utterance behavior is unchanged. +export function segmentTtsText(text, maxChars = 120) { + const source = typeof text === 'string' ? text : ''; + const limit = Number.isInteger(maxChars) && maxChars > 0 ? maxChars : 120; + if (source.length <= limit) { + return source.length > 0 ? [source] : []; + } + + // Hard sentence boundaries keep their terminator; newlines also split. + const hardBoundary = /[^。..!?!?…\n]*(?:[。..!?!?…]+|\n+|$)/gu; + const sentences = []; + let match; + while ((match = hardBoundary.exec(source)) !== null) { + if (match.index === hardBoundary.lastIndex) { + hardBoundary.lastIndex += 1; + } + if (match[0] && match[0].trim() !== '') { + sentences.push(match[0]); + } + if (hardBoundary.lastIndex >= source.length) { + break; + } + } + if (sentences.length === 0) { + sentences.push(source); + } + + // Soft-split any sentence still longer than the limit, preferring a + // late comma/semicolon boundary, then a hard cut as a last resort. + const units = []; + for (const sentence of sentences) { + let rest = sentence; + while (rest.length > limit) { + const window = rest.slice(0, limit); + const soft = window.match(/[、,,;;](?=[^、,,;;]*$)/u); + const cut = soft && soft.index >= Math.floor(limit * 0.4) ? soft.index + 1 : limit; + units.push(rest.slice(0, cut)); + rest = rest.slice(cut); + } + if (rest.trim() !== '') { + units.push(rest); + } + } + + // Greedily pack consecutive units so we do not emit one chunk per tiny + // sentence. 
+ const chunks = []; + let buffer = ''; + for (const unit of units) { + if (buffer === '') { + buffer = unit; + } else if ((buffer + unit).length <= limit) { + buffer += unit; + } else { + chunks.push(buffer.trim()); + buffer = unit; + } + } + if (buffer.trim() !== '') { + chunks.push(buffer.trim()); + } + + return chunks.filter((chunk) => chunk !== ''); +} + function normalizeAudioTarget(value) { if (typeof value !== 'string') { return 'local'; @@ -256,6 +326,10 @@ export function createTtsController(options = {}) { Number.isInteger(options.autoInterruptAfterMs) && options.autoInterruptAfterMs >= 0 ? options.autoInterruptAfterMs : null; + const maxChunkChars = + Number.isInteger(options.maxChunkChars) && options.maxChunkChars > 0 + ? options.maxChunkChars + : 120; const gate = options.gate ?? createSayGate(options.gateConfig ?? {}); const worker = options.worker ?? createStdioWorkerClient({ @@ -274,7 +348,19 @@ export function createTtsController(options = {}) { let active = null; let activeQueuedAt = null; let activePlayStartedAt = null; - let pending = null; + // FIFO of ordered chunks belonging to the current logical utterance. + // A newer accepted say flushes this and replaces it. 
+ let queue = []; + + function clearQueue() { + queue = []; + } + + function enqueueEntries(entries) { + for (const entry of entries) { + queue.push(entry); + } + } function emitState(sessionId, utteranceId, phase, extra = {}) { const payload = { @@ -349,6 +435,27 @@ export function createTtsController(options = {}) { }; } + function makeChildEntry(parent, text, index, count) { + if (count <= 1) { + return parent; + } + const speaker = selectQwenSpeakerForText(text, { + engine: workerEngine, + defaultVoice: workerVoice, + boundarySpeaker: qwenBoundarySpeaker + }); + const suffix = `#${index + 1}/${count}`; + return { + ...parent, + text, + speaker, + utteranceId: `${parent.utteranceId}${suffix}`, + messageId: `${parent.messageId}${suffix}`, + chunkIndex: index, + chunkCount: count + }; + } + function sendWorker(payload) { const ok = worker.send(payload); if (!ok) { @@ -446,12 +553,11 @@ export function createTtsController(options = {}) { } function maybeStartPending() { - if (active || !pending) { + if (active || queue.length === 0) { return; } - const next = pending; - pending = null; + const next = queue.shift(); dispatchSpeak(next, 'dequeued'); } @@ -663,7 +769,7 @@ export function createTtsController(options = {}) { activeQueuedAt = null; activePlayStartedAt = null; } - pending = null; + clearQueue(); emitState('-', null, 'worker_unavailable', { reason: `exit:${info.code ?? 'null'}:${info.signal ?? 'none'}` @@ -734,40 +840,54 @@ export function createTtsController(options = {}) { generation = entry.generation; + // Split long utterances so each synthesized WAV stays small; short + // text yields a single chunk and the original code path. + const segments = segmentTtsText(entry.text, maxChunkChars); + const children = + segments.length > 1 + ? 
segments.map((seg, index) => makeChildEntry(entry, seg, index, segments.length)) + : [entry]; + const head = children[0]; + const tail = children.slice(1); + const forceInterrupt = entry.policy === 'interrupt' || entry.priority >= 3; const autoInterrupt = shouldPromoteToAutoInterrupt(entry, acceptedAt); if (forceInterrupt || autoInterrupt) { - pending = null; + clearQueue(); if (active) { interruptActive(autoInterrupt ? 'auto_interrupt' : 'superseded', entry.generation); } - return dispatchSpeak(entry, autoInterrupt ? 'auto_interrupt' : 'interrupt'); + enqueueEntries(tail); + return dispatchSpeak(head, autoInterrupt ? 'auto_interrupt' : 'interrupt'); } if (active) { - pending = entry; - emitState(entry.sessionId, entry.utteranceId, 'queued', { - ...(entry.agentId ? { agent_id: entry.agentId } : {}), - ...(entry.agentLabel ? { agent_label: entry.agentLabel } : {}), + // A newer utterance supersedes the previous one's queued remainder. + clearQueue(); + enqueueEntries(children); + emitState(head.sessionId, head.utteranceId, 'queued', { + ...(head.agentId ? { agent_id: head.agentId } : {}), + ...(head.agentLabel ? 
{ agent_label: head.agentLabel } : {}), reason: 'pending_replace', - generation: entry.generation, - message_id: entry.messageId, - revision: entry.revision + generation: head.generation, + message_id: head.messageId, + revision: head.revision }); return { accepted: true, spoken: true, - generation: entry.generation, + generation: head.generation, queued: true, - message_id: entry.messageId, - revision: entry.revision, + message_id: head.messageId, + revision: head.revision, reason: null }; } - return dispatchSpeak(entry, 'immediate'); + enqueueEntries(tail); + return dispatchSpeak(head, 'immediate'); } async function interruptCurrent(reason = 'manual_interrupt') { @@ -783,7 +903,7 @@ export function createTtsController(options = {}) { } stopped = true; - pending = null; + clearQueue(); if (active) { emitMouth(active.sessionId, active.utteranceId, 0, active.generation, active.messageId, active.revision, { @@ -810,7 +930,8 @@ export function createTtsController(options = {}) { workerReady, generation, activeGeneration: active?.generation ?? null, - pendingGeneration: pending?.generation ?? null + pendingGeneration: queue[0]?.generation ?? 
null, + queuedChunks: queue.length }; } }; diff --git a/test/face-app/tts_controller.test.mjs b/test/face-app/tts_controller.test.mjs index 4448c86..2430e7c 100644 --- a/test/face-app/tts_controller.test.mjs +++ b/test/face-app/tts_controller.test.mjs @@ -1,6 +1,6 @@ import assert from 'node:assert/strict'; import test from 'node:test'; -import { createTtsController } from '../../face-app/dist/tts_controller.js'; +import { createTtsController, segmentTtsText } from '../../face-app/dist/tts_controller.js'; class FakeWorker { constructor() { @@ -729,3 +729,114 @@ test('tts controller drops punctuation-only utterance after normalization', asyn assert.equal(result.reason, 'invalid_payload'); assert.equal(speaks(worker).length, 0); }); + +// --- Step 1: long-utterance sentence chunking + sequential FIFO --- + +test('segmentTtsText returns short text verbatim as a single chunk', () => { + assert.deepEqual(segmentTtsText('こんにちは。ありがとう。', 120), ['こんにちは。ありがとう。']); + assert.deepEqual(segmentTtsText('', 120), []); +}); + +test('segmentTtsText splits long text on sentence boundaries within the limit', () => { + const text = '一つ目の文です。二つ目の文です。三つ目の文です。'; + const chunks = segmentTtsText(text, 8); + assert.deepEqual(chunks, ['一つ目の文です。', '二つ目の文です。', '三つ目の文です。']); + for (const chunk of chunks) { + assert.ok(chunk.length <= 8, `chunk too long: ${chunk}`); + } + assert.equal(chunks.join(''), text); +}); + +test('segmentTtsText soft-splits an oversized single sentence on commas', () => { + const text = 'あ、'.repeat(40); // 80 chars, no hard boundary + const chunks = segmentTtsText(text, 20); + assert.ok(chunks.length > 1); + for (const chunk of chunks) { + assert.ok(chunk.length <= 20, `chunk too long: ${chunk}`); + } +}); + +function makeController(options = {}) { + const worker = new FakeWorker(); + const controller = createTtsController({ + worker, + now: () => 42_000, + gate: { check: () => ({ allow: true }) }, + broadcast: () => true, + log: { info: () => {}, warn: () => {}, error: 
() => {} }, + ...options + }); + worker.emit('message', { type: 'ready', voice: 'af_heart', engine: 'kokoro' }); + return { worker, controller }; +} + +function finishActive(worker, generation) { + worker.emit('message', { type: 'event', phase: 'play_stop', generation }); +} + +test('tts controller dispatches long-utterance chunks sequentially in order', async () => { + const { worker, controller } = makeController({ maxChunkChars: 8 }); + + await controller.handleSayPayload({ + type: 'say', + session_id: 's1', + utterance_id: 'u1', + priority: 2, + policy: 'replace', + ttl_ms: 60_000, + ts: 42_000, + text: '一つ目の文です。二つ目の文です。三つ目の文です。' + }); + + // Only the first chunk is sent to the worker until it finishes. + assert.equal(speaks(worker).length, 1); + assert.equal(speaks(worker)[0].text, '一つ目の文です。'); + + finishActive(worker, 1); + assert.equal(speaks(worker).length, 2); + assert.equal(speaks(worker)[1].text, '二つ目の文です。'); + + finishActive(worker, 1); + assert.equal(speaks(worker).length, 3); + assert.equal(speaks(worker)[2].text, '三つ目の文です。'); + + // Draining the queue does not resend anything. + finishActive(worker, 1); + assert.equal(speaks(worker).length, 3); +}); + +test('tts controller flushes queued chunks when an interrupt utterance arrives', async () => { + const { worker, controller } = makeController({ maxChunkChars: 8 }); + + await controller.handleSayPayload({ + type: 'say', + session_id: 's1', + utterance_id: 'u1', + priority: 2, + policy: 'replace', + ttl_ms: 60_000, + ts: 42_000, + text: '一つ目の文です。二つ目の文です。三つ目の文です。' + }); + assert.equal(speaks(worker).length, 1); + + await controller.handleSayPayload({ + type: 'say', + session_id: 's1', + utterance_id: 'u2', + priority: 3, + policy: 'interrupt', + ttl_ms: 60_000, + ts: 42_000, + text: '緊急。' + }); + + assert.equal(interrupts(worker).length, 1); + assert.equal(speaks(worker).length, 2); + assert.equal(speaks(worker)[1].text, '緊急。'); + + // The superseded utterance's queued chunks must not replay. 
+ finishActive(worker, 2); + finishActive(worker, 2); + assert.equal(speaks(worker).length, 2); +}); From 882889ece27ebf2041ac46f3242e501bdb8c09e8 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 10:09:11 +0900 Subject: [PATCH 5/9] face-app: operator PTT barge-in flushes the TTS chunk queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the operator takes the turn, queued/active agent speech should stop instead of talking over the just-spoken input. - tts_controller: add flushForBargeIn(reason) — clear the chunk FIFO, interrupt + release the active chunk, emit play_stop, advance the generation so late worker audio/mouth for the old utterance is ignored, and clear the audio store so a memory-constrained sink cannot pull a stale chunk. - operator_asr_proxy: new onBargeIn option, invoked as soon as a POST /api/operator/asr audio upload arrives — the earliest cross-transport "user took the turn" signal (the Atom posts here too; it has no usable WebSocket client). Handler errors are caught so ASR still proceeds. - index.js: wire onBargeIn -> ttsController.flushForBargeIn. - Tests: controller flush behavior + audio-store clear; proxy onBargeIn invocation, non-ASR negative, and throw-safety. Full node --test green (338). Step 2 of 3 (Step 3: AtomS3R firmware playback queue + stop-on-PTT). 
Co-Authored-By: Claude Opus 4.7 --- face-app/dist/index.js | 8 +++ face-app/dist/operator_asr_proxy.js | 13 +++++ face-app/dist/tts_controller.js | 31 ++++++++++ test/face-app/operator_asr_proxy.test.mjs | 69 +++++++++++++++++++++++ test/face-app/tts_controller.test.mjs | 60 ++++++++++++++++++++ 5 files changed, 181 insertions(+) diff --git a/face-app/dist/index.js b/face-app/dist/index.js index f4d9739..b2357a1 100644 --- a/face-app/dist/index.js +++ b/face-app/dist/index.js @@ -264,6 +264,14 @@ const operatorAsrProxy = createOperatorAsrProxy({ modelJa: process.env.MH_OPERATOR_ASR_MODEL_JA ?? '', modelEn: process.env.MH_OPERATOR_ASR_MODEL_EN ?? '', requestTimeoutMs: Number.isNaN(operatorAsrTimeoutMs) ? 20_000 : operatorAsrTimeoutMs, + onBargeIn: (reason) => { + // Lazily resolved: ttsController is created after this proxy. + if (ttsController && typeof ttsController.flushForBargeIn === 'function') { + Promise.resolve(ttsController.flushForBargeIn(reason)).catch((error) => { + console.error(`[face-app] tts barge-in flush failed: ${error.message}`); + }); + } + }, log: console }); let operatorRealtimeAsrProxy = null; diff --git a/face-app/dist/operator_asr_proxy.js b/face-app/dist/operator_asr_proxy.js index 10b73d0..be79815 100644 --- a/face-app/dist/operator_asr_proxy.js +++ b/face-app/dist/operator_asr_proxy.js @@ -129,6 +129,7 @@ export function createOperatorAsrProxy(options = {}) { const modelEn = asNonEmptyString(options.modelEn); const modelJa = asNonEmptyString(options.modelJa); const fetchImpl = typeof options.fetchImpl === 'function' ? options.fetchImpl : globalThis.fetch; + const onBargeIn = typeof options.onBargeIn === 'function' ? 
options.onBargeIn : null; if (typeof fetchImpl !== 'function') { throw new Error('fetch API is unavailable for operator ASR proxy'); @@ -150,6 +151,18 @@ export function createOperatorAsrProxy(options = {}) { return true; } + // The user has taken the turn (PTT released, audio incoming): + // stop any in-flight/queued agent speech so it does not talk over + // the transcript that is about to be delivered. Cross-transport + // (Atom posts here too; it has no usable WebSocket client). + if (onBargeIn) { + try { + onBargeIn('operator_ptt'); + } catch (error) { + log.error(`[face-app] operator ASR barge-in handler failed: ${error.message}`); + } + } + const requestedLanguage = normalizeLanguage( parsedUrl.searchParams.get('lang') ?? parsedUrl.searchParams.get('languageHint') ?? 'en', 'en' diff --git a/face-app/dist/tts_controller.js b/face-app/dist/tts_controller.js index bc59a0a..abef4bd 100644 --- a/face-app/dist/tts_controller.js +++ b/face-app/dist/tts_controller.js @@ -897,6 +897,36 @@ export function createTtsController(options = {}) { interruptActive(reason, generation); } + // Barge-in: the operator took the turn (PTT). Drop every queued chunk, + // stop the active chunk, advance the generation so any late worker + // audio/mouth for the old utterance is ignored, and clear stored audio + // refs so a memory-constrained sink cannot pull a stale chunk. + async function flushForBargeIn(reason = 'operator_ptt') { + clearQueue(); + if (active) { + interruptActive(reason, generation); + emitMouth(active.sessionId, active.utteranceId, 0, active.generation, active.messageId, active.revision, { + agent_id: active.agentId, + agent_label: active.agentLabel + }); + emitState(active.sessionId, active.utteranceId, 'play_stop', { + ...(active.agentId ? { agent_id: active.agentId } : {}), + ...(active.agentLabel ? 
{ agent_label: active.agentLabel } : {}), + reason, + generation: active.generation, + message_id: active.messageId, + revision: active.revision + }); + active = null; + activeQueuedAt = null; + activePlayStartedAt = null; + } + generation += 1; + if (audioStore && typeof audioStore.clear === 'function') { + audioStore.clear(); + } + } + async function stop() { if (stopped) { return; @@ -924,6 +954,7 @@ export function createTtsController(options = {}) { return { handleSayPayload, interruptCurrent, + flushForBargeIn, stop, snapshot() { return { diff --git a/test/face-app/operator_asr_proxy.test.mjs b/test/face-app/operator_asr_proxy.test.mjs index 216216c..8d5469b 100644 --- a/test/face-app/operator_asr_proxy.test.mjs +++ b/test/face-app/operator_asr_proxy.test.mjs @@ -108,3 +108,72 @@ test('operator ASR proxy returns 503 when upstream is not configured', async () const body = JSON.parse(result.body); assert.equal(body.error, 'asr_upstream_not_configured'); }); + +test('operator ASR proxy invokes onBargeIn for a POST audio upload', async () => { + const bargeIns = []; + const proxy = createOperatorAsrProxy({ + baseUrl: 'http://127.0.0.1:8091', + onBargeIn: (reason) => bargeIns.push(reason), + fetchImpl: async () => ({ + ok: true, + status: 200, + async text() { + return JSON.stringify({ text: 'hi', language: 'ja', confidence: 0.9 }); + } + }) + }); + + const request = createMockRequest({ + method: 'POST', + url: '/api/operator/asr?lang=ja', + headers: { 'content-type': 'audio/webm' }, + body: Buffer.from('sample-audio') + }); + const response = createMockResponse(); + + await proxy.handleHttpRequest(request, response); + assert.deepEqual(bargeIns, ['operator_ptt']); +}); + +test('operator ASR proxy does not invoke onBargeIn for non-ASR paths', async () => { + const bargeIns = []; + const proxy = createOperatorAsrProxy({ + baseUrl: 'http://127.0.0.1:8091', + onBargeIn: (reason) => bargeIns.push(reason), + fetchImpl: async () => ({ ok: true, status: 200, async text() 
{ return '{}'; } }) + }); + + const request = createMockRequest({ method: 'POST', url: '/api/other', headers: {}, body: '' }); + const response = createMockResponse(); + + const handled = await proxy.handleHttpRequest(request, response); + assert.equal(handled, false); + assert.deepEqual(bargeIns, []); +}); + +test('operator ASR proxy still responds if onBargeIn throws', async () => { + const proxy = createOperatorAsrProxy({ + baseUrl: 'http://127.0.0.1:8091', + onBargeIn: () => { throw new Error('boom'); }, + log: { info: () => {}, warn: () => {}, error: () => {} }, + fetchImpl: async () => ({ + ok: true, + status: 200, + async text() { + return JSON.stringify({ text: 'ok', language: 'ja', confidence: 0.8 }); + } + }) + }); + + const request = createMockRequest({ + method: 'POST', + url: '/api/operator/asr?lang=ja', + headers: { 'content-type': 'audio/webm' }, + body: Buffer.from('audio') + }); + const response = createMockResponse(); + + const handled = await proxy.handleHttpRequest(request, response); + assert.equal(handled, true); + assert.equal(response.result().statusCode, 200); +}); diff --git a/test/face-app/tts_controller.test.mjs b/test/face-app/tts_controller.test.mjs index 2430e7c..d8dcb9c 100644 --- a/test/face-app/tts_controller.test.mjs +++ b/test/face-app/tts_controller.test.mjs @@ -840,3 +840,63 @@ test('tts controller flushes queued chunks when an interrupt utterance arrives', finishActive(worker, 2); assert.equal(speaks(worker).length, 2); }); + +// --- Step 2: operator PTT barge-in flushes the queue --- + +test('tts controller flushForBargeIn drops active and queued chunks', async () => { + const { worker, controller } = makeController({ maxChunkChars: 8 }); + + await controller.handleSayPayload({ + type: 'say', + session_id: 's1', + utterance_id: 'u1', + priority: 2, + policy: 'replace', + ttl_ms: 60_000, + ts: 42_000, + text: '一つ目の文です。二つ目の文です。三つ目の文です。' + }); + assert.equal(speaks(worker).length, 1); + 
assert.equal(controller.snapshot().queuedChunks, 2); + + await controller.flushForBargeIn('operator_ptt'); + + assert.equal(interrupts(worker).length, 1); + assert.equal(controller.snapshot().activeGeneration, null); + assert.equal(controller.snapshot().queuedChunks, 0); + + // Stale worker completion must not resurrect queued chunks. + finishActive(worker, 1); + assert.equal(speaks(worker).length, 1); + + // A new utterance after barge-in uses a fresh, higher generation. + await controller.handleSayPayload({ + type: 'say', + session_id: 's1', + utterance_id: 'u2', + priority: 2, + policy: 'replace', + ttl_ms: 60_000, + ts: 42_000, + text: '再開します。' + }); + assert.equal(speaks(worker).length, 2); + assert.ok(speaks(worker)[1].generation > speaks(worker)[0].generation); +}); + +test('tts controller flushForBargeIn clears the audio store', async () => { + const worker = new FakeWorker(); + let cleared = 0; + const controller = createTtsController({ + worker, + now: () => 42_000, + gate: { check: () => ({ allow: true }) }, + broadcast: () => true, + log: { info: () => {}, warn: () => {}, error: () => {} }, + audioStore: { clear: () => { cleared += 1; } } + }); + worker.emit('message', { type: 'ready', voice: 'af_heart', engine: 'kokoro' }); + + await controller.flushForBargeIn('operator_ptt'); + assert.equal(cleared, 1); +}); From 2579dcc3013f1678c08efe0d1e38a8c5318390d4 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 10:14:06 +0900 Subject: [PATCH 6/9] AtomS3R: first-pass button push-to-talk + mic capture + ASR submit Preservation commit of codex's in-flight, build-passing, hardware-validated PTT firmware (Milestone 6 in PLANS_48): button hold-to-record, M5.Mic capture via the Atomic Echo Base, 16 kHz mono WAV wrapping, POST to /api/operator/asr?lang=, and operator_response submission over the authenticated HTTP fallback. 
Includes persisted asrLanguage (ja/en) with setup-portal selection, HeadroomTransport::sendOperatorText(), the HeadroomPtt record/process/submit module, longer ASR HTTP read timeout, and ingress/settings/config wiring. `pio run` succeeds (RAM 15.6%, Flash 36.2%); transcript verified on real hardware per PLANS_48. Committed by Claude to preserve the work while codex is paused; no behavioral changes were made to codex's firmware in this commit. Co-Authored-By: Codex (OpenAI) Co-Authored-By: Claude Opus 4.7 --- firmware/atoms3r-headroom/README.md | 51 ++- .../include/headroom_config.example.h | 1 + .../include/headroom_config.h | 4 + .../atoms3r-headroom/src/headroom_audio.cpp | 10 + .../atoms3r-headroom/src/headroom_audio.h | 2 + .../src/headroom_ingress_server.cpp | 15 +- .../src/headroom_ingress_server.h | 3 + .../atoms3r-headroom/src/headroom_ptt.cpp | 355 ++++++++++++++++++ firmware/atoms3r-headroom/src/headroom_ptt.h | 56 +++ .../src/headroom_settings.cpp | 20 + .../atoms3r-headroom/src/headroom_settings.h | 2 + .../src/headroom_setup_portal.cpp | 9 + .../src/headroom_transport.cpp | 28 ++ .../atoms3r-headroom/src/headroom_transport.h | 2 + firmware/atoms3r-headroom/src/main.cpp | 10 +- 15 files changed, 559 insertions(+), 9 deletions(-) create mode 100644 firmware/atoms3r-headroom/src/headroom_ptt.cpp create mode 100644 firmware/atoms3r-headroom/src/headroom_ptt.h diff --git a/firmware/atoms3r-headroom/README.md b/firmware/atoms3r-headroom/README.md index 562a80a..aff2b79 100644 --- a/firmware/atoms3r-headroom/README.md +++ b/firmware/atoms3r-headroom/README.md @@ -4,8 +4,10 @@ This PlatformIO project is the AtomS3R hardware frontend for minimum-headroom. Milestone 1 initializes the M5Stack AtomS3R display, draws a 128x128 parametric face, and cycles expressions with the Atom button. Milestone 2 adds saved -settings and a setup access point. WebSocket, TTS, microphone, and operator -bridge connection are still later milestones. 
+settings and a setup access point. WebSocket mirroring and TTS playback are +implemented. The firmware also includes first-pass button PTT recording: hold +the Atom button while connected to Wi-Fi, speak, and release to send the +recorded WAV through `face-app` operator ASR. ## Build @@ -23,6 +25,12 @@ pio run -t upload pio device monitor ``` +On the current AtomS3R hardware, flashing may require the esptool no-stub path: + +```bash +PLATFORMIO_UPLOAD_FLAGS=--no-stub pio run -t upload --upload-port /dev/ttyACM0 +``` + Expected serial output: ```text @@ -43,8 +51,8 @@ http://192.168.4.1/ ``` The setup page saves Wi-Fi, face app URLs, auth token, device id, display -priority agent id, input target agent id, face rotation, placement pose, and -upper-side orientation to ESP32 NVS/Preferences. +priority agent id, input target agent id, ASR language, face rotation, placement +pose, and upper-side orientation to ESP32 NVS/Preferences. When Wi-Fi connects successfully, the firmware opens the configured WebSocket URL and mirrors these minimum-headroom payloads: @@ -54,6 +62,36 @@ URL and mirrors these minimum-headroom payloads: - `tts_state`: shows queued/speaking/error/idle state. - `tts_mouth`: drives mouth openness from the payload's `open` value. +When Wi-Fi is connected, the Atom button is used for push-to-talk instead of the +offline expression demo. Hold the button to record up to 8 seconds of 16 kHz mono +PCM from the Atomic Echo Base microphone. On release, the firmware wraps the clip +as `audio/wav`, posts it to: + +```text +/api/operator/asr?lang= +``` + +If ASR returns non-empty text, the Atom sends an `operator_response` websocket +payload with `source: "atom"` and `response_kind: "text"`. If the Atom-to-PC +WebSocket is unavailable, it falls back to authenticated HTTP: + +```text +/api/operator/response +``` + +Recording and speaker playback are serialized because the Atomic Echo Base uses +one ES8311 codec for both mic and speaker. 
+ +The normal-mode health endpoint is useful for desk debugging: + +```text +http://<atom-ip>/health +``` + +It reports the configured face HTTP/WS URLs, ASR language, auth presence, and +whether the Atom-originated WebSocket is connected. The auth token value is not +returned. + If `MH_FACE_AUTH_TOKEN` is enabled on the PC, set the same token in the setup page. The firmware appends it as `auth_token` on the WebSocket URL for the same-LAN first implementation. @@ -68,5 +106,6 @@ default and must not directly target helper panes unless explicitly configured. The checked-in `include/headroom_config.example.h` contains safe placeholders. For development-only defaults, create `include/headroom_config.local.h`; it is -ignored by git. Later milestones will load saved settings from NVS and expose an -Atom-hosted setup portal for Wi-Fi, server URL, auth token, and orientation. +ignored by git. Runtime settings are loaded from NVS/Preferences when present, +and the Atom-hosted setup portal can update Wi-Fi, server URL, auth token, ASR +language, and orientation without reflashing. 
diff --git a/firmware/atoms3r-headroom/include/headroom_config.example.h b/firmware/atoms3r-headroom/include/headroom_config.example.h index af583ed..dfd6ad6 100644 --- a/firmware/atoms3r-headroom/include/headroom_config.example.h +++ b/firmware/atoms3r-headroom/include/headroom_config.example.h @@ -8,6 +8,7 @@ #define HEADROOM_DEVICE_ID "atom-headroom-1" #define HEADROOM_DISPLAY_AGENT_ID "__operator__" #define HEADROOM_INPUT_TARGET_AGENT_ID "__operator__" +#define HEADROOM_ASR_LANGUAGE "ja" #define HEADROOM_MAX_BASE64_TTS_SECONDS 10 #define HEADROOM_MAX_HTTP_TTS_BYTES 1200000 diff --git a/firmware/atoms3r-headroom/include/headroom_config.h b/firmware/atoms3r-headroom/include/headroom_config.h index 095177d..b0252de 100644 --- a/firmware/atoms3r-headroom/include/headroom_config.h +++ b/firmware/atoms3r-headroom/include/headroom_config.h @@ -5,3 +5,7 @@ #else #include "headroom_config.example.h" #endif + +#ifndef HEADROOM_ASR_LANGUAGE +#define HEADROOM_ASR_LANGUAGE "ja" +#endif diff --git a/firmware/atoms3r-headroom/src/headroom_audio.cpp b/firmware/atoms3r-headroom/src/headroom_audio.cpp index 5d4d2a4..366f620 100644 --- a/firmware/atoms3r-headroom/src/headroom_audio.cpp +++ b/firmware/atoms3r-headroom/src/headroom_audio.cpp @@ -43,6 +43,16 @@ void HeadroomAudio::stop() { releaseActive(); } +void HeadroomAudio::stopForRecording() { + stop(); + M5.Speaker.end(); +} + +void HeadroomAudio::restoreAfterRecording() { + M5.Speaker.setVolume(130); + M5.Speaker.begin(); +} + bool HeadroomAudio::busy() const { return M5.Speaker.isPlaying(); } diff --git a/firmware/atoms3r-headroom/src/headroom_audio.h b/firmware/atoms3r-headroom/src/headroom_audio.h index ffb983e..9927505 100644 --- a/firmware/atoms3r-headroom/src/headroom_audio.h +++ b/firmware/atoms3r-headroom/src/headroom_audio.h @@ -19,6 +19,8 @@ class HeadroomAudio { void begin(const HeadroomSettingsData& settings); void loop(); void stop(); + void stopForRecording(); + void restoreAfterRecording(); bool busy() const; 
HeadroomAudioResult playBase64Wav(const char* audioBase64, size_t base64Length, int sampleRateHint); diff --git a/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp b/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp index 427fc72..6e98342 100644 --- a/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp +++ b/firmware/atoms3r-headroom/src/headroom_ingress_server.cpp @@ -73,8 +73,11 @@ void HeadroomIngressServer::begin(const HeadroomSettingsData& settings, Headroom transport_ = &transport; audio_ = &audio; faceState_ = &faceState; + faceHttpBase_ = settings.faceHttpBase; + faceWsUrl_ = settings.faceWsUrl; authToken_ = settings.authToken; deviceId_ = settings.deviceId; + asrLanguage_ = settings.asrLanguage; maxPayloadBytes_ = estimatePayloadLimit(settings); const char* headerKeys[] = {"Authorization", "X-Headroom-Auth"}; @@ -112,7 +115,17 @@ void HeadroomIngressServer::handleHealth() { body += jsonEscape(deviceId_); body += F("\",\"ip\":\""); body += WiFi.localIP().toString(); - body += F("\",\"ingress\":true}"); + body += F("\",\"ingress\":true,\"ws_connected\":"); + body += transport_ && transport_->connected() ? F("true") : F("false"); + body += F(",\"face_http_base\":\""); + body += jsonEscape(faceHttpBase_); + body += F("\",\"face_ws_url\":\""); + body += jsonEscape(faceWsUrl_); + body += F("\",\"auth_configured\":"); + body += authToken_.length() > 0 ? 
F("true") : F("false"); + body += F(",\"asr_language\":\""); + body += jsonEscape(asrLanguage_); + body += F("\"}"); sendJson(200, body); } diff --git a/firmware/atoms3r-headroom/src/headroom_ingress_server.h b/firmware/atoms3r-headroom/src/headroom_ingress_server.h index 87fdb5e..5982ada 100644 --- a/firmware/atoms3r-headroom/src/headroom_ingress_server.h +++ b/firmware/atoms3r-headroom/src/headroom_ingress_server.h @@ -20,8 +20,11 @@ class HeadroomIngressServer { HeadroomTransport* transport_ = nullptr; HeadroomAudio* audio_ = nullptr; HeadroomFaceState* faceState_ = nullptr; + String faceHttpBase_; + String faceWsUrl_; String authToken_; String deviceId_; + String asrLanguage_; bool active_ = false; size_t maxPayloadBytes_ = 720000; uint32_t lastPayloadMs_ = 0; diff --git a/firmware/atoms3r-headroom/src/headroom_ptt.cpp b/firmware/atoms3r-headroom/src/headroom_ptt.cpp new file mode 100644 index 0000000..053265e --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_ptt.cpp @@ -0,0 +1,355 @@ +#include "headroom_ptt.h" + +#include +#include +#include +#include +#include + +namespace { + +void writeLe16(uint8_t* out, uint16_t value) { + out[0] = static_cast(value & 0xff); + out[1] = static_cast((value >> 8) & 0xff); +} + +void writeLe32(uint8_t* out, uint32_t value) { + out[0] = static_cast(value & 0xff); + out[1] = static_cast((value >> 8) & 0xff); + out[2] = static_cast((value >> 16) & 0xff); + out[3] = static_cast((value >> 24) & 0xff); +} + +bool isHttpUrl(const String& url) { + return url.startsWith("http://") || url.startsWith("https://"); +} + +constexpr uint16_t kAsrHttpTimeoutMs = 30000; +constexpr uint16_t kOperatorResponseHttpTimeoutMs = 8000; + +} // namespace + +void HeadroomPtt::begin( + const HeadroomSettingsData& settings, + HeadroomAudio& audio, + HeadroomTransport& transport, + HeadroomFaceState& faceState) { + audio_ = &audio; + transport_ = &transport; + faceState_ = &faceState; + httpBase_ = settings.faceHttpBase; + authToken_ = 
settings.authToken; + deviceId_ = settings.deviceId; + inputTargetAgentId_ = settings.inputTargetAgentId; + asrLanguage_ = HeadroomSettings::normalizeAsrLanguage(settings.asrLanguage); + Serial.printf("ptt ready asr_lang=%s mic_enabled=%s\n", asrLanguage_.c_str(), M5.Mic.isEnabled() ? "yes" : "no"); +} + +void HeadroomPtt::update() { + bool pressed = M5.BtnA.isPressed(); + + if (state_ == HeadroomPttState::Idle && pressed && !pressedLast_) { + startRecording(); + } + + if (state_ == HeadroomPttState::Recording) { + if (pressed && samplesRecorded_ < kMaxSamples) { + captureChunk(); + } + if (!pressed || samplesRecorded_ >= kMaxSamples) { + finishRecording(); + } + } + + if (state_ == HeadroomPttState::Error && millis() - stateSinceMs_ > 2500) { + setState(HeadroomPttState::Idle); + setFaceExpression(HeadroomExpression::Neutral); + } + + pressedLast_ = pressed; +} + +bool HeadroomPtt::recording() const { + return state_ == HeadroomPttState::Recording; +} + +HeadroomPttState HeadroomPtt::state() const { + return state_; +} + +bool HeadroomPtt::startRecording() { + if (!audio_ || !transport_ || !faceState_) { + return false; + } + if (audio_->busy()) { + audio_->stopForRecording(); + } else { + audio_->stopForRecording(); + } + + resetRecording(); + pcm_ = static_cast(ps_malloc(kMaxSamples * sizeof(int16_t))); + if (!pcm_) { + pcm_ = static_cast(malloc(kMaxSamples * sizeof(int16_t))); + } + if (!pcm_) { + Serial.println("ptt alloc failed"); + audio_->restoreAfterRecording(); + setState(HeadroomPttState::Error); + setFaceExpression(HeadroomExpression::Failed); + return false; + } + + if (!M5.Mic.begin()) { + Serial.println("M5.Mic.begin failed"); + resetRecording(); + audio_->restoreAfterRecording(); + setState(HeadroomPttState::Error); + setFaceExpression(HeadroomExpression::Failed); + return false; + } + + samplesRecorded_ = 0; + setState(HeadroomPttState::Recording); + setFaceExpression(HeadroomExpression::Listening, 0.08f); + Serial.println("ptt recording started"); 
+ return true; +} + +void HeadroomPtt::captureChunk() { + if (!pcm_ || samplesRecorded_ >= kMaxSamples) { + return; + } + size_t remaining = kMaxSamples - samplesRecorded_; + size_t chunkSamples = min(remaining, kChunkSamples); + int16_t* dest = pcm_ + samplesRecorded_; + if (!M5.Mic.record(dest, chunkSamples, kSampleRate, false)) { + Serial.println("M5.Mic.record failed"); + return; + } + uint32_t waitStarted = millis(); + while (!M5.Mic.isRecording() && millis() - waitStarted < 20) { + delay(1); + M5.update(); + } + while (M5.Mic.isRecording()) { + delay(1); + M5.update(); + } + samplesRecorded_ += chunkSamples; + if (faceState_) { + faceState_->mouthOpen = 0.10f + 0.20f * static_cast((samplesRecorded_ / kChunkSamples) % 3) / 2.0f; + } +} + +void HeadroomPtt::finishRecording() { + setState(HeadroomPttState::Processing); + setFaceExpression(HeadroomExpression::Thinking, 0.0f); + M5.Mic.end(); + audio_->restoreAfterRecording(); + + Serial.printf("ptt recording finished samples=%u\n", static_cast(samplesRecorded_)); + if (samplesRecorded_ < (kSampleRate / 4)) { + Serial.println("ptt recording too short"); + resetRecording(); + setState(HeadroomPttState::Error); + setFaceExpression(HeadroomExpression::Failed); + return; + } + + bool ok = postToAsrAndSubmit(); + resetRecording(); + if (ok) { + setState(HeadroomPttState::Idle); + setFaceExpression(HeadroomExpression::Success); + } else { + setState(HeadroomPttState::Error); + setFaceExpression(HeadroomExpression::Failed); + } +} + +void HeadroomPtt::resetRecording() { + if (pcm_) { + free(pcm_); + pcm_ = nullptr; + } + samplesRecorded_ = 0; +} + +bool HeadroomPtt::postToAsrAndSubmit() { + uint8_t* wav = nullptr; + size_t wavLength = 0; + if (!buildWav(&wav, &wavLength)) { + return false; + } + + String url = asrUrl(); + if (!isHttpUrl(url)) { + Serial.printf("invalid ASR URL: %s\n", url.c_str()); + free(wav); + return false; + } + + WiFiClient client; + HTTPClient http; + if (!http.begin(client, url)) { + free(wav); + 
return false; + } + http.setTimeout(kAsrHttpTimeoutMs); + http.addHeader("Content-Type", "audio/wav"); + if (authToken_.length() > 0) { + http.addHeader("Authorization", String("Bearer ") + authToken_); + } + + Serial.printf("posting ASR wav bytes=%u url=%s\n", static_cast(wavLength), url.c_str()); + int status = http.POST(wav, wavLength); + free(wav); + + if (status != HTTP_CODE_OK) { + String body = http.getString(); + Serial.printf("ASR POST failed status=%d body=%s\n", status, body.substring(0, 160).c_str()); + http.end(); + return false; + } + + String body = http.getString(); + http.end(); + + JsonDocument doc; + DeserializationError error = deserializeJson(doc, body); + if (error) { + Serial.printf("ASR JSON parse failed: %s\n", error.c_str()); + return false; + } + + const char* textValue = doc["text"] | ""; + String text(textValue); + text.trim(); + if (text.length() == 0) { + Serial.println("ASR returned empty text"); + return false; + } + + Serial.printf("ASR text: %s\n", text.c_str()); + return submitOperatorText(text); +} + +bool HeadroomPtt::submitOperatorText(const String& text) { + String trimmed = text; + trimmed.trim(); + if (trimmed.length() == 0) { + return false; + } + + if (transport_ && transport_->sendOperatorText(trimmed)) { + return true; + } + + String url = operatorResponseUrl(); + if (!isHttpUrl(url)) { + Serial.printf("invalid operator response URL: %s\n", url.c_str()); + return false; + } + + JsonDocument doc; + doc["v"] = 1; + doc["type"] = "operator_response"; + doc["session_id"] = deviceId_.length() > 0 ? 
deviceId_ : "atom-headroom"; + doc["request_id"] = nullptr; + doc["response_kind"] = "text"; + doc["value"] = trimmed; + doc["source"] = "atom"; + if (inputTargetAgentId_.length() > 0) { + doc["target_agent_id"] = inputTargetAgentId_; + } + doc["ts"] = millis(); + + String payload; + serializeJson(doc, payload); + + WiFiClient client; + HTTPClient http; + if (!http.begin(client, url)) { + return false; + } + http.setTimeout(kOperatorResponseHttpTimeoutMs); + http.addHeader("Content-Type", "application/json"); + if (authToken_.length() > 0) { + http.addHeader("Authorization", String("Bearer ") + authToken_); + } + + Serial.printf("posting operator_response bytes=%u url=%s\n", static_cast(payload.length()), url.c_str()); + int status = http.POST(const_cast(reinterpret_cast(payload.c_str())), payload.length()); + String body = http.getString(); + http.end(); + if (status != HTTP_CODE_OK && status != HTTP_CODE_ACCEPTED) { + Serial.printf("operator_response HTTP failed status=%d body=%s\n", status, body.substring(0, 160).c_str()); + return false; + } + Serial.printf("operator_response HTTP ok status=%d\n", status); + return true; +} + +bool HeadroomPtt::buildWav(uint8_t** outWav, size_t* outLength) const { + if (!pcm_ || samplesRecorded_ == 0 || !outWav || !outLength) { + return false; + } + size_t dataBytes = samplesRecorded_ * sizeof(int16_t); + size_t totalBytes = kWavHeaderBytes + dataBytes; + uint8_t* wav = static_cast(ps_malloc(totalBytes)); + if (!wav) { + wav = static_cast(malloc(totalBytes)); + } + if (!wav) { + return false; + } + + memcpy(wav, "RIFF", 4); + writeLe32(wav + 4, static_cast(totalBytes - 8)); + memcpy(wav + 8, "WAVE", 4); + memcpy(wav + 12, "fmt ", 4); + writeLe32(wav + 16, 16); + writeLe16(wav + 20, 1); + writeLe16(wav + 22, 1); + writeLe32(wav + 24, kSampleRate); + writeLe32(wav + 28, kSampleRate * 2); + writeLe16(wav + 32, 2); + writeLe16(wav + 34, 16); + memcpy(wav + 36, "data", 4); + writeLe32(wav + 40, static_cast(dataBytes)); + memcpy(wav 
+ kWavHeaderBytes, pcm_, dataBytes); + + *outWav = wav; + *outLength = totalBytes; + return true; +} + +String HeadroomPtt::asrUrl() const { + String base = httpBase_; + if (base.endsWith("/")) { + base.remove(base.length() - 1); + } + return base + "/api/operator/asr?lang=" + asrLanguage_; +} + +String HeadroomPtt::operatorResponseUrl() const { + String base = httpBase_; + if (base.endsWith("/")) { + base.remove(base.length() - 1); + } + return base + "/api/operator/response"; +} + +void HeadroomPtt::setState(HeadroomPttState next) { + state_ = next; + stateSinceMs_ = millis(); +} + +void HeadroomPtt::setFaceExpression(HeadroomExpression expression, float mouthOpen) { + if (!faceState_) { + return; + } + faceState_->expression = expression; + faceState_->mouthOpen = mouthOpen; +} diff --git a/firmware/atoms3r-headroom/src/headroom_ptt.h b/firmware/atoms3r-headroom/src/headroom_ptt.h new file mode 100644 index 0000000..67e7091 --- /dev/null +++ b/firmware/atoms3r-headroom/src/headroom_ptt.h @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "face_renderer.h" +#include "headroom_audio.h" +#include "headroom_settings.h" +#include "headroom_transport.h" + +enum class HeadroomPttState { + Idle, + Recording, + Processing, + Error, +}; + +class HeadroomPtt { +public: + void begin(const HeadroomSettingsData& settings, HeadroomAudio& audio, HeadroomTransport& transport, HeadroomFaceState& faceState); + void update(); + bool recording() const; + HeadroomPttState state() const; + +private: + static constexpr uint32_t kSampleRate = 16000; + static constexpr size_t kMaxSeconds = 8; + static constexpr size_t kMaxSamples = kSampleRate * kMaxSeconds; + static constexpr size_t kChunkSamples = 1024; + static constexpr size_t kWavHeaderBytes = 44; + + HeadroomAudio* audio_ = nullptr; + HeadroomTransport* transport_ = nullptr; + HeadroomFaceState* faceState_ = nullptr; + String httpBase_; + String authToken_; + String deviceId_; + String inputTargetAgentId_; + String asrLanguage_ 
= "ja"; + int16_t* pcm_ = nullptr; + size_t samplesRecorded_ = 0; + HeadroomPttState state_ = HeadroomPttState::Idle; + uint32_t stateSinceMs_ = 0; + bool pressedLast_ = false; + + bool startRecording(); + void captureChunk(); + void finishRecording(); + void resetRecording(); + bool postToAsrAndSubmit(); + bool submitOperatorText(const String& text); + bool buildWav(uint8_t** outWav, size_t* outLength) const; + String asrUrl() const; + String operatorResponseUrl() const; + void setState(HeadroomPttState next); + void setFaceExpression(HeadroomExpression expression, float mouthOpen = 0.0f); +}; diff --git a/firmware/atoms3r-headroom/src/headroom_settings.cpp b/firmware/atoms3r-headroom/src/headroom_settings.cpp index 76f7ae7..90bf8bd 100644 --- a/firmware/atoms3r-headroom/src/headroom_settings.cpp +++ b/firmware/atoms3r-headroom/src/headroom_settings.cpp @@ -45,6 +45,7 @@ bool HeadroomSettings::save(const HeadroomSettingsData& next) { HeadroomSettingsData normalized = next; normalized.faceRotationDegrees = normalizeRotation(normalized.faceRotationDegrees); normalized.upSideDegrees = normalizeRotation(normalized.upSideDegrees); + normalized.asrLanguage = normalizeAsrLanguage(normalized.asrLanguage); Preferences prefs; if (!prefs.begin(kNamespace, false)) { @@ -59,6 +60,7 @@ bool HeadroomSettings::save(const HeadroomSettingsData& next) { prefs.putString("device_id", normalized.deviceId); prefs.putString("display_id", normalized.displayAgentId); prefs.putString("input_id", normalized.inputTargetAgentId); + prefs.putString("asr_lang", normalized.asrLanguage); prefs.putInt("max_b64_sec", normalized.maxBase64TtsSeconds); prefs.putInt("max_http_b", normalized.maxHttpTtsBytes); prefs.putInt("rotation", normalized.faceRotationDegrees); @@ -104,6 +106,22 @@ HeadroomPlacementPose HeadroomSettings::parsePlacementPose(const String& value) return HeadroomPlacementPose::ScreenUp; } +String HeadroomSettings::normalizeAsrLanguage(const String& value, const String& fallback) { + 
String normalized = value; + normalized.trim(); + normalized.toLowerCase(); + if (normalized.startsWith("ja")) { + return "ja"; + } + if (normalized.startsWith("en")) { + return "en"; + } + String normalizedFallback = fallback; + normalizedFallback.trim(); + normalizedFallback.toLowerCase(); + return normalizedFallback.startsWith("en") ? "en" : "ja"; +} + const char* HeadroomSettings::placementPoseName(HeadroomPlacementPose pose) { switch (pose) { case HeadroomPlacementPose::SideUp: @@ -123,6 +141,7 @@ void HeadroomSettings::loadCompileDefaults() { data_.deviceId = HEADROOM_DEVICE_ID; data_.displayAgentId = HEADROOM_DISPLAY_AGENT_ID; data_.inputTargetAgentId = HEADROOM_INPUT_TARGET_AGENT_ID; + data_.asrLanguage = normalizeAsrLanguage(HEADROOM_ASR_LANGUAGE); data_.maxBase64TtsSeconds = HEADROOM_MAX_BASE64_TTS_SECONDS; data_.maxHttpTtsBytes = HEADROOM_MAX_HTTP_TTS_BYTES; data_.faceRotationDegrees = normalizeRotation(HEADROOM_FACE_ROTATION_DEGREES); @@ -161,6 +180,7 @@ void HeadroomSettings::loadNvsOverrides() { data_.deviceId = readString(prefs, "device_id", data_.deviceId); data_.displayAgentId = readString(prefs, "display_id", data_.displayAgentId); data_.inputTargetAgentId = readString(prefs, "input_id", data_.inputTargetAgentId); + data_.asrLanguage = normalizeAsrLanguage(readString(prefs, "asr_lang", data_.asrLanguage)); data_.maxBase64TtsSeconds = readInt(prefs, "max_b64_sec", data_.maxBase64TtsSeconds); data_.maxHttpTtsBytes = readInt(prefs, "max_http_b", data_.maxHttpTtsBytes); data_.faceRotationDegrees = normalizeRotation(readInt(prefs, "rotation", data_.faceRotationDegrees)); diff --git a/firmware/atoms3r-headroom/src/headroom_settings.h b/firmware/atoms3r-headroom/src/headroom_settings.h index fcbca28..a60786c 100644 --- a/firmware/atoms3r-headroom/src/headroom_settings.h +++ b/firmware/atoms3r-headroom/src/headroom_settings.h @@ -16,6 +16,7 @@ struct HeadroomSettingsData { String deviceId; String displayAgentId; String inputTargetAgentId; + String 
asrLanguage; int maxBase64TtsSeconds = 10; int maxHttpTtsBytes = 1200000; int faceRotationDegrees = 0; @@ -34,6 +35,7 @@ class HeadroomSettings { static bool isValidRotation(int degrees); static int normalizeRotation(int degrees); + static String normalizeAsrLanguage(const String& value, const String& fallback = "ja"); static HeadroomPlacementPose parsePlacementPose(const String& value); static const char* placementPoseName(HeadroomPlacementPose pose); diff --git a/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp b/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp index f01b0ad..076392b 100644 --- a/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp +++ b/firmware/atoms3r-headroom/src/headroom_setup_portal.cpp @@ -150,6 +150,14 @@ String HeadroomSetupPortal::renderPage(const String& message) { html += F(""); + html += F(""); html += F("
--- .../atoms3r-headroom/src/headroom_audio.cpp | 60 +++++++++++++++++-- .../atoms3r-headroom/src/headroom_audio.h | 15 +++++ .../src/headroom_transport.cpp | 5 ++ 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/firmware/atoms3r-headroom/src/headroom_audio.cpp b/firmware/atoms3r-headroom/src/headroom_audio.cpp index 366f620..9e757ce 100644 --- a/firmware/atoms3r-headroom/src/headroom_audio.cpp +++ b/firmware/atoms3r-headroom/src/headroom_audio.cpp @@ -36,10 +36,12 @@ void HeadroomAudio::begin(const HeadroomSettingsData& settings) { void HeadroomAudio::loop() { releaseActive(); + startNextIfIdle(); } void HeadroomAudio::stop() { M5.Speaker.stop(); + clearQueue(); releaseActive(); } @@ -54,7 +56,9 @@ void HeadroomAudio::restoreAfterRecording() { } bool HeadroomAudio::busy() const { - return M5.Speaker.isPlaying(); + // Stay "busy" across the gap between queued chunks so the face holds + // the Speaking expression instead of flickering to Neutral mid-reply. + return M5.Speaker.isPlaying() || queueCount_ > 0; } HeadroomAudioResult HeadroomAudio::playBase64Wav(const char* audioBase64, size_t base64Length, int sampleRateHint) { @@ -105,7 +109,7 @@ HeadroomAudioResult HeadroomAudio::playBase64Wav(const char* audioBase64, size_t return HeadroomAudioResult::TooLarge; } - return playOwnedWav(wav, decodedLength, true); + return playOrEnqueue(wav, decodedLength); } HeadroomAudioResult HeadroomAudio::playHttpWavRef(const String& url) { @@ -178,7 +182,7 @@ HeadroomAudioResult HeadroomAudio::playHttpWavRef(const String& url) { return HeadroomAudioResult::Unsupported; } - return playOwnedWav(wav, offset, true); + return playOrEnqueue(wav, offset); } HeadroomAudioResult HeadroomAudio::playWavBytes(const uint8_t* wav, size_t length) { @@ -205,7 +209,7 @@ HeadroomAudioResult HeadroomAudio::playWavBytes(const uint8_t* wav, size_t lengt return HeadroomAudioResult::DecodeFailed; } memcpy(owned, wav, length); - return playOwnedWav(owned, length, true); + return 
playOrEnqueue(owned, length); } void HeadroomAudio::releaseActive() { @@ -219,6 +223,54 @@ void HeadroomAudio::releaseActive() { } } +void HeadroomAudio::clearQueue() { + for (size_t i = 0; i < queueCount_; ++i) { + size_t idx = (queueHead_ + i) % kMaxQueued; + if (queued_[idx]) { + free(queued_[idx]); + queued_[idx] = nullptr; + queuedLen_[idx] = 0; + } + } + queueHead_ = 0; + queueCount_ = 0; +} + +bool HeadroomAudio::enqueueOwned(uint8_t* wav, size_t length) { + if (queueCount_ >= kMaxQueued) { + Serial.printf("audio queue full (%u), dropping chunk\n", static_cast(kMaxQueued)); + free(wav); + return false; + } + size_t idx = (queueHead_ + queueCount_) % kMaxQueued; + queued_[idx] = wav; + queuedLen_[idx] = length; + queueCount_++; + return true; +} + +HeadroomAudioResult HeadroomAudio::playOrEnqueue(uint8_t* wav, size_t length) { + if (!M5.Speaker.isPlaying() && queueCount_ == 0 && !activeWav_) { + return playOwnedWav(wav, length, true); + } + return enqueueOwned(wav, length) ? HeadroomAudioResult::Ok : HeadroomAudioResult::TooLarge; +} + +void HeadroomAudio::startNextIfIdle() { + if (activeWav_ || M5.Speaker.isPlaying() || queueCount_ == 0) { + return; + } + uint8_t* wav = queued_[queueHead_]; + size_t length = queuedLen_[queueHead_]; + queued_[queueHead_] = nullptr; + queuedLen_[queueHead_] = 0; + queueHead_ = (queueHead_ + 1) % kMaxQueued; + queueCount_--; + if (wav) { + playOwnedWav(wav, length, true); + } +} + HeadroomAudioResult HeadroomAudio::playOwnedWav(uint8_t* wav, size_t length, bool takeOwnership) { releaseActive(); M5.Speaker.stop(); diff --git a/firmware/atoms3r-headroom/src/headroom_audio.h b/firmware/atoms3r-headroom/src/headroom_audio.h index 9927505..7359479 100644 --- a/firmware/atoms3r-headroom/src/headroom_audio.h +++ b/firmware/atoms3r-headroom/src/headroom_audio.h @@ -35,7 +35,22 @@ class HeadroomAudio { uint8_t* activeWav_ = nullptr; size_t activeWavLength_ = 0; + // Bounded FIFO of decoded WAV chunks waiting to play. 
Server-side + // sentence chunking delivers an ordered burst of small refs; without + // this queue each newly arrived chunk would M5.Speaker.stop() and + // truncate the one still playing. Chunks are small (the server caps + // the text length), so a shallow queue is enough. + static constexpr size_t kMaxQueued = 8; + uint8_t* queued_[kMaxQueued] = {nullptr}; + size_t queuedLen_[kMaxQueued] = {0}; + size_t queueHead_ = 0; + size_t queueCount_ = 0; + void releaseActive(); + void clearQueue(); + bool enqueueOwned(uint8_t* wav, size_t length); + HeadroomAudioResult playOrEnqueue(uint8_t* wav, size_t length); + void startNextIfIdle(); HeadroomAudioResult playOwnedWav(uint8_t* wav, size_t length, bool takeOwnership); bool inspectWav(const uint8_t* wav, size_t length, int* sampleRate, size_t* dataBytes, uint16_t* bitsPerSample, uint16_t* channels); String absoluteUrl(const String& url) const; diff --git a/firmware/atoms3r-headroom/src/headroom_transport.cpp b/firmware/atoms3r-headroom/src/headroom_transport.cpp index 778da58..fe62cfe 100644 --- a/firmware/atoms3r-headroom/src/headroom_transport.cpp +++ b/firmware/atoms3r-headroom/src/headroom_transport.cpp @@ -221,6 +221,11 @@ void HeadroomTransport::handleAudioPayload(JsonDocument& doc, const String& type } if (result != HeadroomAudioResult::Ignored) { Serial.printf("audio playback failed result=%d type=%s\n", static_cast(result), type.c_str()); + // No audio for this chunk: stop the mouth so it does not keep + // flapping from the independent tts_mouth stream (phantom 口パク). 
+ if (faceState_) { + faceState_->mouthOpen = 0.0f; + } setExpression(HeadroomExpression::Failed); } } From 2c2a3001102a3608c1dda544fd9e045d68aa5ba1 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 10:42:00 +0900 Subject: [PATCH 8/9] operator stack: cap TTS chunk size for the AtomS3R ingress Field bug after the chunking work: long replies were still mouth-only with a burst of static at the end. The atoms3r bridge log showed the Atom rejecting ~800 KB WAVs with HTTP 413 payload_too_large; the rare small sentence slipped through and a near-cap one played truncated (the static). The 120-char chunk default was never calibrated to the Atom HTTP ingress cap (~250 KB accepted in practice), and one Hermes sentence (<=120 chars) was not split at all. restart-operator-stack-in-place.sh now exports MH_TTS_CHUNK_MAX_CHARS (default 24 ~= ~3 s ~= ~150 KB WAV) into the stack so the Atom-facing pipeline chunks small enough to be accepted. The global code default stays 120 for browser/PC. Verified live: long utterance now splits into ~110-165 KB WAV chunks, all forwarded with no 413. Co-Authored-By: Claude Opus 4.7 --- scripts/restart-operator-stack-in-place.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/restart-operator-stack-in-place.sh b/scripts/restart-operator-stack-in-place.sh index 114781b..c158919 100755 --- a/scripts/restart-operator-stack-in-place.sh +++ b/scripts/restart-operator-stack-in-place.sh @@ -220,6 +220,10 @@ fi if [[ -n "$ASR_BASE_URL" ]]; then append_env "MH_OPERATOR_ASR_BASE_URL" "$ASR_BASE_URL" fi +# Keep each synthesized TTS chunk small enough that its WAV stays under +# the AtomS3R HTTP ingress payload cap (~250 KB accepted in practice). +# ~24 JA chars ≈ ~3 s ≈ ~150 KB WAV. Override by exporting the var. 
+append_env "MH_TTS_CHUNK_MAX_CHARS" "${MH_TTS_CHUNK_MAX_CHARS:-24}" stack_launch+=" bash -lc " printf -v quoted_stack_cmd '%q' "$STACK_CMD" stack_launch+="$quoted_stack_cmd" From 5b0b3ef421eca599e53de5024d11492caf41ec88 Mon Sep 17 00:00:00 2001 From: amariichi <68761912+amariichi@users.noreply.github.com> Date: Sun, 17 May 2026 10:49:37 +0900 Subject: [PATCH 9/9] Revert AtomS3R audio FIFO (Step 3): it corrupted playback Once server-side chunking made WAVs small enough to be accepted (no more 413), every chunk played as loud static with faint voice. The regression was introduced solely by the Step 3 FIFO; the validated Milestone 5 single-play path was clean. Serial showed no firmware rejection, so the WAV was accepted and "played" but corrupted - consistent with a buffer-lifetime/scheduling bug around the async M5.Speaker.playWav in the queue. Restore headroom_audio.{cpp,h} and headroom_transport.cpp to the validated 2579dcc state, rebuilt and reflashed. Server-side chunking (Steps 1/2) and the small MH_TTS_CHUNK_MAX_CHARS budget are kept, so the Atom receives small WAVs played one-at-a-time by the known-good path. A correct Atom playback queue is deferred to an isolated rework with on-device serial validation (see PLANS_48). This reverts the firmware portion of ed6f4bf only. 
Co-Authored-By: Claude Opus 4.7 --- .../atoms3r-headroom/src/headroom_audio.cpp | 60 ++----------------- .../atoms3r-headroom/src/headroom_audio.h | 15 ----- .../src/headroom_transport.cpp | 5 -- 3 files changed, 4 insertions(+), 76 deletions(-) diff --git a/firmware/atoms3r-headroom/src/headroom_audio.cpp b/firmware/atoms3r-headroom/src/headroom_audio.cpp index 9e757ce..366f620 100644 --- a/firmware/atoms3r-headroom/src/headroom_audio.cpp +++ b/firmware/atoms3r-headroom/src/headroom_audio.cpp @@ -36,12 +36,10 @@ void HeadroomAudio::begin(const HeadroomSettingsData& settings) { void HeadroomAudio::loop() { releaseActive(); - startNextIfIdle(); } void HeadroomAudio::stop() { M5.Speaker.stop(); - clearQueue(); releaseActive(); } @@ -56,9 +54,7 @@ void HeadroomAudio::restoreAfterRecording() { } bool HeadroomAudio::busy() const { - // Stay "busy" across the gap between queued chunks so the face holds - // the Speaking expression instead of flickering to Neutral mid-reply. - return M5.Speaker.isPlaying() || queueCount_ > 0; + return M5.Speaker.isPlaying(); } HeadroomAudioResult HeadroomAudio::playBase64Wav(const char* audioBase64, size_t base64Length, int sampleRateHint) { @@ -109,7 +105,7 @@ HeadroomAudioResult HeadroomAudio::playBase64Wav(const char* audioBase64, size_t return HeadroomAudioResult::TooLarge; } - return playOrEnqueue(wav, decodedLength); + return playOwnedWav(wav, decodedLength, true); } HeadroomAudioResult HeadroomAudio::playHttpWavRef(const String& url) { @@ -182,7 +178,7 @@ HeadroomAudioResult HeadroomAudio::playHttpWavRef(const String& url) { return HeadroomAudioResult::Unsupported; } - return playOrEnqueue(wav, offset); + return playOwnedWav(wav, offset, true); } HeadroomAudioResult HeadroomAudio::playWavBytes(const uint8_t* wav, size_t length) { @@ -209,7 +205,7 @@ HeadroomAudioResult HeadroomAudio::playWavBytes(const uint8_t* wav, size_t lengt return HeadroomAudioResult::DecodeFailed; } memcpy(owned, wav, length); - return playOrEnqueue(owned, 
length); + return playOwnedWav(owned, length, true); } void HeadroomAudio::releaseActive() { @@ -223,54 +219,6 @@ void HeadroomAudio::releaseActive() { } } -void HeadroomAudio::clearQueue() { - for (size_t i = 0; i < queueCount_; ++i) { - size_t idx = (queueHead_ + i) % kMaxQueued; - if (queued_[idx]) { - free(queued_[idx]); - queued_[idx] = nullptr; - queuedLen_[idx] = 0; - } - } - queueHead_ = 0; - queueCount_ = 0; -} - -bool HeadroomAudio::enqueueOwned(uint8_t* wav, size_t length) { - if (queueCount_ >= kMaxQueued) { - Serial.printf("audio queue full (%u), dropping chunk\n", static_cast(kMaxQueued)); - free(wav); - return false; - } - size_t idx = (queueHead_ + queueCount_) % kMaxQueued; - queued_[idx] = wav; - queuedLen_[idx] = length; - queueCount_++; - return true; -} - -HeadroomAudioResult HeadroomAudio::playOrEnqueue(uint8_t* wav, size_t length) { - if (!M5.Speaker.isPlaying() && queueCount_ == 0 && !activeWav_) { - return playOwnedWav(wav, length, true); - } - return enqueueOwned(wav, length) ? 
HeadroomAudioResult::Ok : HeadroomAudioResult::TooLarge; -} - -void HeadroomAudio::startNextIfIdle() { - if (activeWav_ || M5.Speaker.isPlaying() || queueCount_ == 0) { - return; - } - uint8_t* wav = queued_[queueHead_]; - size_t length = queuedLen_[queueHead_]; - queued_[queueHead_] = nullptr; - queuedLen_[queueHead_] = 0; - queueHead_ = (queueHead_ + 1) % kMaxQueued; - queueCount_--; - if (wav) { - playOwnedWav(wav, length, true); - } -} - HeadroomAudioResult HeadroomAudio::playOwnedWav(uint8_t* wav, size_t length, bool takeOwnership) { releaseActive(); M5.Speaker.stop(); diff --git a/firmware/atoms3r-headroom/src/headroom_audio.h b/firmware/atoms3r-headroom/src/headroom_audio.h index 7359479..9927505 100644 --- a/firmware/atoms3r-headroom/src/headroom_audio.h +++ b/firmware/atoms3r-headroom/src/headroom_audio.h @@ -35,22 +35,7 @@ class HeadroomAudio { uint8_t* activeWav_ = nullptr; size_t activeWavLength_ = 0; - // Bounded FIFO of decoded WAV chunks waiting to play. Server-side - // sentence chunking delivers an ordered burst of small refs; without - // this queue each newly arrived chunk would M5.Speaker.stop() and - // truncate the one still playing. Chunks are small (the server caps - // the text length), so a shallow queue is enough. 
- static constexpr size_t kMaxQueued = 8; - uint8_t* queued_[kMaxQueued] = {nullptr}; - size_t queuedLen_[kMaxQueued] = {0}; - size_t queueHead_ = 0; - size_t queueCount_ = 0; - void releaseActive(); - void clearQueue(); - bool enqueueOwned(uint8_t* wav, size_t length); - HeadroomAudioResult playOrEnqueue(uint8_t* wav, size_t length); - void startNextIfIdle(); HeadroomAudioResult playOwnedWav(uint8_t* wav, size_t length, bool takeOwnership); bool inspectWav(const uint8_t* wav, size_t length, int* sampleRate, size_t* dataBytes, uint16_t* bitsPerSample, uint16_t* channels); String absoluteUrl(const String& url) const; diff --git a/firmware/atoms3r-headroom/src/headroom_transport.cpp b/firmware/atoms3r-headroom/src/headroom_transport.cpp index fe62cfe..778da58 100644 --- a/firmware/atoms3r-headroom/src/headroom_transport.cpp +++ b/firmware/atoms3r-headroom/src/headroom_transport.cpp @@ -221,11 +221,6 @@ void HeadroomTransport::handleAudioPayload(JsonDocument& doc, const String& type } if (result != HeadroomAudioResult::Ignored) { Serial.printf("audio playback failed result=%d type=%s\n", static_cast(result), type.c_str()); - // No audio for this chunk: stop the mouth so it does not keep - // flapping from the independent tts_mouth stream (phantom 口パク). - if (faceState_) { - faceState_->mouthOpen = 0.0f; - } setExpression(HeadroomExpression::Failed); } }