diff --git a/README.md b/README.md index 27d0acb..b4a9c1c 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,8 @@ flowchart LR MCP[MCP Serverface_event / face_say / face_ping] WS[face-appWebSocket + HTTP :8765] FE[Frontend UIBrowser] + ATOM[AtomS3R Device2D face LCD + Echo speaker + PTT mic] + ATOMBR[atoms3r-http-bridge] BR[operator-bridge] ASRP[/POST /api/operator/asr/] ASR[asr-workerParakeet ASRJA/EN] @@ -73,13 +75,18 @@ flowchart LR U -- Direct prompt --> TMUX U -- PTT recording --> FE U -- Text input --> FE + U -- PTT button + voice --> ATOM + ATOM -- 2D face + Echo audio --> U FE -- Audio binary --> ASRP + ATOM -- Mic WAV (POST /api/operator/asr) --> ASRP ASRP -- JSON (audioBase64,mimeType,lang) --> ASR ASR -- JSON transcript --> ASRP ASRP -- Transcript --> FE + ASRP -- Transcript --> ATOM FE -- operator_response JSON --> WS + ATOM -- operator_response (POST /api/operator/response) --> WS WS -- relay --> BR BR -- tmux send-keys --> TMUX TMUX --> C @@ -96,6 +103,10 @@ flowchart LR WS -- say payload --> TTS TTS -- audio + tts state --> FE + WS -- face/tts payloads (WS) --> ATOMBR + ATOMBR -- POST /api/headroom/payload --> ATOM + ATOMBR -- POST /api/headroom/audio --> ATOM + FE <-- HTTPS/WS --> TS TS <---> WS ``` @@ -108,6 +119,8 @@ sequenceDiagram participant U as User participant TS as Tailscale (optional) participant FE as Frontend UI + participant ATOM as AtomS3R Device + participant ATOMBR as atoms3r-http-bridge participant FA as face-app (:8765, /ws, /api/operator/asr) participant ASR as asr-worker (Parakeet) participant BR as operator-bridge @@ -123,6 +136,7 @@ sequenceDiagram FE->>FA: Connect WebSocket /ws BR->>FA: Connect WebSocket /ws + ATOMBR->>FA: Connect WebSocket /ws alt Input path A: direct terminal prompt U->>TM: Type prompt @@ -144,6 +158,16 @@ sequenceDiagram FA-->>BR: Relay payload BR->>TM: tmux send-keys(text + Enter) TM->>CX: Prompt arrives + else Input path D: AtomS3R PTT + U->>ATOM: Hold PTT button + ATOM->>FA: POST /api/operator/asr?lang=ja|en (WAV) + FA->>ASR: /v1/asr/ja|en (audioBase64,mimeType) + ASR-->>FA: Transcript JSON + FA-->>ATOM: Transcript response + ATOM->>FA: POST /api/operator/response (text) + FA-->>BR: Relay payload + BR->>TM: tmux send-keys(text + Enter) + TM->>CX: Prompt arrives end loop During work @@ -156,10 +180,15 @@ sequenceDiagram CX->>MCP: face_event / face_say / face_ping MCP->>FA: Forward WebSocket JSON FA-->>FE: event/say/state payloads + FA-->>ATOMBR: event/say/state payloads (WS) + ATOMBR->>ATOM: POST /api/headroom/payload FA->>TTS: TTS request TTS-->>FA: tts_audio / tts_mouth / say_result FA-->>FE: Realtime status + audio + FA-->>ATOMBR: tts_audio / tts_mouth (WS) + ATOMBR->>ATOM: POST /api/headroom/audio + /payload + ATOM-->>U: 2D face on LCD + Echo speaker FE-->>U: Voice, facial state, and status updates ``` @@ -535,6 +564,8 @@ flowchart LR MCP[MCP サーバーface_event / face_say / face_ping] WS[face-appWebSocket + HTTP :8765] FE[フロントエンド UIブラウザ] + ATOM[AtomS3R 端末2D顔 LCD + Echoスピーカ + PTTマイク] + ATOMBR[atoms3r-http-bridge] BR[operator-bridge] ASRP[/POST /api/operator/asr/] ASR[asr-workerParakeet ASRJA/EN] @@ -544,13 +575,18 @@ flowchart LR U -- 直接プロンプト --> TMUX U -- PTT録音 --> FE U -- テキスト入力 --> FE + U -- PTTボタン + 発話 --> ATOM + ATOM -- 2D顔 + Echo音声 --> U FE -- 音声バイナリ --> ASRP + ATOM -- マイクWAV (POST /api/operator/asr) --> ASRP ASRP -- JSON (audioBase64,mimeType,lang) --> ASR ASR -- 文字起こしJSON --> ASRP ASRP -- 文字起こし結果 --> FE + ASRP -- 文字起こし結果 --> ATOM FE -- operator_response JSON --> WS + ATOM -- operator_response (POST /api/operator/response) --> WS WS -- relay --> BR BR -- tmux send-keys --> TMUX TMUX --> C @@ -567,6 +603,10 @@ flowchart LR WS -- say payload --> TTS TTS -- audio + tts state --> FE + WS -- face/tts payloads (WS) --> ATOMBR + ATOMBR -- POST /api/headroom/payload --> ATOM + ATOMBR -- POST /api/headroom/audio --> ATOM + FE <-- HTTPS/WS --> TS TS <---> WS ``` @@ -579,6 +619,8 @@ sequenceDiagram participant U as ユーザー participant TS as Tailscale (任意) participant FE as Frontend UI + participant ATOM as AtomS3R 端末 + participant ATOMBR as atoms3r-http-bridge participant FA as face-app (:8765, /ws, /api/operator/asr) participant ASR as asr-worker (Parakeet) participant BR as operator-bridge @@ -594,6 +636,7 @@ sequenceDiagram FE->>FA: WebSocket /ws 接続 BR->>FA: WebSocket /ws 接続 + ATOMBR->>FA: WebSocket /ws 接続 alt 入力経路A: 端末直接入力 U->>TM: プロンプトを入力 @@ -615,6 +658,16 @@ sequenceDiagram FA-->>BR: payload relay BR->>TM: tmux send-keys(text + Enter) TM->>CX: プロンプト到達 + else 入力経路D: AtomS3R PTT + U->>ATOM: PTTボタンを押下 + ATOM->>FA: POST /api/operator/asr?lang=ja|en (WAV) + FA->>ASR: /v1/asr/ja|en (audioBase64,mimeType) + ASR-->>FA: 文字起こしJSON + FA-->>ATOM: 文字起こし結果 + ATOM->>FA: POST /api/operator/response (text) + FA-->>BR: payload relay + BR->>TM: tmux send-keys(text + Enter) + TM->>CX: プロンプト到達 end loop 作業中 @@ -627,10 +680,15 @@ sequenceDiagram CX->>MCP: face_event / face_say / face_ping MCP->>FA: WebSocket JSON転送 FA-->>FE: event/say/state payloads + FA-->>ATOMBR: event/say/state payloads (WS) + ATOMBR->>ATOM: POST /api/headroom/payload FA->>TTS: TTS request TTS-->>FA: tts_audio / tts_mouth / say_result FA-->>FE: リアルタイム状態 + 音声 + FA-->>ATOMBR: tts_audio / tts_mouth (WS) + ATOMBR->>ATOM: POST /api/headroom/audio + /payload + ATOM-->>U: 2D顔 (LCD) + Echoスピーカ FE-->>U: 音声・表情・状態を表示 ``` diff --git a/asr-worker/pyproject.toml b/asr-worker/pyproject.toml index b1c62c9..36de7dd 100644 --- a/asr-worker/pyproject.toml +++ b/asr-worker/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "asr-worker" -version = "1.17.3" +version = "1.17.4" description = "Local ASR worker for english-trainer (Parakeet EN/JA routing)" readme = "README.md" requires-python = ">=3.10" diff --git a/asr-worker/uv.lock b/asr-worker/uv.lock index 52add71..7c6dca5 100644 --- a/asr-worker/uv.lock +++ b/asr-worker/uv.lock @@ -247,7 +247,7 @@ wheels = [ [[package]] name = "asr-worker" -version = "1.17.3" +version = "1.17.4" source = { editable = "." } dependencies = [ { name = "fastapi" }, diff --git a/doc/diagrams/high-level-flow.mmd b/doc/diagrams/high-level-flow.mmd index 13f0981..5521817 100644 --- a/doc/diagrams/high-level-flow.mmd +++ b/doc/diagrams/high-level-flow.mmd @@ -5,6 +5,8 @@ flowchart LR MCP["MCP Server\nface_event / face_say / face_ping"] WS["face-app\nWebSocket + HTTP :8765"] FE["Frontend UI\nBrowser"] + ATOM["AtomS3R Device\n2D face LCD + Echo speaker + PTT mic"] + ATOMBR[atoms3r-http-bridge] BR[operator-bridge] ASRP[/POST /api/operator/asr/] ASR["asr-worker\nParakeet ASR\nJA/EN"] @@ -14,13 +16,18 @@ flowchart LR U -- Direct prompt --> TMUX U -- PTT recording --> FE U -- Text input --> FE + U -- PTT button + voice --> ATOM + ATOM -- 2D face + Echo audio --> U FE -- Audio binary --> ASRP + ATOM -- Mic WAV (POST /api/operator/asr) --> ASRP ASRP -- JSON (audioBase64,mimeType,lang) --> ASR ASR -- JSON transcript --> ASRP ASRP -- Transcript --> FE + ASRP -- Transcript --> ATOM FE -- operator_response JSON --> WS + ATOM -- operator_response (POST /api/operator/response) --> WS WS -- relay --> BR BR -- tmux send-keys --> TMUX TMUX --> C @@ -37,5 +44,9 @@ flowchart LR WS -- say payload --> TTS TTS -- audio + tts state --> FE + WS -- face/tts payloads (WS) --> ATOMBR + ATOMBR -- POST /api/headroom/payload --> ATOM + ATOMBR -- POST /api/headroom/audio --> ATOM + FE <-- HTTPS/WS --> TS TS <---> WS diff --git a/doc/diagrams/high-level-flow.png b/doc/diagrams/high-level-flow.png index 0bfac3e..e737011 100644 Binary files a/doc/diagrams/high-level-flow.png and b/doc/diagrams/high-level-flow.png differ diff --git a/doc/diagrams/high-level-flow.svg b/doc/diagrams/high-level-flow.svg index d2bed5e..277698a 100644 --- a/doc/diagrams/high-level-flow.svg +++ b/doc/diagrams/high-level-flow.svg @@ -1 +1 @@ -Direct promptPTT recordingText inputAudio binaryJSON(audioBase64,mimeType,lang)JSON transcriptTranscriptoperator_response JSONrelaytmux send-keysWork logs / resultscapture-pane (500ms,change-only)operator_terminal_snapshotstdio tool callsWebSocket JSONsay payloadaudio + tts stateHTTPS/WSUsertmux Terminal\nAgent paneCoding AgentMCP Server\nface_event / face_say / face_pingface-app\nWebSocket + HTTP :8765Frontend UI\nBrowseroperator-bridgePOST /api/operator/asrasr-worker\nParakeet ASR\nJA/ENtts-worker\nKokoro TTSTailscale VPN / serve \ No newline at end of file +Direct promptPTT recordingText inputPTT button + voice2D face + Echo audioAudio binaryMic WAV (POST/api/operator/asr)JSON(audioBase64,mimeType,lang)JSON transcriptTranscriptTranscriptoperator_response JSONoperator_response (POST/api/operator/response)relaytmux send-keysWork logs / resultscapture-pane (500ms,change-only)operator_terminal_snapshotstdio tool callsWebSocket JSONsay payloadaudio + tts stateface/tts payloads (WS)POST/api/headroom/payloadPOST /api/headroom/audioHTTPS/WSUsertmux Terminal\nAgent paneCoding AgentMCP Server\nface_event / face_say / face_pingface-app\nWebSocket + HTTP :8765Frontend UI\nBrowserAtomS3R Device\n2D face LCD + Echo speaker + PTT micatoms3r-http-bridgeoperator-bridgePOST /api/operator/asrasr-worker\nParakeet ASR\nJA/ENtts-worker\nKokoro TTSTailscale VPN / serve \ No newline at end of file diff --git a/doc/diagrams/sequence-timeline.mmd b/doc/diagrams/sequence-timeline.mmd index 5b6630d..ffac5da 100644 --- a/doc/diagrams/sequence-timeline.mmd +++ b/doc/diagrams/sequence-timeline.mmd @@ -3,6 +3,8 @@ sequenceDiagram participant U as User participant TS as Tailscale (optional) participant FE as Frontend UI + participant ATOM as AtomS3R Device + participant ATOMBR as atoms3r-http-bridge participant FA as face-app (:8765, /ws, /api/operator/asr) participant ASR as asr-worker (Parakeet) participant BR as operator-bridge @@ -18,6 +20,7 @@ sequenceDiagram FE->>FA: Connect WebSocket /ws BR->>FA: Connect WebSocket /ws + ATOMBR->>FA: Connect WebSocket /ws alt Input path A: direct terminal prompt U->>TM: Type prompt @@ -39,6 +42,16 @@ sequenceDiagram FA-->>BR: Relay payload BR->>TM: tmux send-keys(text + Enter) TM->>CX: Prompt arrives + else Input path D: AtomS3R PTT + U->>ATOM: Hold PTT button + ATOM->>FA: POST /api/operator/asr?lang=ja|en (WAV) + FA->>ASR: /v1/asr/ja|en (audioBase64,mimeType) + ASR-->>FA: Transcript JSON + FA-->>ATOM: Transcript response + ATOM->>FA: POST /api/operator/response (text) + FA-->>BR: Relay payload + BR->>TM: tmux send-keys(text + Enter) + TM->>CX: Prompt arrives end loop During work @@ -51,8 +64,13 @@ sequenceDiagram CX->>MCP: face_event / face_say / face_ping MCP->>FA: Forward WebSocket JSON FA-->>FE: event/say/state payloads + FA-->>ATOMBR: event/say/state payloads (WS) + ATOMBR->>ATOM: POST /api/headroom/payload FA->>TTS: TTS request TTS-->>FA: tts_audio / tts_mouth / say_result FA-->>FE: Realtime status + audio + FA-->>ATOMBR: tts_audio / tts_mouth (WS) + ATOMBR->>ATOM: POST /api/headroom/audio + /payload + ATOM-->>U: 2D face on LCD + Echo speaker FE-->>U: Voice, facial state, and status updates diff --git a/doc/diagrams/sequence-timeline.png b/doc/diagrams/sequence-timeline.png index 5916c2f..ed2f9de 100644 Binary files a/doc/diagrams/sequence-timeline.png and b/doc/diagrams/sequence-timeline.png differ diff --git a/doc/diagrams/sequence-timeline.svg b/doc/diagrams/sequence-timeline.svg index 80f08e3..c2b7e59 100644 --- a/doc/diagrams/sequence-timeline.svg +++ b/doc/diagrams/sequence-timeline.svg @@ -1 +1 @@ -tts-worker (Kokoro)mcp-serverCoding Agenttmux (Agent pane)operator-bridgeasr-worker (Parakeet)face-app (:8765, /ws, /api/operator/asr)Frontend UITailscale (optional)Usertts-worker (Kokoro)mcp-serverCoding Agenttmux (Agent pane)operator-bridgeasr-worker (Parakeet)face-app (:8765, /ws, /api/operator/asr)Frontend UITailscale (optional)Useropt[Remote access]alt[Input path A: direct terminal prompt][Input path B: frontend PTT][Input path C: frontend text]loop[During work]Open Face UI URL1Serve forwarded UI2Connect WebSocket /ws3Connect WebSocket /ws4Type prompt5Prompt arrives6Hold PTT JA/EN7POST /api/operator/asr?lang=ja|en (audio)8/v1/asr/ja|en (audioBase64,mimeType)9Transcript JSON10Transcript response11Tap Send12operator_response{text}13Relay payload14tmux send-keys(text + Enter)15Prompt arrives16Enter text + Send Text17operator_response{text}18Relay payload19tmux send-keys(text + Enter)20Prompt arrives21Progress/result logs22capture-pane -e (500ms)23operator_terminal_snapshot24Terminal mirror update25face_event / face_say / face_ping26Forward WebSocket JSON27event/say/state payloads28TTS request29tts_audio / tts_mouth / say_result30Realtime status + audio31Voice, facial state, and status updates32 \ No newline at end of file +tts-worker (Kokoro)mcp-serverCoding Agenttmux (Agent pane)operator-bridgeasr-worker (Parakeet)face-app (:8765, /ws, /api/operator/asr)atoms3r-http-bridgeAtomS3R DeviceFrontend UITailscale (optional)Usertts-worker (Kokoro)mcp-serverCoding Agenttmux (Agent pane)operator-bridgeasr-worker (Parakeet)face-app (:8765, /ws, /api/operator/asr)atoms3r-http-bridgeAtomS3R DeviceFrontend UITailscale (optional)Useropt[Remote access]alt[Input path A: direct terminal prompt][Input path B: frontend PTT][Input path C: frontend text][Input path D: AtomS3R PTT]loop[During work]Open Face UI URL1Serve forwarded UI2Connect WebSocket /ws3Connect WebSocket /ws4Connect WebSocket /ws5Type prompt6Prompt arrives7Hold PTT JA/EN8POST /api/operator/asr?lang=ja|en (audio)9/v1/asr/ja|en (audioBase64,mimeType)10Transcript JSON11Transcript response12Tap Send13operator_response{text}14Relay payload15tmux send-keys(text + Enter)16Prompt arrives17Enter text + Send Text18operator_response{text}19Relay payload20tmux send-keys(text + Enter)21Prompt arrives22Hold PTT button23POST /api/operator/asr?lang=ja|en (WAV)24/v1/asr/ja|en (audioBase64,mimeType)25Transcript JSON26Transcript response27POST /api/operator/response (text)28Relay payload29tmux send-keys(text + Enter)30Prompt arrives31Progress/result logs32capture-pane -e (500ms)33operator_terminal_snapshot34Terminal mirror update35face_event / face_say / face_ping36Forward WebSocket JSON37event/say/state payloads38event/say/state payloads (WS)39POST /api/headroom/payload40TTS request41tts_audio / tts_mouth / say_result42Realtime status + audio43tts_audio / tts_mouth (WS)44POST /api/headroom/audio + /payload452D face on LCD + Echo speaker46Voice, facial state, and status updates47 \ No newline at end of file diff --git a/mcp-server/dist/index.js b/mcp-server/dist/index.js index b312f1b..dd95a59 100644 --- a/mcp-server/dist/index.js +++ b/mcp-server/dist/index.js @@ -3,7 +3,7 @@ import { randomUUID } from 'node:crypto'; import { createFramedMessageParser, writeMessage } from './mcp_stdio.js'; const SERVER_NAME = 'minimum-headroom'; -const SERVER_VERSION = '1.17.3'; +const SERVER_VERSION = '1.17.4'; const PROTOCOL_VERSION = '2024-11-05'; const FACE_WS_URL = process.env.FACE_WS_URL ?? 'ws://127.0.0.1:8765/ws'; const FACE_AUTH_TOKEN = (() => { diff --git a/package.json b/package.json index 67a25f7..af427a4 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "minimum-headroom", - "version": "1.17.3", + "version": "1.17.4", "private": true, "type": "module", "scripts": { diff --git a/tts-worker/pyproject.toml b/tts-worker/pyproject.toml index 9d31d04..c3cd848 100644 --- a/tts-worker/pyproject.toml +++ b/tts-worker/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "minimum-headroom-tts-worker" -version = "1.17.3" +version = "1.17.4" description = "Minimum Headroom TTS worker (Kokoro ONNX default, optional Qwen3-TTS)" readme = "README.md" requires-python = ">=3.12" diff --git a/tts-worker/uv.lock b/tts-worker/uv.lock index 6d3d72b..9b3c7cd 100644 --- a/tts-worker/uv.lock +++ b/tts-worker/uv.lock @@ -331,7 +331,7 @@ wheels = [ [[package]] name = "minimum-headroom-tts-worker" -version = "1.17.3" +version = "1.17.4" source = { editable = "." } dependencies = [ { name = "fugashi" },
User
tmux Terminal\nAgent pane
Coding Agent
MCP Server\nface_event / face_say / face_ping
face-app\nWebSocket + HTTP :8765
Frontend UI\nBrowser
operator-bridge
POST /api/operator/asr
asr-worker\nParakeet ASR\nJA/EN
tts-worker\nKokoro TTS
Tailscale VPN / serve
AtomS3R Device\n2D face LCD + Echo speaker + PTT mic
atoms3r-http-bridge