From 12dc9b5c02b2fcc58f2feae4a683c71fff590f73 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 05:38:45 +0000 Subject: [PATCH 1/3] Add Meet bot scaffold: anonymous guest join + lobby handling Phase 1 of Google Meet speaker-identity integration. Playwright bot that joins a Meet URL as an unauthenticated guest and waits to be admitted. Audio capture, DOM scraping for active speaker, and Polyglot WebSocket wiring come in subsequent phases. All Meet DOM selectors are centralized in selectors.js so future UI rotations are a one-file fix. https://claude.ai/code/session_019SWkcdJekyEmJqkwSPMbPH --- meet-bot/.gitignore | 3 + meet-bot/README.md | 61 +++++++++++++++++ meet-bot/index.js | 155 ++++++++++++++++++++++++++++++++++++++++++ meet-bot/package.json | 17 +++++ meet-bot/selectors.js | 51 ++++++++++++++ 5 files changed, 287 insertions(+) create mode 100644 meet-bot/.gitignore create mode 100644 meet-bot/README.md create mode 100644 meet-bot/index.js create mode 100644 meet-bot/package.json create mode 100644 meet-bot/selectors.js diff --git a/meet-bot/.gitignore b/meet-bot/.gitignore new file mode 100644 index 0000000..2e6fae9 --- /dev/null +++ b/meet-bot/.gitignore @@ -0,0 +1,3 @@ +node_modules/ +package-lock.json +*.log diff --git a/meet-bot/README.md b/meet-bot/README.md new file mode 100644 index 0000000..12d6416 --- /dev/null +++ b/meet-bot/README.md @@ -0,0 +1,61 @@ +# Polyglot Meet Bot + +Headless Chromium bot that joins a Google Meet as an anonymous guest. Future phases will stream meeting audio and active-speaker names back to the Polyglot server; this initial phase validates only the join-and-get-admitted flow. + +## Setup + +```bash +cd meet-bot +npm install +npx playwright install chromium +``` + +Node 20+ required. 
+ +## Run + +```bash +# Typical use — fully headless: +node index.js --url "https://meet.google.com/xxx-yyyy-zzz" + +# Watch what the bot sees (debug Meet UI issues): +node index.js --url "https://meet.google.com/xxx-yyyy-zzz" --headful + +# Override the displayed name (default "Polyglot Bot"): +node index.js --url "..." --name "Transcription Bot" +``` + +## What it does (phase 1) + +1. Launches a fresh, cookieless Chromium — no Google sign-in. +2. Opens the Meet URL, waits for the pre-join screen. +3. Fills the "Your name" field, mutes mic + camera, clicks **Ask to join**. +4. Waits up to 2 minutes for the host to admit it. +5. Once admitted, stays connected until the meeting ends or it's removed. + +Exit codes: + +| Code | Meaning | +|------|---------| +| 0 | Joined successfully, then meeting ended / bot removed cleanly | +| 1 | Crash / unexpected error (see stderr) | +| 2 | Bad CLI arguments | +| 3 | Host explicitly denied the join request | +| 4 | Timed out in the lobby (host never admitted) | + +## Testing + +The easy test: open Meet in a normal browser tab, start a meeting as host, run the bot with `--headful --url "https://meet.google.com/xxx-yyyy-zzz"` (substituting your meeting's URL), and admit it from the participants panel when it shows up as "Polyglot Bot". You should see the bot's Chromium window join the call. + +## What's NOT here yet + +- Audio capture (tab audio → 16 kHz PCM16 → Polyglot WebSocket) +- DOM scraping of active-speaker name and participant roster +- WebSocket connection to the Polyglot backend +- Control channel (join/leave commands from Polyglot's admin UI) + +Those land in subsequent phases once we've validated the bot can reliably get into meetings. + +## Selectors + +All Meet DOM selectors live in `selectors.js`. When Meet ships a UI change and the bot breaks, that's the file to update — nothing else should need touching. 
diff --git a/meet-bot/index.js b/meet-bot/index.js new file mode 100644 index 0000000..e248fa4 --- /dev/null +++ b/meet-bot/index.js @@ -0,0 +1,155 @@ +// Polyglot Meet Bot — phase 1: anonymous guest join + lobby wait. +// +// Usage: +// node index.js --url https://meet.google.com/xxx-yyyy-zzz [--name "Polyglot Bot"] [--headful] +// +// Exits 0 on clean leave, non-zero on join failure / denial / crash. Audio +// capture and DOM speaker-event scraping are not implemented yet — this +// phase validates the riskiest piece (can we actually get into a meeting?) +// before wiring anything to Polyglot. + +import { chromium } from "playwright"; +import { SELECTORS, firstMatching } from "./selectors.js"; + +function parseArgs(argv) { + const args = { url: null, name: "Polyglot Bot", headful: false }; + for (let i = 2; i < argv.length; i++) { + const a = argv[i]; + if (a === "--url") args.url = argv[++i]; + else if (a === "--name") args.name = argv[++i]; + else if (a === "--headful") args.headful = true; + } + if (!args.url) { + console.error("Usage: node index.js --url [--name ] [--headful]"); + process.exit(2); + } + return args; +} + +function log(msg, ...rest) { + const ts = new Date().toISOString(); + console.log(`[${ts}] ${msg}`, ...rest); +} + +async function joinMeeting({ url, name, headful }) { + log(`Launching Chromium (headful=${headful})`); + + const browser = await chromium.launch({ + headless: !headful, + args: [ + // Auto-grant mic/cam permission prompts so Meet's pre-join screen + // doesn't block. The bot mutes both immediately after join. + "--use-fake-ui-for-media-stream", + // Some distros need this when running Chromium without a full desktop. + "--no-sandbox", + "--disable-dev-shm-usage", + ], + }); + + // Fresh context = incognito-equivalent. No cookies, no persistent profile, + // no Google sign-in. Meet treats us as an anonymous guest. 
+ const context = await browser.newContext({ + permissions: ["microphone", "camera"], + }); + const page = await context.newPage(); + + log(`Navigating to ${url}`); + await page.goto(url, { waitUntil: "domcontentloaded" }); + + // Fill the "Your name" field. Meet has had multiple implementations of + // this input across A/B rollouts, so we try each known selector. + log("Waiting for name input"); + const match = await firstMatching(page, SELECTORS.nameInput, 20000); + if (!match) { + throw new Error( + "Could not find the 'Your name' field. This meeting may require a signed-in Google account." + ); + } + log(`Filling name field (${match.selector}) with "${name}"`); + await match.element.fill(name); + + // Make sure mic + camera are off *before* joining so we don't blast audio + // into the meeting or show a black-tile camera. Meet's pre-join toggles + // default to on; we flip them if aria-label indicates they're currently on. + for (const [label, sel] of [ + ["microphone", SELECTORS.micToggle], + ["camera", SELECTORS.camToggle], + ]) { + const btn = await page.$(sel); + if (btn) { + const aria = (await btn.getAttribute("aria-label")) || ""; + // Meet writes "Turn off " when currently on, "Turn on " + // when currently off. We want them off pre-join. + if (/turn off/i.test(aria)) { + log(`Muting ${label} (was on)`); + await btn.click(); + } + } + } + + // Click the join button. Label varies: "Ask to join" (normal guest), + // "Join now" (pre-admitted). Try each known label in order. + let clicked = false; + for (const label of SELECTORS.joinButtonNames) { + const btn = page.getByRole("button", { name: label }); + if (await btn.count()) { + log(`Clicking "${label}"`); + await btn.first().click(); + clicked = true; + break; + } + } + if (!clicked) throw new Error("Could not find a Join / Ask-to-join button."); + + // Wait for one of three terminal states: + // 1. Leave-call button appears -> we're in the meeting. + // 2. "Denied" text appears -> host rejected us. 
+ // 3. Timeout -> still in lobby, host never admitted. + log("Waiting for host to admit from lobby (up to 2 min)…"); + const inMeeting = await Promise.race([ + page + .waitForSelector(SELECTORS.leaveCallButton, { timeout: 120000 }) + .then(() => "joined") + .catch(() => null), + page + .waitForFunction( + (pattern) => new RegExp(pattern.source, pattern.flags).test(document.body.innerText), + { source: SELECTORS.deniedText.source, flags: SELECTORS.deniedText.flags }, + { timeout: 120000 } + ) + .then(() => "denied") + .catch(() => null), + ]); + + if (inMeeting === "joined") { + log("JOINED — bot is in the meeting."); + // Keep the page alive so you can see it in Meet. Phase 2 will add + // audio capture + DOM scraping here. For now, stay connected until the + // leave button disappears (meeting ended / we were removed) or SIGINT. + await page + .waitForSelector(SELECTORS.leaveCallButton, { state: "detached", timeout: 0 }) + .catch(() => {}); + log("Leave-call button gone — meeting ended or bot was removed."); + return 0; + } + + if (inMeeting === "denied") { + log("DENIED — host rejected the join request."); + return 3; + } + + log("TIMED OUT in lobby (2 min) — host did not admit."); + return 4; +} + +(async () => { + const args = parseArgs(process.argv); + let code = 1; + try { + code = await joinMeeting(args); + } catch (err) { + log("ERROR:", err.message); + code = 1; + } + process.exit(code); +})(); diff --git a/meet-bot/package.json b/meet-bot/package.json new file mode 100644 index 0000000..26f9240 --- /dev/null +++ b/meet-bot/package.json @@ -0,0 +1,17 @@ +{ + "name": "polyglot-meet-bot", + "version": "0.1.0", + "description": "Headless Chromium bot that joins a Google Meet as an anonymous guest and streams audio + speaker identity back to Polyglot.", + "private": true, + "type": "module", + "scripts": { + "start": "node index.js", + "install-browsers": "playwright install chromium" + }, + "engines": { + "node": ">=20" + }, + "dependencies": { + 
"playwright": "^1.47.0" + } +} diff --git a/meet-bot/selectors.js b/meet-bot/selectors.js new file mode 100644 index 0000000..2d60dfa --- /dev/null +++ b/meet-bot/selectors.js @@ -0,0 +1,51 @@ +// Google Meet DOM selectors. +// +// Meet's CSS classes are obfuscated and rotate. This file centralizes every +// selector the bot relies on so a UI change is a one-file fix. Prefer stable +// anchors (aria-label, role, visible text) over class names. + +export const SELECTORS = { + // Pre-join screen (anonymous guest path) ------------------------------- + // The "Your name" input shown to signed-out users on the Meet landing page + // before joining. Meet has used multiple implementations; try in order. + nameInput: [ + 'input[aria-label="Your name"]', + 'input[placeholder="Your name"]', + 'input[jsname][type="text"]', + ], + + // The button that submits the pre-join form. Its label depends on meeting + // config: "Ask to join" when the host hasn't admitted you, "Join now" when + // you're the host or pre-admitted. Match on visible text via Playwright's + // getByRole('button', { name: ... }) at call sites — selector here is a + // fallback for the ARIA role. + joinButtonNames: ["Ask to join", "Join now", "Join"], + + // Pre-join sometimes prompts to turn off mic/cam — these buttons toggle + // them. Anchored on aria-label which Meet has kept stable for years. + micToggle: 'div[role="button"][aria-label*="microphone" i]', + camToggle: 'div[role="button"][aria-label*="camera" i]', + + // In-call indicators --------------------------------------------------- + // Presence of the leave-call button is the most reliable "we are in the + // meeting" signal. Its aria-label is "Leave call". + leaveCallButton: 'button[aria-label="Leave call"]', + + // Lobby / denial detection. When the host denies entry, Meet shows a + // message containing this text. 
+ deniedText: /You can't join this call|no one responded|denied/i, +}; + +// Helper: return the first selector from a list that matches something on +// the page. Used for resilient element lookup when Meet ships A/B variants. +export async function firstMatching(page, selectorList, timeoutMs = 15000) { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + for (const sel of selectorList) { + const el = await page.$(sel); + if (el) return { selector: sel, element: el }; + } + await page.waitForTimeout(250); + } + return null; +} From f7044f829ae4e7af385e29de8481634bb212cba9 Mon Sep 17 00:00:00 2001 From: Chabert Etienne Date: Tue, 21 Apr 2026 11:21:17 +0200 Subject: [PATCH 2/3] Wire Meet bot through Polyglot with captions-based speaker identity Complete end-to-end pipeline: the bot joins a Meet call, streams audio to Polyglot over Socket.IO, and resolves pyannote's SPEAKER_XX labels to real display names by overlapping diarization against a wall-clock speaker timeline built from Meet's live-captions DOM. - Bot audio capture: RTCPeerConnection init-script taps all remote audio tracks into __pgStream; AudioWorklet resamples to 16 kHz PCM16 in 20 ms frames, base64-bridged to node and forwarded to Polyglot's /meet_bot namespace. - Speaker detection via captions: enables Meet captions via toolbar button (keyboard fallback), observes the aria-label="Captions" region, and extracts speaker names from each caption block's .NWpY1d span. Falls back to legacy data-is-speaking / aria-label signals. - Polyglot ingest: /meet_bot Socket.IO namespace rechunks 320-sample bot frames into CHUNK_SIZE batches, maintains a 500-entry speaker_timeline deque of closed intervals, tracks _active_speaker open intervals, and records the full meeting audio to transcripts/.wav for offline retranscription. 
- Phase 5 resolution: resolve_speaker_identity overlaps each pyannote segment's wall-clock range against the timeline (closed + still-open), picks the majority-vote name if it covers >=30% of the segment, and emits rename_speaker WebSocket events. Resolved names replace SPEAKER_XX labels directly in transcript segments before WS emit. - Speaker-switch batching: when the bot reports a new active speaker, process_audio flushes the current batch so each turn transcribes as one unit (up to BOT_MAX_BATCH_SEC = 60s). - Admin UI: bot status badge, roster panel, and retroactive rename_speaker handler that rewrites SPEAKER_XX labels in-place. - Persistent chrome-profile for Google sign-in cookies, 15s Polyglot connection timeout with auto-reconnect, forced-click fallback for the join button under overlays. Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + app.py | 240 +++++++++++++++++++++++++++++++++-- config.py | 8 ++ meet-bot/.gitignore | 1 + meet-bot/audio.js | 127 +++++++++++++++++++ meet-bot/index.js | 285 ++++++++++++++++++++++++++++++++---------- meet-bot/package.json | 3 +- meet-bot/selectors.js | 37 ++++++ meet-bot/speaker.js | 272 ++++++++++++++++++++++++++++++++++++++++ templates/admin.html | 48 ++++++- 10 files changed, 947 insertions(+), 75 deletions(-) create mode 100644 meet-bot/audio.js create mode 100644 meet-bot/speaker.js diff --git a/.gitignore b/.gitignore index 84ad134..7b1ab87 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,5 @@ __marimo__/ transcript.txt /transcripts polyglot.lock +viewer_password.txt .claude/settings.local.json diff --git a/app.py b/app.py index 6a98f48..0b93f65 100644 --- a/app.py +++ b/app.py @@ -4,6 +4,7 @@ """ import argparse +import collections import os import queue import random @@ -11,6 +12,7 @@ import threading import time import warnings +import wave import webbrowser from concurrent.futures import ThreadPoolExecutor from datetime import datetime @@ -21,9 +23,9 @@ def log(message, tag=None): """Print a log 
message with timestamp. Tag is optional prefix like [SUMMARY]""" ts = datetime.now().strftime('%H:%M:%S') if tag: - print(f"[{ts}] [{tag}] {message}") + print(f"[{ts}] [{tag}] {message}", flush=True) else: - print(f"[{ts}] {message}") + print(f"[{ts}] {message}", flush=True) # Word list for generating memorable passphrases @@ -319,6 +321,30 @@ def cleanup_lock_file(): meeting_start_time = None # Track when meeting started (first transcription) summary_pending = False # Track if a summary generation is waiting +# ── Meet bot state (Phase 4) ────────────────────────────────────────────────── +# Deque of closed speaker segments: (start_ms, end_ms, display_name). +# Populated by the /meet_bot SocketIO namespace; consumed by resolve_speaker_identity(). +speaker_timeline = collections.deque(maxlen=500) +# Known participants: display_name -> first_seen_ms +meet_participants = {} +# Open (unended) speaker intervals: display_name -> start_ms +_active_speaker_starts = {} +# Whether the Playwright bot is currently connected +bot_connected = False +# Accumulation buffer for rechunking 320-sample bot frames → CHUNK_SIZE frames +_bot_pcm_buffer = np.array([], dtype=np.float32) +# Wall-clock ms of the most recently received bot audio frame (for Phase 5 time-alignment) +_last_capture_ts_ms = None +# WAV file writer — captures the full raw 16 kHz mono meeting audio for retranscription +_bot_wav_writer = None +_bot_wav_lock = threading.Lock() +# Speaker-switch batching: who the bot says is currently speaking, and whether the most recent +# speaker_start differs from the previous one (triggers process_audio to flush the current batch). +_current_bot_speaker = None +_pending_speaker_switch = False +# Maximum batch length when bot is connected (gives Whisper more context per speaker turn). +BOT_MAX_BATCH_SEC = 60 + def load_transcript_segments(transcript_path): """Load existing transcript file into all_meeting_segments for summarization. 
@@ -1236,8 +1262,57 @@ def perform_speaker_diarization(audio_data, sample_rate): return None +def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_secs): + """Match pyannote speaker IDs to real names via speaker_timeline overlap. + + Returns {original_pyannote_id: real_name} for speakers with >= 30% time overlap. + Considers both closed (speaker_timeline) and still-open (_active_speaker_starts) + intervals, since a speaker who started talking during this batch may not have + emitted speaker_end yet when the transcription thread kicks off. + """ + if batch_end_ts_ms is None or not speaker_segments: + return {} + + # Build the full set of speaker intervals to consider. + intervals = list(speaker_timeline) # closed: (start_ms, end_ms, name) + now_ms = int(time.time() * 1000) + for name, start_ms in _active_speaker_starts.items(): + intervals.append((start_ms, now_ms, name)) + + if not intervals: + return {} + + from collections import defaultdict + batch_start_ms = batch_end_ts_ms - audio_duration_secs * 1000 + + speaker_times = defaultdict(list) + for seg in speaker_segments: + seg_start_ms = batch_start_ms + seg["start"] * 1000 + seg_end_ms = batch_start_ms + seg["end"] * 1000 + speaker_times[seg["speaker"]].append((seg_start_ms, seg_end_ms)) + + resolved = {} + for original_id, time_ranges in speaker_times.items(): + name_overlap = defaultdict(float) + for seg_start_ms, seg_end_ms in time_ranges: + for (tl_start, tl_end, name) in intervals: + overlap = max(0.0, min(seg_end_ms, tl_end) - max(seg_start_ms, tl_start)) + if overlap > 0: + name_overlap[name] += overlap + + if not name_overlap: + continue + + best_name = max(name_overlap, key=name_overlap.get) + total_ms = sum(end - start for start, end in time_ranges) + if total_ms > 0 and name_overlap[best_name] / total_ms >= 0.30: + resolved[original_id] = best_name + + return resolved + + @torch.inference_mode() -def transcribe_and_translate(audio_data, audio_duration): +def 
transcribe_and_translate(audio_data, audio_duration, batch_end_ts_ms=None): """Background thread for transcription and translation with speaker diarization""" global is_processing, all_meeting_segments @@ -1328,6 +1403,13 @@ def normalize_caps(text): speaker_mapping[original_id] = f"SPEAKER_{speaker_counter:02d}" speaker_counter += 1 + # Phase 5: resolve pyannote IDs → real names from speaker_timeline. + resolved_names = resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration) + if resolved_names: + for orig_id, real_name in resolved_names.items(): + speaker_xx = speaker_mapping.get(orig_id, orig_id) + log(f"Resolved {speaker_xx} → {real_name}", "BOT") + socketio.emit("rename_speaker", {"speaker_id": speaker_xx, "name": real_name}) # Extract all words with timestamps from chunks all_words = [] @@ -1361,7 +1443,9 @@ def normalize_caps(text): if overlap > max_overlap: max_overlap = overlap - best_speaker = speaker_mapping.get(seg["speaker"], seg["speaker"]) + orig = seg["speaker"] + # Prefer resolved real name; fall back to renumbered SPEAKER_XX. + best_speaker = resolved_names.get(orig, speaker_mapping.get(orig, orig)) best_segment_idx = idx words_with_speakers.append({ @@ -1705,10 +1789,22 @@ def process_audio(): buffer.append(chunk) + # Speaker-switch batching (bot mode): when the bot reports a new active + # speaker, flush the current batch so each turn is transcribed as one unit. + # Extend max length to BOT_MAX_BATCH_SEC so one speaker can ramble up to 1 min. 
+ global _pending_speaker_switch + bot_active = bot_connected and _current_bot_speaker is not None + if bot_active: + max_chunks = int(actual_sample_rate * BOT_MAX_BATCH_SEC / CHUNK_SIZE) + + speaker_switched = bot_active and _pending_speaker_switch and len(buffer) >= min_chunks + if speaker_switched: + _pending_speaker_switch = False + # Process when we detect end of sentence (silence after minimum audio) OR max buffer reached silence_detected = len(buffer) >= min_chunks and silence_counter >= audio_thresholds["silence_chunks"] max_length_reached = len(buffer) >= max_chunks - should_process = silence_detected or max_length_reached + should_process = silence_detected or max_length_reached or speaker_switched if should_process: is_processing = True # Set lock @@ -1753,10 +1849,13 @@ def process_audio(): # Calculate audio duration audio_duration = len(audio_resampled) / SAMPLE_RATE + # Snapshot the wall-clock anchor for Phase 5 speaker resolution. + batch_end_ts = _last_capture_ts_ms + # Launch background thread for transcription and translation # This keeps the main loop responsive for WebSocket updates processing_thread = threading.Thread( - target=transcribe_and_translate, args=(audio_resampled, audio_duration), daemon=True + target=transcribe_and_translate, args=(audio_resampled, audio_duration, batch_end_ts), daemon=True ) processing_thread.start() @@ -2201,10 +2300,18 @@ def start_listening_internal(): if not is_listening: is_listening = True - # Start audio stream + # Start the processing thread first (same for both audio sources). audio_thread = threading.Thread(target=process_audio, daemon=True) audio_thread.start() + # Meet-bot path: audio arrives over SocketIO at 16 kHz mono; skip PyAudio entirely. 
+ if Config.AUDIO_SOURCE == "meet_bot": + actual_sample_rate = Config.SAMPLE_RATE # 16000 + num_channels = 1 + print("[AUDIO] Source: Meet bot (waiting for bot to connect and stream audio)") + socketio.emit("status", {"listening": True}) + return + # Initialize PyAudio p_audio = pyaudio.PyAudio() @@ -2491,6 +2598,125 @@ def handle_broadcast_manual_summary(data): emit('manual_summary_broadcast', {'success': True, 'languages_sent': list(translations_cache.keys())}) +# ── Meet bot SocketIO namespace (Phase 4) ──────────────────────────────────── +# +# The Playwright bot connects here as a socket.io-client at /meet_bot. +# It streams two event types: +# audio_frame — binary PCM16 payload + JSON meta {capture_ts_ms, sample_rate, channels} +# speaker_event — JSON {type, name, wall_clock_ms} for speaker_start/end/roster_update + +def _open_bot_wav(): + """Open a WAV file to record the full raw meeting audio from the bot.""" + global _bot_wav_writer + with _bot_wav_lock: + if _bot_wav_writer is not None or not TRANSCRIPT_FILE: + return + wav_path = TRANSCRIPT_FILE.with_suffix(".wav") + try: + _bot_wav_writer = wave.open(str(wav_path), "wb") + _bot_wav_writer.setnchannels(1) + _bot_wav_writer.setsampwidth(2) # int16 + _bot_wav_writer.setframerate(Config.SAMPLE_RATE) + log(f"Recording meeting audio → {wav_path.name}", "BOT") + except Exception as e: + log(f"Failed to open WAV: {e}", "BOT") + _bot_wav_writer = None + + +def _close_bot_wav(): + global _bot_wav_writer + with _bot_wav_lock: + if _bot_wav_writer is None: + return + try: + _bot_wav_writer.close() + log("Meeting audio file closed", "BOT") + except Exception as e: + log(f"Error closing WAV: {e}", "BOT") + _bot_wav_writer = None + + +if Config.MEET_BOT_ENABLED: + + @socketio.on("connect", namespace="/meet_bot") + def bot_connect(): + global bot_connected + bot_connected = True + log("Meet bot connected", "BOT") + _open_bot_wav() + socketio.emit("bot_status", {"connected": True}, room="admin") + + 
@socketio.on("disconnect", namespace="/meet_bot") + def bot_disconnect(): + global bot_connected, _active_speaker_starts + bot_connected = False + # Close any open speaker intervals so timeline stays consistent. + now_ms = int(time.time() * 1000) + for name, start in list(_active_speaker_starts.items()): + speaker_timeline.append((start, now_ms, name)) + _active_speaker_starts.clear() + _close_bot_wav() + log("Meet bot disconnected", "BOT") + socketio.emit("bot_status", {"connected": False}, room="admin") + + @socketio.on("audio_frame", namespace="/meet_bot") + def bot_audio_frame(meta, data): + global _bot_pcm_buffer, _last_capture_ts_ms + if not is_listening: + return + # Persist raw int16 PCM for offline retranscription. + # writeframes (not writeframesraw) patches the header on every write so + # the file is valid even if the server is killed without clean shutdown. + if _bot_wav_writer is not None: + try: + with _bot_wav_lock: + if _bot_wav_writer is not None: + _bot_wav_writer.writeframes(data) + except Exception: + pass + # data arrives as bytes (Int16 PCM, 16 kHz mono, 320 samples = 20 ms). + frame = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0 + _bot_pcm_buffer = np.concatenate([_bot_pcm_buffer, frame]) + # Rechunk to CHUNK_SIZE (1024) so process_audio's duration math is correct. 
+ while len(_bot_pcm_buffer) >= CHUNK_SIZE: + audio_queue.put(_bot_pcm_buffer[:CHUNK_SIZE].copy()) + _bot_pcm_buffer = _bot_pcm_buffer[CHUNK_SIZE:] + _last_capture_ts_ms = meta.get("capture_ts_ms") + + @socketio.on("speaker_event", namespace="/meet_bot") + def bot_speaker_event(ev): + global meet_participants, _active_speaker_starts, _current_bot_speaker, _pending_speaker_switch + ev_type = ev.get("type") + name = ev.get("name") + ts_ms = ev.get("wall_clock_ms", int(time.time() * 1000)) + + if ev_type == "roster_update": + for p in ev.get("participants", []): + if p and p not in meet_participants: + meet_participants[p] = ts_ms + socketio.emit("meet_roster", { + "participants": list(meet_participants.keys()), + "bot_connected": True, + }, room="admin") + return + + if not name: + return + + if ev_type == "speaker_start": + _active_speaker_starts[name] = ts_ms + # Flag a pending switch if the speaker changed — process_audio uses this to flush. + if _current_bot_speaker is not None and _current_bot_speaker != name: + _pending_speaker_switch = True + _current_bot_speaker = name + log(f"Speaking: {name}", "BOT") + + elif ev_type == "speaker_end": + start = _active_speaker_starts.pop(name, ts_ms - 1000) + speaker_timeline.append((start, ts_ms, name)) + log(f"Segment: {name} {ts_ms - start} ms", "BOT") + + if __name__ == "__main__": # Check for single instance before doing anything else check_single_instance() diff --git a/config.py b/config.py index 082569f..8c9485e 100644 --- a/config.py +++ b/config.py @@ -211,6 +211,14 @@ def get_translation_lang_code(cls, iso_code): SAMPLE_RATE = 16000 # Whisper expects 16kHz audio CHUNK_SIZE = 1024 # Audio buffer chunk size + # Audio source — "wasapi" uses the WASAPI loopback device (original path); + # "meet_bot" receives 16 kHz PCM16 from the Playwright bot over SocketIO. + AUDIO_SOURCE = os.getenv("AUDIO_SOURCE", "meet_bot") + + # Meet bot SocketIO receiver. 
The bot connects to /meet_bot on whatever + # port Polyglot is already running on — no separate port needed. + MEET_BOT_ENABLED = os.getenv("MEET_BOT_ENABLED", "True").lower() in ("true", "1", "yes") + # Minimum audio level to process (prevents hallucinations during silence) # If average audio level is below this, skip transcription MIN_AUDIO_LEVEL = 0.01 diff --git a/meet-bot/.gitignore b/meet-bot/.gitignore index 2e6fae9..793052f 100644 --- a/meet-bot/.gitignore +++ b/meet-bot/.gitignore @@ -1,3 +1,4 @@ node_modules/ package-lock.json *.log +chrome-profile/ diff --git a/meet-bot/audio.js b/meet-bot/audio.js new file mode 100644 index 0000000..508caed --- /dev/null +++ b/meet-bot/audio.js @@ -0,0 +1,127 @@ +// Phase 2: in-browser audio capture for the Meet bot. +// +// Two-part design: +// 1. RTC_INIT_SCRIPT — must be registered via context.addInitScript() BEFORE +// page.goto() so it runs before Meet initialises its RTCPeerConnections. +// It patches RTCPeerConnection to funnel every remote audio track into a +// single shared MediaStream (window.__pgStream). +// +// 2. setupAudioCapture(page, onChunk) — called after the bot has joined. +// Injects an AudioWorklet that downsamples all audio in __pgStream to +// 16 kHz mono PCM16, buffers into 20 ms frames, and sends each frame +// back to Node via an exposed function. + +// ── 1. RTC patch (init script) ─────────────────────────────────────────────── + +export const RTC_INIT_SCRIPT = `(function () { + window.__pgStream = new MediaStream(); + const _Orig = window.RTCPeerConnection; + class _Patched extends _Orig { + constructor(...a) { + super(...a); + this.addEventListener('track', (ev) => { + if (ev.track.kind !== 'audio') return; + if (!window.__pgStream.getTrackById(ev.track.id)) + window.__pgStream.addTrack(ev.track); + }); + } + } + window.RTCPeerConnection = _Patched; +})();`; + +// ── 2. 
AudioWorklet processor source ───────────────────────────────────────── +// +// Nearest-neighbour resampler: maintains a fractional index across process() +// calls so downsampling is consistent across block boundaries. +// Buffers output until 320 samples (20 ms @ 16 kHz) are ready, then posts +// { pcm: ArrayBuffer, ts: number } to the main thread. + +const WORKLET_SRC = ` +class PgResampler extends AudioWorkletProcessor { + constructor() { super(); this._idx = 0; this._buf = []; } + + process(inputs) { + const ch = inputs[0]?.[0]; + if (!ch) return true; + + const ratio = sampleRate / 16000; // e.g. 3.0 for 48 kHz input + while (this._idx < ch.length) { + const s = ch[Math.floor(this._idx)]; + this._buf.push(Math.round(Math.max(-1, Math.min(1, s)) * 32767)); + this._idx += ratio; + } + this._idx -= ch.length; // carry fractional offset to next block + + while (this._buf.length >= 320) { + const arr = new Int16Array(this._buf.splice(0, 320)); + this.port.postMessage({ pcm: arr.buffer, ts: Date.now() }, [arr.buffer]); + } + return true; + } +} +registerProcessor('pg-resampler', PgResampler); +`; + +// ── 3. setupAudioCapture ────────────────────────────────────────────────────── +// +// onChunk(pcm: Buffer, captureTs: number) is called for each 20 ms PCM16 frame. +// captureTs is wall-clock ms at the moment the worklet produced the frame — +// used later by resolve_speaker_identity() for time-alignment. + +export async function setupAudioCapture(page, onChunk) { + // Bridge from browser → Node. exposeFunction is safe to call post-navigate. + await page.exposeFunction('__pgChunk', (b64, ts) => { + onChunk(Buffer.from(b64, 'base64'), ts); + }); + + await page.evaluate(async (src) => { + // Inject worklet via blob URL (no local server needed). 
+ const url = URL.createObjectURL(new Blob([src], { type: 'application/javascript' })); + const ctx = new AudioContext(); + await ctx.resume(); // bypass autoplay suspension — bot has no user gesture + await ctx.audioWorklet.addModule(url); + URL.revokeObjectURL(url); + + const node = new AudioWorkletNode(ctx, 'pg-resampler'); + + // Worklet → Node bridge: encode PCM16 ArrayBuffer as base64 string so it + // can cross the Playwright IPC boundary (exposeFunction only handles JSON). + node.port.onmessage = ({ data: { pcm, ts } }) => { + const u8 = new Uint8Array(pcm); + let s = ''; + for (let i = 0; i < u8.length; i++) s += String.fromCharCode(u8[i]); + window.__pgChunk(btoa(s), ts); + }; + + function connectTrack(track) { + // Each track gets its own MediaStreamSource; sharing a single source + // across tracks doesn't work — each source reads one stream. + ctx.createMediaStreamSource(new MediaStream([track])).connect(node); + } + + // Connect tracks already in the shared stream (joined mid-call or after + // participants were already speaking). + window.__pgStream.getAudioTracks().forEach(connectTrack); + + // Connect tracks added after this point (people join late, etc.). + window.__pgStream.addEventListener('addtrack', (e) => { + if (e.track.kind === 'audio') connectTrack(e.track); + }); + + // Fallback: some Meet versions route audio through