From 12dc9b5c02b2fcc58f2feae4a683c71fff590f73 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 21 Apr 2026 05:38:45 +0000
Subject: [PATCH 1/3] Add Meet bot scaffold: anonymous guest join + lobby
 handling

Phase 1 of Google Meet speaker-identity integration. Playwright bot that
joins a Meet URL as an unauthenticated guest and waits to be admitted.
Audio capture, DOM scraping for active speaker, and Polyglot WebSocket
wiring come in subsequent phases.

All Meet DOM selectors are centralized in selectors.js so future UI
rotations are a one-file fix.

https://claude.ai/code/session_019SWkcdJekyEmJqkwSPMbPH
---
 meet-bot/.gitignore   |   3 +
 meet-bot/README.md    |  61 +++++++++++++++++
 meet-bot/index.js     | 155 ++++++++++++++++++++++++++++++++++++++++++
 meet-bot/package.json |  17 +++++
 meet-bot/selectors.js |  51 ++++++++++++++
 5 files changed, 287 insertions(+)
 create mode 100644 meet-bot/.gitignore
 create mode 100644 meet-bot/README.md
 create mode 100644 meet-bot/index.js
 create mode 100644 meet-bot/package.json
 create mode 100644 meet-bot/selectors.js
diff --git a/meet-bot/.gitignore b/meet-bot/.gitignore
new file mode 100644
index 0000000..2e6fae9
--- /dev/null
+++ b/meet-bot/.gitignore
@@ -0,0 +1,3 @@
+node_modules/
+package-lock.json
+*.log
diff --git a/meet-bot/README.md b/meet-bot/README.md
new file mode 100644
index 0000000..12d6416
--- /dev/null
+++ b/meet-bot/README.md
@@ -0,0 +1,61 @@
+# Polyglot Meet Bot
+
+Headless Chromium bot that joins a Google Meet as an anonymous guest. Future phases will stream meeting audio and active-speaker names back to the Polyglot server; this initial phase validates only the join-and-get-admitted flow.
+
+## Setup
+
+```bash
+cd meet-bot
+npm install
+npx playwright install chromium
+```
+
+Node 20+ required.
+
+## Run
+
+```bash
+# Typical use — fully headless:
+node index.js --url "https://meet.google.com/xxx-yyyy-zzz"
+
+# Watch what the bot sees (debug Meet UI issues):
+node index.js --url "https://meet.google.com/xxx-yyyy-zzz" --headful
+
+# Override the displayed name (default "Polyglot Bot"):
+node index.js --url "..." --name "Transcription Bot"
+```
+
+## What it does (phase 1)
+
+1. Launches a fresh, cookieless Chromium — no Google sign-in.
+2. Opens the Meet URL, waits for the pre-join screen.
+3. Fills the "Your name" field, mutes mic + camera, clicks **Ask to join**.
+4. Waits up to 2 minutes for the host to admit it.
+5. Once admitted, stays connected until the meeting ends or it's removed.
+
+Exit codes:
+
+| Code | Meaning |
+|------|---------|
+| 0    | Joined successfully, then meeting ended / bot removed cleanly |
+| 1    | Crash / unexpected error (see stderr) |
+| 2    | Bad CLI arguments |
+| 3    | Host explicitly denied the join request |
+| 4    | Timed out in the lobby (host never admitted) |
+
+## Testing
+
+The easy test: open Meet in a normal browser tab, start a meeting as host, run the bot with `--headful --url <link>`, and admit it from the participants panel when it shows up as "Polyglot Bot". You should see the bot's Chromium window join the call.
+
+## What's NOT here yet
+
+- Audio capture (tab audio → 16 kHz PCM16 → Polyglot WebSocket)
+- DOM scraping of active-speaker name and participant roster
+- WebSocket connection to the Polyglot backend
+- Control channel (join/leave commands from Polyglot's admin UI)
+
+Those land in subsequent phases once we've validated the bot can reliably get into meetings.
+
+## Selectors
+
+All Meet DOM selectors live in `selectors.js`. When Meet ships a UI change and the bot breaks, that's the file to update — nothing else should need touching.
diff --git a/meet-bot/index.js b/meet-bot/index.js
new file mode 100644
index 0000000..e248fa4
--- /dev/null
+++ b/meet-bot/index.js
@@ -0,0 +1,155 @@
+// Polyglot Meet Bot — phase 1: anonymous guest join + lobby wait.
+//
+// Usage:
+//   node index.js --url https://meet.google.com/xxx-yyyy-zzz [--name "Polyglot Bot"] [--headful]
+//
+// Exits 0 on clean leave, non-zero on join failure / denial / crash. Audio
+// capture and DOM speaker-event scraping are not implemented yet — this
+// phase validates the riskiest piece (can we actually get into a meeting?)
+// before wiring anything to Polyglot.
+
+import { chromium } from "playwright";
+import { SELECTORS, firstMatching } from "./selectors.js";
+
+function parseArgs(argv) {
+  const args = { url: null, name: "Polyglot Bot", headful: false };
+  for (let i = 2; i < argv.length; i++) {
+    const a = argv[i];
+    if (a === "--url") args.url = argv[++i];
+    else if (a === "--name") args.name = argv[++i];
+    else if (a === "--headful") args.headful = true;
+  }
+  if (!args.url) {
+    console.error("Usage: node index.js --url <meet-link> [--name <display>] [--headful]");
+    process.exit(2);
+  }
+  return args;
+}
+
+function log(msg, ...rest) {
+  const ts = new Date().toISOString();
+  console.log(`[${ts}] ${msg}`, ...rest);
+}
+
+async function joinMeeting({ url, name, headful }) {
+  log(`Launching Chromium (headful=${headful})`);
+
+  const browser = await chromium.launch({
+    headless: !headful,
+    args: [
+      // Auto-grant mic/cam permission prompts so Meet's pre-join screen
+      // doesn't block. The bot mutes both immediately after join.
+      "--use-fake-ui-for-media-stream",
+      // Some distros need this when running Chromium without a full desktop.
+      "--no-sandbox",
+      "--disable-dev-shm-usage",
+    ],
+  });
+
+  // Fresh context = incognito-equivalent. No cookies, no persistent profile,
+  // no Google sign-in. Meet treats us as an anonymous guest.
+  const context = await browser.newContext({
+    permissions: ["microphone", "camera"],
+  });
+  const page = await context.newPage();
+
+  log(`Navigating to ${url}`);
+  await page.goto(url, { waitUntil: "domcontentloaded" });
+
+  // Fill the "Your name" field. Meet has had multiple implementations of
+  // this input across A/B rollouts, so we try each known selector.
+  log("Waiting for name input");
+  const match = await firstMatching(page, SELECTORS.nameInput, 20000);
+  if (!match) {
+    throw new Error(
+      "Could not find the 'Your name' field. This meeting may require a signed-in Google account."
+    );
+  }
+  log(`Filling name field (${match.selector}) with "${name}"`);
+  await match.element.fill(name);
+
+  // Make sure mic + camera are off *before* joining so we don't blast audio
+  // into the meeting or show a black-tile camera. Meet's pre-join toggles
+  // default to on; we flip them if aria-label indicates they're currently on.
+  for (const [label, sel] of [
+    ["microphone", SELECTORS.micToggle],
+    ["camera", SELECTORS.camToggle],
+  ]) {
+    const btn = await page.$(sel);
+    if (btn) {
+      const aria = (await btn.getAttribute("aria-label")) || "";
+      // Meet writes "Turn off <device>" when currently on, "Turn on <device>"
+      // when currently off. We want them off pre-join.
+      if (/turn off/i.test(aria)) {
+        log(`Muting ${label} (was on)`);
+        await btn.click();
+      }
+    }
+  }
+
+  // Click the join button. Label varies: "Ask to join" (normal guest),
+  // "Join now" (pre-admitted). Try each known label in order.
+  let clicked = false;
+  for (const label of SELECTORS.joinButtonNames) {
+    const btn = page.getByRole("button", { name: label });
+    if (await btn.count()) {
+      log(`Clicking "${label}"`);
+      await btn.first().click();
+      clicked = true;
+      break;
+    }
+  }
+  if (!clicked) throw new Error("Could not find a Join / Ask-to-join button.");
+
+  // Wait for one of three terminal states:
+  //   1. Leave-call button appears -> we're in the meeting.
+  //   2. "Denied" text appears -> host rejected us.
+  //   3. Timeout -> still in lobby, host never admitted.
+  log("Waiting for host to admit from lobby (up to 2 min)…");
+  const inMeeting = await Promise.race([
+    page
+      .waitForSelector(SELECTORS.leaveCallButton, { timeout: 120000 })
+      .then(() => "joined")
+      .catch(() => null),
+    page
+      .waitForFunction(
+        (pattern) => new RegExp(pattern.source, pattern.flags).test(document.body.innerText),
+        { source: SELECTORS.deniedText.source, flags: SELECTORS.deniedText.flags },
+        { timeout: 120000 }
+      )
+      .then(() => "denied")
+      .catch(() => null),
+  ]);
+
+  if (inMeeting === "joined") {
+    log("JOINED — bot is in the meeting.");
+    // Keep the page alive so you can see it in Meet. Phase 2 will add
+    // audio capture + DOM scraping here. For now, stay connected until the
+    // leave button disappears (meeting ended / we were removed) or SIGINT.
+    await page
+      .waitForSelector(SELECTORS.leaveCallButton, { state: "detached", timeout: 0 })
+      .catch(() => {});
+    log("Leave-call button gone — meeting ended or bot was removed.");
+    return 0;
+  }
+
+  if (inMeeting === "denied") {
+    log("DENIED — host rejected the join request.");
+    return 3;
+  }
+
+  log("TIMED OUT in lobby (2 min) — host did not admit.");
+  return 4;
+}
+
+(async () => {
+  const args = parseArgs(process.argv);
+  let code = 1;
+  try {
+    code = await joinMeeting(args);
+  } catch (err) {
+    log("ERROR:", err.message);
+    code = 1;
+  }
+  process.exit(code);
+})();
diff --git a/meet-bot/package.json b/meet-bot/package.json
new file mode 100644
index 0000000..26f9240
--- /dev/null
+++ b/meet-bot/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "polyglot-meet-bot",
+  "version": "0.1.0",
+  "description": "Headless Chromium bot that joins a Google Meet as an anonymous guest and streams audio + speaker identity back to Polyglot.",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "start": "node index.js",
+    "install-browsers": "playwright install chromium"
+  },
+  "engines": {
+    "node": ">=20"
+  },
+  "dependencies": {
+    "playwright": "^1.47.0"
+  }
+}
diff --git a/meet-bot/selectors.js b/meet-bot/selectors.js
new file mode 100644
index 0000000..2d60dfa
--- /dev/null
+++ b/meet-bot/selectors.js
@@ -0,0 +1,51 @@
+// Google Meet DOM selectors.
+//
+// Meet's CSS classes are obfuscated and rotate. This file centralizes every
+// selector the bot relies on so a UI change is a one-file fix. Prefer stable
+// anchors (aria-label, role, visible text) over class names.
+
+export const SELECTORS = {
+  // Pre-join screen (anonymous guest path) -------------------------------
+  // The "Your name" input shown to signed-out users on the Meet landing page
+  // before joining. Meet has used multiple implementations; try in order.
+  nameInput: [
+    'input[aria-label="Your name"]',
+    'input[placeholder="Your name"]',
+    'input[jsname][type="text"]',
+  ],
+
+  // The button that submits the pre-join form. Its label depends on meeting
+  // config: "Ask to join" when the host hasn't admitted you, "Join now" when
+  // you're the host or pre-admitted. Match on visible text via Playwright's
+  // getByRole('button', { name: ... }) at call sites — selector here is a
+  // fallback for the ARIA role.
+  joinButtonNames: ["Ask to join", "Join now", "Join"],
+
+  // Pre-join sometimes prompts to turn off mic/cam — these buttons toggle
+  // them. Anchored on aria-label which Meet has kept stable for years.
+  micToggle: 'div[role="button"][aria-label*="microphone" i]',
+  camToggle: 'div[role="button"][aria-label*="camera" i]',
+
+  // In-call indicators ---------------------------------------------------
+  // Presence of the leave-call button is the most reliable "we are in the
+  // meeting" signal. Its aria-label is "Leave call".
+  leaveCallButton: 'button[aria-label="Leave call"]',
+
+  // Lobby / denial detection. When the host denies entry, Meet shows a
+  // message containing this text.
+  deniedText: /You can't join this call|no one responded|denied/i,
+};
+
+// Helper: return the first selector from a list that matches something on
+// the page. Used for resilient element lookup when Meet ships A/B variants.
+export async function firstMatching(page, selectorList, timeoutMs = 15000) {
+  const deadline = Date.now() + timeoutMs;
+  while (Date.now() < deadline) {
+    for (const sel of selectorList) {
+      const el = await page.$(sel);
+      if (el) return { selector: sel, element: el };
+    }
+    await page.waitForTimeout(250);
+  }
+  return null;
+}

From f7044f829ae4e7af385e29de8481634bb212cba9 Mon Sep 17 00:00:00 2001
From: Chabert Etienne <etienne.chabert@gmail.com>
Date: Tue, 21 Apr 2026 11:21:17 +0200
Subject: [PATCH 2/3] Wire Meet bot through Polyglot with captions-based
 speaker identity

Complete end-to-end pipeline: the bot joins a Meet call, streams audio
to Polyglot over Socket.IO, and resolves pyannote's SPEAKER_XX labels
to real display names by overlapping diarization against a wall-clock
speaker timeline built from Meet's live-captions DOM.

- Bot audio capture: RTCPeerConnection init-script taps all remote
  audio tracks into __pgStream; AudioWorklet resamples to 16 kHz PCM16
  in 20 ms frames, base64-bridged to node and forwarded to Polyglot's
  /meet_bot namespace.
- Speaker detection via captions: enables Meet captions via toolbar
  button (keyboard fallback), observes the aria-label="Captions" region,
  and extracts speaker names from each caption block's .NWpY1d span.
  Falls back to legacy data-is-speaking / aria-label signals.
- Polyglot ingest: /meet_bot Socket.IO namespace rechunks 320-sample
  bot frames into CHUNK_SIZE batches, maintains a 500-entry
  speaker_timeline deque of closed intervals, tracks still-open
  intervals in _active_speaker_starts, and records the full meeting
  audio to transcripts/<name>.wav for offline retranscription.
- Phase 5 resolution: resolve_speaker_identity overlaps each pyannote
  segment's wall-clock range against the timeline (closed + still-open),
  sums the overlap per pyannote speaker, and picks the name with the
  most overlap if it covers >=30% of that speaker's total segment time.
  transcribe_and_translate then emits a rename_speaker WebSocket event
  for each match. Resolved names replace SPEAKER_XX labels directly in
  transcript segments before WS emit.
- Speaker-switch batching: when the bot reports a new active speaker,
  process_audio flushes the current batch so each turn transcribes as
  one unit (up to BOT_MAX_BATCH_SEC = 60s).
- Admin UI: bot status badge, roster panel, and retroactive
  rename_speaker handler that rewrites SPEAKER_XX labels in-place.
- Persistent chrome-profile for Google sign-in cookies, 15s Polyglot
  connection timeout with auto-reconnect, forced-click fallback for
  the join button under overlays.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .gitignore            |   1 +
 app.py                | 240 +++++++++++++++++++++++++++++++++--
 config.py             |   8 ++
 meet-bot/.gitignore   |   1 +
 meet-bot/audio.js     | 127 +++++++++++++++++++
 meet-bot/index.js     | 285 ++++++++++++++++++++++++++++++++----------
 meet-bot/package.json |   3 +-
 meet-bot/selectors.js |  37 ++++++
 meet-bot/speaker.js   | 272 ++++++++++++++++++++++++++++++++++++++++
 templates/admin.html  |  48 ++++++-
 10 files changed, 947 insertions(+), 75 deletions(-)
 create mode 100644 meet-bot/audio.js
 create mode 100644 meet-bot/speaker.js

diff --git a/.gitignore b/.gitignore
index 84ad134..7b1ab87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -208,4 +208,5 @@ __marimo__/
 transcript.txt
 /transcripts
 polyglot.lock
+viewer_password.txt
 .claude/settings.local.json
diff --git a/app.py b/app.py
index 6a98f48..0b93f65 100644
--- a/app.py
+++ b/app.py
@@ -4,6 +4,7 @@
 """
 
 import argparse
+import collections
 import os
 import queue
 import random
@@ -11,6 +12,7 @@
 import threading
 import time
 import warnings
+import wave
 import webbrowser
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
@@ -21,9 +23,9 @@ def log(message, tag=None):
     """Print a log message with timestamp. Tag is optional prefix like [SUMMARY]"""
     ts = datetime.now().strftime('%H:%M:%S')
     if tag:
-        print(f"[{ts}] [{tag}] {message}")
+        print(f"[{ts}] [{tag}] {message}", flush=True)
     else:
-        print(f"[{ts}] {message}")
+        print(f"[{ts}] {message}", flush=True)
 
 
 # Word list for generating memorable passphrases
@@ -319,6 +321,30 @@ def cleanup_lock_file():
 meeting_start_time = None  # Track when meeting started (first transcription)
 summary_pending = False  # Track if a summary generation is waiting
 
+# ── Meet bot state (Phase 4) ──────────────────────────────────────────────────
+# Deque of closed speaker segments: (start_ms, end_ms, display_name).
+# Populated by the /meet_bot SocketIO namespace; consumed by resolve_speaker_identity().
+speaker_timeline = collections.deque(maxlen=500)
+# Known participants: display_name -> first_seen_ms
+meet_participants = {}
+# Open (unended) speaker intervals: display_name -> start_ms
+_active_speaker_starts = {}
+# Whether the Playwright bot is currently connected
+bot_connected = False
+# Accumulation buffer for rechunking 320-sample bot frames → CHUNK_SIZE frames
+_bot_pcm_buffer = np.array([], dtype=np.float32)
+# Wall-clock ms of the most recently received bot audio frame (for Phase 5 time-alignment)
+_last_capture_ts_ms = None
+# WAV file writer — captures the full raw 16 kHz mono meeting audio for retranscription
+_bot_wav_writer = None
+_bot_wav_lock = threading.Lock()
+# Speaker-switch batching: who the bot says is currently speaking, and whether the most recent
+# speaker_start differs from the previous one (triggers process_audio to flush the current batch).
+_current_bot_speaker = None
+_pending_speaker_switch = False
+# Maximum batch length when bot is connected (gives Whisper more context per speaker turn).
+BOT_MAX_BATCH_SEC = 60
+
 
 def load_transcript_segments(transcript_path):
     """Load existing transcript file into all_meeting_segments for summarization.
@@ -1236,8 +1262,57 @@ def perform_speaker_diarization(audio_data, sample_rate):
         return None
 
 
+def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_secs):
+    """Match pyannote speaker IDs to real names via speaker_timeline overlap.
+
+    Returns {original_pyannote_id: real_name} for speakers with >= 30% time overlap.
+    Considers both closed (speaker_timeline) and still-open (_active_speaker_starts)
+    intervals, since a speaker who started talking during this batch may not have
+    emitted speaker_end yet when the transcription thread kicks off.
+    """
+    if batch_end_ts_ms is None or not speaker_segments:
+        return {}
+
+    # Build the full set of speaker intervals to consider.
+    intervals = list(speaker_timeline)  # closed: (start_ms, end_ms, name)
+    now_ms = int(time.time() * 1000)
+    for name, start_ms in _active_speaker_starts.items():
+        intervals.append((start_ms, now_ms, name))
+
+    if not intervals:
+        return {}
+
+    from collections import defaultdict
+    batch_start_ms = batch_end_ts_ms - audio_duration_secs * 1000
+
+    speaker_times = defaultdict(list)
+    for seg in speaker_segments:
+        seg_start_ms = batch_start_ms + seg["start"] * 1000
+        seg_end_ms = batch_start_ms + seg["end"] * 1000
+        speaker_times[seg["speaker"]].append((seg_start_ms, seg_end_ms))
+
+    resolved = {}
+    for original_id, time_ranges in speaker_times.items():
+        name_overlap = defaultdict(float)
+        for seg_start_ms, seg_end_ms in time_ranges:
+            for (tl_start, tl_end, name) in intervals:
+                overlap = max(0.0, min(seg_end_ms, tl_end) - max(seg_start_ms, tl_start))
+                if overlap > 0:
+                    name_overlap[name] += overlap
+
+        if not name_overlap:
+            continue
+
+        best_name = max(name_overlap, key=name_overlap.get)
+        total_ms = sum(end - start for start, end in time_ranges)
+        if total_ms > 0 and name_overlap[best_name] / total_ms >= 0.30:
+            resolved[original_id] = best_name
+
+    return resolved
+
+
 @torch.inference_mode()
-def transcribe_and_translate(audio_data, audio_duration):
+def transcribe_and_translate(audio_data, audio_duration, batch_end_ts_ms=None):
     """Background thread for transcription and translation with speaker diarization"""
     global is_processing, all_meeting_segments
 
@@ -1328,6 +1403,13 @@ def normalize_caps(text):
                     speaker_mapping[original_id] = f"SPEAKER_{speaker_counter:02d}"
                     speaker_counter += 1
 
+            # Phase 5: resolve pyannote IDs → real names from speaker_timeline.
+            resolved_names = resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration)
+            if resolved_names:
+                for orig_id, real_name in resolved_names.items():
+                    speaker_xx = speaker_mapping.get(orig_id, orig_id)
+                    log(f"Resolved {speaker_xx} → {real_name}", "BOT")
+                    socketio.emit("rename_speaker", {"speaker_id": speaker_xx, "name": real_name})
 
             # Extract all words with timestamps from chunks
             all_words = []
@@ -1361,7 +1443,9 @@ def normalize_caps(text):
 
                     if overlap > max_overlap:
                         max_overlap = overlap
-                        best_speaker = speaker_mapping.get(seg["speaker"], seg["speaker"])
+                        orig = seg["speaker"]
+                        # Prefer resolved real name; fall back to renumbered SPEAKER_XX.
+                        best_speaker = resolved_names.get(orig, speaker_mapping.get(orig, orig))
                         best_segment_idx = idx
 
                 words_with_speakers.append({
@@ -1705,10 +1789,22 @@ def process_audio():
 
             buffer.append(chunk)
 
+            # Speaker-switch batching (bot mode): when the bot reports a new active
+            # speaker, flush the current batch so each turn is transcribed as one unit.
+            # Extend max length to BOT_MAX_BATCH_SEC so one speaker can ramble up to 1 min.
+            global _pending_speaker_switch
+            bot_active = bot_connected and _current_bot_speaker is not None
+            if bot_active:
+                max_chunks = int(actual_sample_rate * BOT_MAX_BATCH_SEC / CHUNK_SIZE)
+
+            speaker_switched = bot_active and _pending_speaker_switch and len(buffer) >= min_chunks
+            if speaker_switched:
+                _pending_speaker_switch = False
+
             # Process when we detect end of sentence (silence after minimum audio) OR max buffer reached
             silence_detected = len(buffer) >= min_chunks and silence_counter >= audio_thresholds["silence_chunks"]
             max_length_reached = len(buffer) >= max_chunks
-            should_process = silence_detected or max_length_reached
+            should_process = silence_detected or max_length_reached or speaker_switched
 
             if should_process:
                 is_processing = True  # Set lock
@@ -1753,10 +1849,13 @@ def process_audio():
                 # Calculate audio duration
                 audio_duration = len(audio_resampled) / SAMPLE_RATE
 
+                # Snapshot the wall-clock anchor for Phase 5 speaker resolution.
+                batch_end_ts = _last_capture_ts_ms
+
                 # Launch background thread for transcription and translation
                 # This keeps the main loop responsive for WebSocket updates
                 processing_thread = threading.Thread(
-                    target=transcribe_and_translate, args=(audio_resampled, audio_duration), daemon=True
+                    target=transcribe_and_translate, args=(audio_resampled, audio_duration, batch_end_ts), daemon=True
                 )
                 processing_thread.start()
 
@@ -2201,10 +2300,18 @@ def start_listening_internal():
     if not is_listening:
         is_listening = True
 
-        # Start audio stream
+        # Start the processing thread first (same for both audio sources).
         audio_thread = threading.Thread(target=process_audio, daemon=True)
         audio_thread.start()
 
+        # Meet-bot path: audio arrives over SocketIO at 16 kHz mono; skip PyAudio entirely.
+        if Config.AUDIO_SOURCE == "meet_bot":
+            actual_sample_rate = Config.SAMPLE_RATE  # 16000
+            num_channels = 1
+            print("[AUDIO] Source: Meet bot (waiting for bot to connect and stream audio)")
+            socketio.emit("status", {"listening": True})
+            return
+
         # Initialize PyAudio
         p_audio = pyaudio.PyAudio()
 
@@ -2491,6 +2598,125 @@ def handle_broadcast_manual_summary(data):
     emit('manual_summary_broadcast', {'success': True, 'languages_sent': list(translations_cache.keys())})
 
 
+# ── Meet bot SocketIO namespace (Phase 4) ────────────────────────────────────
+#
+# The Playwright bot connects here as a socket.io-client at /meet_bot.
+# It streams two event types:
+#   audio_frame  — binary PCM16 payload + JSON meta {capture_ts_ms, sample_rate, channels}
+#   speaker_event — JSON {type, name, wall_clock_ms} for speaker_start/end/roster_update
+
+def _open_bot_wav():
+    """Open a WAV file to record the full raw meeting audio from the bot."""
+    global _bot_wav_writer
+    with _bot_wav_lock:
+        if _bot_wav_writer is not None or not TRANSCRIPT_FILE:
+            return
+        wav_path = TRANSCRIPT_FILE.with_suffix(".wav")
+        try:
+            _bot_wav_writer = wave.open(str(wav_path), "wb")
+            _bot_wav_writer.setnchannels(1)
+            _bot_wav_writer.setsampwidth(2)  # int16
+            _bot_wav_writer.setframerate(Config.SAMPLE_RATE)
+            log(f"Recording meeting audio → {wav_path.name}", "BOT")
+        except Exception as e:
+            log(f"Failed to open WAV: {e}", "BOT")
+            _bot_wav_writer = None
+
+
+def _close_bot_wav():
+    global _bot_wav_writer
+    with _bot_wav_lock:
+        if _bot_wav_writer is None:
+            return
+        try:
+            _bot_wav_writer.close()
+            log("Meeting audio file closed", "BOT")
+        except Exception as e:
+            log(f"Error closing WAV: {e}", "BOT")
+        _bot_wav_writer = None
+
+
+if Config.MEET_BOT_ENABLED:
+
+    @socketio.on("connect", namespace="/meet_bot")
+    def bot_connect():
+        global bot_connected
+        bot_connected = True
+        log("Meet bot connected", "BOT")
+        _open_bot_wav()
+        socketio.emit("bot_status", {"connected": True}, room="admin")
+
+    @socketio.on("disconnect", namespace="/meet_bot")
+    def bot_disconnect():
+        global bot_connected, _active_speaker_starts
+        bot_connected = False
+        # Close any open speaker intervals so timeline stays consistent.
+        now_ms = int(time.time() * 1000)
+        for name, start in list(_active_speaker_starts.items()):
+            speaker_timeline.append((start, now_ms, name))
+        _active_speaker_starts.clear()
+        _close_bot_wav()
+        log("Meet bot disconnected", "BOT")
+        socketio.emit("bot_status", {"connected": False}, room="admin")
+
+    @socketio.on("audio_frame", namespace="/meet_bot")
+    def bot_audio_frame(meta, data):
+        global _bot_pcm_buffer, _last_capture_ts_ms
+        if not is_listening:
+            return
+        # Persist raw int16 PCM for offline retranscription.
+        # writeframes (not writeframesraw) patches the header on every write so
+        # the file is valid even if the server is killed without clean shutdown.
+        if _bot_wav_writer is not None:
+            try:
+                with _bot_wav_lock:
+                    if _bot_wav_writer is not None:
+                        _bot_wav_writer.writeframes(data)
+            except Exception:
+                pass
+        # data arrives as bytes (Int16 PCM, 16 kHz mono, 320 samples = 20 ms).
+        frame = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
+        _bot_pcm_buffer = np.concatenate([_bot_pcm_buffer, frame])
+        # Rechunk to CHUNK_SIZE (1024) so process_audio's duration math is correct.
+        while len(_bot_pcm_buffer) >= CHUNK_SIZE:
+            audio_queue.put(_bot_pcm_buffer[:CHUNK_SIZE].copy())
+            _bot_pcm_buffer = _bot_pcm_buffer[CHUNK_SIZE:]
+        _last_capture_ts_ms = meta.get("capture_ts_ms")
+
+    @socketio.on("speaker_event", namespace="/meet_bot")
+    def bot_speaker_event(ev):
+        global meet_participants, _active_speaker_starts, _current_bot_speaker, _pending_speaker_switch
+        ev_type = ev.get("type")
+        name = ev.get("name")
+        ts_ms = ev.get("wall_clock_ms", int(time.time() * 1000))
+
+        if ev_type == "roster_update":
+            for p in ev.get("participants", []):
+                if p and p not in meet_participants:
+                    meet_participants[p] = ts_ms
+            socketio.emit("meet_roster", {
+                "participants": list(meet_participants.keys()),
+                "bot_connected": True,
+            }, room="admin")
+            return
+
+        if not name:
+            return
+
+        if ev_type == "speaker_start":
+            _active_speaker_starts[name] = ts_ms
+            # Flag a pending switch if the speaker changed — process_audio uses this to flush.
+            if _current_bot_speaker is not None and _current_bot_speaker != name:
+                _pending_speaker_switch = True
+            _current_bot_speaker = name
+            log(f"Speaking: {name}", "BOT")
+
+        elif ev_type == "speaker_end":
+            start = _active_speaker_starts.pop(name, ts_ms - 1000)
+            speaker_timeline.append((start, ts_ms, name))
+            log(f"Segment: {name} {ts_ms - start} ms", "BOT")
+
+
 if __name__ == "__main__":
     # Check for single instance before doing anything else
     check_single_instance()
diff --git a/config.py b/config.py
index 082569f..8c9485e 100644
--- a/config.py
+++ b/config.py
@@ -211,6 +211,14 @@ def get_translation_lang_code(cls, iso_code):
     SAMPLE_RATE = 16000  # Whisper expects 16kHz audio
     CHUNK_SIZE = 1024  # Audio buffer chunk size
 
+    # Audio source — "wasapi" uses the WASAPI loopback device (original path);
+    # "meet_bot" receives 16 kHz PCM16 from the Playwright bot over SocketIO.
+    AUDIO_SOURCE = os.getenv("AUDIO_SOURCE", "meet_bot")
+
+    # Meet bot SocketIO receiver.  The bot connects to /meet_bot on whatever
+    # port Polyglot is already running on — no separate port needed.
+    MEET_BOT_ENABLED = os.getenv("MEET_BOT_ENABLED", "True").lower() in ("true", "1", "yes")
+
     # Minimum audio level to process (prevents hallucinations during silence)
     # If average audio level is below this, skip transcription
     MIN_AUDIO_LEVEL = 0.01
diff --git a/meet-bot/.gitignore b/meet-bot/.gitignore
index 2e6fae9..793052f 100644
--- a/meet-bot/.gitignore
+++ b/meet-bot/.gitignore
@@ -1,3 +1,4 @@
 node_modules/
 package-lock.json
 *.log
+chrome-profile/
diff --git a/meet-bot/audio.js b/meet-bot/audio.js
new file mode 100644
index 0000000..508caed
--- /dev/null
+++ b/meet-bot/audio.js
@@ -0,0 +1,127 @@
+// Phase 2: in-browser audio capture for the Meet bot.
+//
+// Two-part design:
+//   1. RTC_INIT_SCRIPT — must be registered via context.addInitScript() BEFORE
+//      page.goto() so it runs before Meet initialises its RTCPeerConnections.
+//      It patches RTCPeerConnection to funnel every remote audio track into a
+//      single shared MediaStream (window.__pgStream).
+//
+//   2. setupAudioCapture(page, onChunk) — called after the bot has joined.
+//      Injects an AudioWorklet that downsamples all audio in __pgStream to
+//      16 kHz mono PCM16, buffers into 20 ms frames, and sends each frame
+//      back to Node via an exposed function.
+
+// ── 1. RTC patch (init script) ───────────────────────────────────────────────
+// A script-side addTrack() fires no 'addtrack' event, so the patch dispatches one itself.
+export const RTC_INIT_SCRIPT = `(function () {
+  window.__pgStream = new MediaStream();
+  const _Orig = window.RTCPeerConnection;
+  class _Patched extends _Orig {
+    constructor(...a) {
+      super(...a);
+      this.addEventListener('track', ({ track }) => {
+        if (track.kind !== 'audio' || window.__pgStream.getTrackById(track.id)) return;
+        window.__pgStream.addTrack(track);
+        window.__pgStream.dispatchEvent(new MediaStreamTrackEvent('addtrack', { track }));
+      });
+    }
+  }
+  window.RTCPeerConnection = _Patched;
+})();`;
+
+// ── 2. AudioWorklet processor source ─────────────────────────────────────────
+// Nearest-neighbour downsampler to 16 kHz: it picks the input sample under a
+// fractional read index and carries that index across process() calls so
+// block boundaries do not disturb the stride. Only channel 0 of input 0 is
+// read. Samples are clamped to [-1, 1] and scaled to PCM16, then posted as
+// { pcm: ArrayBuffer, ts: number } in 320-sample frames (20 ms @ 16 kHz).
+
+const WORKLET_SRC = `
+class PgResampler extends AudioWorkletProcessor {
+  constructor() { super(); this._idx = 0; this._buf = []; }
+
+  process(inputs) {
+    const ch = inputs[0]?.[0];
+    if (!ch) return true;
+
+    const ratio = sampleRate / 16000; // e.g. 3.0 for 48 kHz input
+    while (this._idx < ch.length) {
+      const s = ch[Math.floor(this._idx)];
+      this._buf.push(Math.round(Math.max(-1, Math.min(1, s)) * 32767));
+      this._idx += ratio;
+    }
+    this._idx -= ch.length; // carry fractional offset to next block
+
+    while (this._buf.length >= 320) {
+      const arr = new Int16Array(this._buf.splice(0, 320));
+      this.port.postMessage({ pcm: arr.buffer, ts: Date.now() }, [arr.buffer]);
+    }
+    return true;
+  }
+}
+registerProcessor('pg-resampler', PgResampler);
+`;
+
+// ── 3. setupAudioCapture ──────────────────────────────────────────────────────
+//
+// onChunk(pcm: Buffer, captureTs: number) is called for each 20 ms PCM16 frame.
+// captureTs is wall-clock ms at the moment the worklet produced the frame —
+// used later by resolve_speaker_identity() for time-alignment.
+
+export async function setupAudioCapture(page, onChunk) {
+  // Node-side receiver: the page hands over each frame as a base64 string,
+  // which is decoded back into a Buffer before reaching onChunk.
+  const receiveFrame = (b64, ts) => { onChunk(Buffer.from(b64, 'base64'), ts); };
+  await page.exposeFunction('__pgChunk', receiveFrame);
+
+  await page.evaluate(async (src) => {
+    // Load the processor from a blob URL so no local server is required.
+    const blob = new Blob([src], { type: 'application/javascript' });
+    const moduleUrl = URL.createObjectURL(blob);
+    const audioCtx = new AudioContext();
+    await audioCtx.resume(); // the bot never produces a user gesture, so resume explicitly
+    await audioCtx.audioWorklet.addModule(moduleUrl);
+    URL.revokeObjectURL(moduleUrl);
+    const resampler = new AudioWorkletNode(audioCtx, 'pg-resampler');
+
+    // Worklet → Node bridge: frames arrive as ArrayBuffers and are forwarded
+    // as base64 strings, one char per byte, through the exposed __pgChunk.
+    resampler.port.onmessage = (msg) => {
+      const { pcm, ts } = msg.data;
+      const binary = String.fromCharCode(...new Uint8Array(pcm));
+      window.__pgChunk(btoa(binary), ts);
+    };
+
+    // Every track is wrapped in its own single-track MediaStream and given
+    // its own source node feeding the shared resampler.
+    const connectTrack = (track) => {
+      const solo = new MediaStream([track]);
+      audioCtx.createMediaStreamSource(solo).connect(resampler);
+    };
+
+    // Tracks that were already collected before capture started…
+    for (const track of window.__pgStream.getAudioTracks()) connectTrack(track);
+    // …and tracks announced later through the stream's 'addtrack' event.
+    window.__pgStream.addEventListener('addtrack', ({ track }) => {
+      if (track.kind !== 'audio') return;
+      connectTrack(track);
+    });
+
+    // Fallback for Meet builds that play audio through <audio> elements:
+    // tap each element once, marking it with _pg so it is never re-tapped.
+    const connectEl = (el) => {
+      if (el._pg) return;
+      el._pg = true;
+      try {
+        const elSource = audioCtx.createMediaElementSource(el);
+        elSource.connect(resampler);
+        elSource.connect(audioCtx.destination); // keep the original playback alive
+      } catch (_) { /* element may already be claimed */ }
+    };
+    const tapAllAudioEls = () => { for (const el of document.querySelectorAll('audio')) connectEl(el); };
+    tapAllAudioEls();
+    const domWatcher = new MutationObserver(tapAllAudioEls);
+    domWatcher.observe(document.documentElement, { childList: true, subtree: true });
+
+  }, WORKLET_SRC);
+}
diff --git a/meet-bot/index.js b/meet-bot/index.js
index e248fa4..7a02f63 100644
--- a/meet-bot/index.js
+++ b/meet-bot/index.js
@@ -1,26 +1,53 @@
-// Polyglot Meet Bot — phase 1: anonymous guest join + lobby wait.
+// Polyglot Meet Bot — phases 1–3: join + audio capture + speaker detection.
 //
 // Usage:
-//   node index.js --url https://meet.google.com/xxx-yyyy-zzz [--name "Polyglot Bot"] [--headful]
+//   node index.js --url https://meet.google.com/xxx-yyyy-zzz \
+//                 [--name "Polyglot Bot"] \
+//                 [--headful] \
+//                 [--polyglot-url http://localhost:5001] \
+//                 [--profile-dir <path>]   (default: ./chrome-profile next to index.js)
 //
-// Exits 0 on clean leave, non-zero on join failure / denial / crash. Audio
-// capture and DOM speaker-event scraping are not implemented yet — this
-// phase validates the riskiest piece (can we actually get into a meeting?)
-// before wiring anything to Polyglot.
+// Exit codes: 0 clean leave, 1 crash, 2 bad args, 3 denied by host, 4 lobby timeout,
+//             6 blocked (bot detected / meeting locked / link invalid).
+//
+// Persistent profile: the bot reuses a Chrome profile across runs so Google
+// sign-in cookies survive. On first run the user logs in manually in the
+// headful window; subsequent runs are already authenticated.
 
+import os from "os";
+import path from "path";
+import { fileURLToPath } from "url";
 import { chromium } from "playwright";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
 import { SELECTORS, firstMatching } from "./selectors.js";
+import { RTC_INIT_SCRIPT, setupAudioCapture } from "./audio.js";
+import { setupSpeakerDetection } from "./speaker.js";
+
+// Dedicated bot profile — separate from the user's real Chrome so there's no
+// instance conflict. Stored inside meet-bot/ so it's self-contained.
+const DEFAULT_PROFILE = path.join(__dirname, "chrome-profile");
 
 function parseArgs(argv) {
-  const args = { url: null, name: "Polyglot Bot", headful: false };
+  const args = {
+    url: null,
+    name: "Polyglot Bot",
+    headful: false,
+    polyglotUrl: null,
+    profileDir: DEFAULT_PROFILE,
+  };
   for (let i = 2; i < argv.length; i++) {
     const a = argv[i];
     if (a === "--url") args.url = argv[++i];
     else if (a === "--name") args.name = argv[++i];
     else if (a === "--headful") args.headful = true;
+    else if (a === "--polyglot-url") args.polyglotUrl = argv[++i];
+    else if (a === "--profile-dir") args.profileDir = argv[++i];
   }
   if (!args.url) {
-    console.error("Usage: node index.js --url <meet-link> [--name <display>] [--headful]");
+    console.error(
+      "Usage: node index.js --url <meet-link> [--name <display>] [--headful] [--polyglot-url <url>] [--profile-dir <path>]"
+    );
     process.exit(2);
   }
   return args;
@@ -31,80 +58,159 @@ function log(msg, ...rest) {
   console.log(`[${ts}] ${msg}`, ...rest);
 }
 
-async function joinMeeting({ url, name, headful }) {
-  log(`Launching Chromium (headful=${headful})`);
+// Connect to Polyglot's /meet_bot SocketIO namespace.
+// Returns { sendAudio, sendEvent } or null if not configured / unavailable.
+async function connectPolyglot(polyglotUrl) {
+  if (!polyglotUrl) return null;
+  let socket = null; // declared outside try so the failure path can shut it down
+  try {
+    const { io } = await import("socket.io-client");
+    // Socket.IO reconnects automatically; the 15 s handshake window leaves room for lazy namespace registration.
+    socket = io(`${polyglotUrl}/meet_bot`, {
+      transports: ["websocket"],
+      reconnection: true,
+      reconnectionDelay: 500,
+      reconnectionDelayMax: 2000,
+    });
+    await new Promise((resolve, reject) => {
+      const timer = setTimeout(() => reject(new Error("connect timeout")), 15000);
+      socket.once("connect", () => { clearTimeout(timer); resolve(); });
+    });
+    log(`Connected to Polyglot at ${polyglotUrl}/meet_bot`);
+    return {
+      sendAudio: (pcm, captureTs) =>
+        socket.emit("audio_frame", { capture_ts_ms: captureTs, sample_rate: 16000, channels: 1 }, pcm),
+      sendEvent: (ev) => socket.emit("speaker_event", ev),
+    };
+  } catch (err) {
+    socket?.disconnect(); // otherwise the abandoned socket keeps reconnecting in the background
+    log(`WARN: Could not connect to Polyglot (${err.message}). Audio will not be forwarded.`);
+    return null;
+  }
+}
+
+async function joinMeeting({ url, name, headful, polyglotUrl, profileDir }) {
+  log(`Launching Chrome (headful=${headful}, profile=${profileDir})`);
 
-  const browser = await chromium.launch({
+  // launchPersistentContext keeps cookies/localStorage across runs — the user
+  // signs in once and subsequent runs are already authenticated.
+  const context = await chromium.launchPersistentContext(profileDir, {
     headless: !headful,
+    channel: "chrome",
     args: [
-      // Auto-grant mic/cam permission prompts so Meet's pre-join screen
-      // doesn't block. The bot mutes both immediately after join.
       "--use-fake-ui-for-media-stream",
-      // Some distros need this when running Chromium without a full desktop.
+      "--disable-blink-features=AutomationControlled",
+      "--autoplay-policy=no-user-gesture-required",
       "--no-sandbox",
       "--disable-dev-shm-usage",
     ],
+    permissions: ["microphone", "camera"],
   });
 
-  // Fresh context = incognito-equivalent. No cookies, no persistent profile,
-  // no Google sign-in. Meet treats us as an anonymous guest.
-  const context = await browser.newContext({
-    permissions: ["microphone", "camera"],
+  // Stealth: unset navigator.webdriver before any page JS runs.
+  await context.addInitScript(() => {
+    Object.defineProperty(navigator, "webdriver", { get: () => undefined });
   });
+
+  // Phase 2: patch RTCPeerConnection before Meet's JS initialises WebRTC.
+  await context.addInitScript(RTC_INIT_SCRIPT);
+
   const page = await context.newPage();
 
   log(`Navigating to ${url}`);
   await page.goto(url, { waitUntil: "domcontentloaded" });
 
-  // Fill the "Your name" field. Meet has had multiple implementations of
-  // this input across A/B rollouts, so we try each known selector.
-  log("Waiting for name input");
-  const match = await firstMatching(page, SELECTORS.nameInput, 20000);
-  if (!match) {
-    throw new Error(
-      "Could not find the 'Your name' field. This meeting may require a signed-in Google account."
-    );
-  }
-  log(`Filling name field (${match.selector}) with "${name}"`);
-  await match.element.fill(name);
-
-  // Make sure mic + camera are off *before* joining so we don't blast audio
-  // into the meeting or show a black-tile camera. Meet's pre-join toggles
-  // default to on; we flip them if aria-label indicates they're currently on.
-  for (const [label, sel] of [
-    ["microphone", SELECTORS.micToggle],
-    ["camera", SELECTORS.camToggle],
-  ]) {
-    const btn = await page.$(sel);
-    if (btn) {
-      const aria = (await btn.getAttribute("aria-label")) || "";
-      // Meet writes "Turn off <device>" when currently on, "Turn on <device>"
-      // when currently off. We want them off pre-join.
-      if (/turn off/i.test(aria)) {
-        log(`Muting ${label} (was on)`);
-        await btn.click();
+  // Pre-join loop — handles:
+  //   • Anonymous guest flow (name field → join button)
+  //   • Signed-in flow (join button directly, no name field)
+  //   • Google sign-in redirect (wait indefinitely for user to log in)
+  let clicked = false;
+  while (!clicked) {
+    await page.waitForTimeout(1200); // let Meet's JS settle
+
+    const currentUrl = page.url();
+    log(`Page: ${currentUrl.slice(0, 80)}`);
+
+    // Block / bot-detection page — Google redirected us away from Meet entirely.
+    if (SELECTORS.blockedUrls.some((u) => currentUrl.includes(u))) {
+      log("BLOCKED — Google rejected access (bot detected, meeting locked, or link invalid).");
+      return 6;
+    }
+    // "You can't join this video call" text shown inline on meet.google.com.
+    const pageText = await page.evaluate(() => document.body.innerText).catch(() => "");
+    if (SELECTORS.blockedText.test(pageText)) {
+      log(`BLOCKED — page says: "${pageText.slice(0, 120)}"`);
+      return 6;
+    }
+
+    // Sign-in wall — wait for the user to complete login in the headful window.
+    if (currentUrl.includes("accounts.google.com")) {
+      log("Google sign-in required — log in in the browser window. Bot will resume automatically.");
+      await page.waitForURL((u) => !u.href.includes("accounts.google.com"), { timeout: 0 });
+      log("Back on Meet — retrying pre-join flow.");
+      await page.waitForLoadState("domcontentloaded");
+      continue;
+    }
+
+    // Name field — present for anonymous guests only.
+    const match = await firstMatching(page, SELECTORS.nameInput, 3000);
+    if (match) {
+      log(`Filling name field with "${name}"`);
+      try {
+        await page.locator(match.selector).first().fill(name, { timeout: 4000 });
+      } catch (_) {
+        continue; // navigation happened mid-fill — loop will detect it
       }
+    } else {
+      log("No name field — signed-in account.");
     }
-  }
 
-  // Click the join button. Label varies: "Ask to join" (normal guest),
-  // "Join now" (pre-admitted). Try each known label in order.
-  let clicked = false;
-  for (const label of SELECTORS.joinButtonNames) {
-    const btn = page.getByRole("button", { name: label });
-    if (await btn.count()) {
-      log(`Clicking "${label}"`);
-      await btn.first().click();
-      clicked = true;
-      break;
+    // Mute mic + camera before entering.
+    for (const [label, sel] of [
+      ["microphone", SELECTORS.micToggle],
+      ["camera", SELECTORS.camToggle],
+    ]) {
+      try {
+        const btn = await page.$(sel);
+        if (btn) {
+          const aria = (await btn.getAttribute("aria-label")) || "";
+          if (/turn off/i.test(aria)) {
+            log(`Muting ${label}`);
+            await btn.click();
+          }
+        }
+      } catch (_) {}
+    }
+
+    // Click join button.
+    for (const label of SELECTORS.joinButtonNames) {
+      const btn = page.getByRole("button", { name: label });
+      if (await btn.count()) {
+        log(`Clicking "${label}"`);
+        try {
+          await btn.first().click({ timeout: 5000 });
+          clicked = true;
+        } catch (e) {
+          log(`  click failed: ${e.message.split("\n")[0]}`);
+          // Try forcing through any overlay.
+          try {
+            await btn.first().click({ force: true, timeout: 3000 });
+            clicked = true;
+            log(`  forced click succeeded`);
+          } catch (e2) {
+            log(`  forced click also failed: ${e2.message.split("\n")[0]}`);
+          }
+        }
+        break;
+      }
+    }
+
+    if (!clicked) {
+      log("Join button not visible yet — retrying in 2 s…");
+      await page.waitForTimeout(2000);
     }
   }
-  if (!clicked) throw new Error("Could not find a Join / Ask-to-join button.");
 
-  // Wait for one of three terminal states:
-  //   1. Leave-call button appears -> we're in the meeting.
-  //   2. "Denied" text appears -> host rejected us.
-  //   3. Timeout -> still in lobby, host never admitted.
   log("Waiting for host to admit from lobby (up to 2 min)…");
   const inMeeting = await Promise.race([
     page
@@ -123,13 +229,62 @@ async function joinMeeting({ url, name, headful }) {
 
   if (inMeeting === "joined") {
     log("JOINED — bot is in the meeting.");
-    // Keep the page alive so you can see it in Meet. Phase 2 will add
-    // audio capture + DOM scraping here. For now, stay connected until the
-    // leave button disappears (meeting ended / we were removed) or SIGINT.
+
+    // Enable Meet's live captions — primary source of reliable speaker-identity.
+    // Try in order: click the toolbar button, then the keyboard shortcut.
+    try {
+      await page.waitForTimeout(2000);
+      // First, try to find the captions button by aria-label and click it.
+      const captionsBtn = page.locator(
+        'button[aria-label*="caption" i], button[aria-label*="subtitle" i]'
+      ).first();
+      if (await captionsBtn.count()) {
+        const label = await captionsBtn.getAttribute("aria-label");
+        // Only click if label indicates captions are OFF (turn on...).
+        if (/turn on|show/i.test(label || "")) {
+          await captionsBtn.click({ timeout: 3000 });
+          log(`Enabled captions via button: "${label}"`);
+        } else {
+          log(`Captions already on: "${label}"`);
+        }
+      } else {
+        // Fallback: keyboard shortcut. Click main area first to ensure focus.
+        await page.click("body").catch(() => {});
+        await page.keyboard.press("c");
+        log("Enabled captions via keyboard shortcut.");
+      }
+    } catch (e) {
+      log(`WARN: Could not enable captions: ${e.message}`);
+    }
+
+    // Phase 2: start audio capture.
+    const polyglot = await connectPolyglot(polyglotUrl);
+    let chunkCount = 0;
+    log("Setting up audio capture…");
+    await setupAudioCapture(page, (pcm, captureTs) => {
+      chunkCount++;
+      if (chunkCount % 50 === 0) {
+        log(`Audio: ${chunkCount * 20} ms captured, last ts=${captureTs}`);
+      }
+      polyglot?.sendAudio(pcm, captureTs);
+    });
+    log("Audio capture active (16 kHz mono PCM16, 20 ms frames).");
+
+    // Phase 3: speaker events and roster.
+    await setupSpeakerDetection(page, (ev) => {
+      if (ev.type === "roster_update") {
+        log(`Roster: ${ev.participants.join(", ") || "(empty)"}`);
+      } else {
+        log(`${ev.type === "speaker_start" ? "  Speaking" : "Silent   "}  ${ev.name}`);
+      }
+      polyglot?.sendEvent(ev);
+    });
+    log("Speaker detection active.");
+
     await page
       .waitForSelector(SELECTORS.leaveCallButton, { state: "detached", timeout: 0 })
       .catch(() => {});
-    log("Leave-call button gone — meeting ended or bot was removed.");
+    log(`Meeting ended. Total audio captured: ${chunkCount * 20} ms`);
     return 0;
   }
 
diff --git a/meet-bot/package.json b/meet-bot/package.json
index 26f9240..ac6d5f1 100644
--- a/meet-bot/package.json
+++ b/meet-bot/package.json
@@ -12,6 +12,7 @@
     "node": ">=20"
   },
   "dependencies": {
-    "playwright": "^1.47.0"
+    "playwright": "^1.47.0",
+    "socket.io-client": "^4.8.0"
   }
 }
diff --git a/meet-bot/selectors.js b/meet-bot/selectors.js
index 2d60dfa..c9720e4 100644
--- a/meet-bot/selectors.js
+++ b/meet-bot/selectors.js
@@ -5,6 +5,38 @@
 // anchors (aria-label, role, visible text) over class names.
 
 export const SELECTORS = {
+  // ── Active-speaker / roster detection (Phase 3) ──────────────────────
+  //
+  // Meet's classes are obfuscated; everything here anchors on aria-label,
+  // role, or data-* attributes that have been stable across rollouts.
+  //
+  // Tile container — wraps one participant's video + name + mic ring.
+  // data-participant-id is the most stable anchor we have.
+  participantTile: '[data-participant-id]',
+
+  // Speaking indicators — Meet has used several over time; we try all.
+  // Strategy 1: explicit boolean attribute (newer Meet)
+  speakingAttr: '[data-is-speaking="true"]',
+  // Strategy 2: aria-label on the tile or mic button says "… is speaking"
+  speakingAriaLabel: '[aria-label*="is speaking" i]',
+  // Strategy 3: the audio-level bars inside a tile animate when speaking.
+  // Class is obfuscated, but the element always carries [data-is-muted="false"]
+  // and its closest tile ancestor is the active speaker. Fallback only.
+  audioLevelBar: '[data-is-muted="false"]',
+
+  // Name extraction — checked in order inside a tile element.
+  tileNameSelectors: [
+    '[data-self-name]',             // newer Meet
+    '[jsname="r8qRAd"]',            // one known jsname for name label
+    'div[class][data-tooltip]',     // tooltip often holds display name
+  ],
+
+  // People panel — open it to get full roster.
+  peopleButton:
+    'button[aria-label*="people" i], button[aria-label*="everyone" i], button[aria-label*="participants" i]',
+  // Each row in the People panel roster.
+  rosterItem: '[data-participant-id] span[jsname], [role="listitem"] span',
+
   // Pre-join screen (anonymous guest path) -------------------------------
   // The "Your name" input shown to signed-out users on the Meet landing page
   // before joining. Meet has used multiple implementations; try in order.
@@ -34,6 +66,11 @@ export const SELECTORS = {
   // Lobby / denial detection. When the host denies entry, Meet shows a
   // message containing this text.
   deniedText: /You can't join this call|no one responded|denied/i,
+
+  // Bot / access blocked detection. Google redirects here or shows this text
+  // when the meeting blocks automated access or the link is invalid.
+  blockedUrls: ["workspace.google.com/products/meet", "accounts.google.com/v3/signin/rejected"],
+  blockedText: /you can't join this video call|this meeting is locked|you're not allowed|not available/i,
 };
 
 // Helper: return the first selector from a list that matches something on
diff --git a/meet-bot/speaker.js b/meet-bot/speaker.js
new file mode 100644
index 0000000..11c8640
--- /dev/null
+++ b/meet-bot/speaker.js
@@ -0,0 +1,272 @@
+// Phase 3: active-speaker detection + roster scraping.
+//
+// Primary signal: Meet's live captions. When captions are enabled, each
+// caption block carries the speaker's name as the first text node and the
+// spoken words as the rest. This is far more reliable than DOM-class
+// heuristics (which change every Meet rollout and rotate as ambient pulse).
+//
+// Fallback signals: the legacy data-is-speaking attribute and aria-label
+// text — still present on rare pre-join / breakout-style UIs.
+
+export async function setupSpeakerDetection(page, onEvent) {
+  await page.exposeFunction("__pgSpeakerEvent", (json) => onEvent(JSON.parse(json))); // events cross the bridge as JSON strings
+  // The callback below runs inside the page and emits speaker_start / speaker_end / roster_update objects to onEvent.
+  await page.evaluate(() => {
+
+    // ── Helpers ────────────────────────────────────────────────────────────
+
+    function isUIAction(s) {
+      return /^(?:pin|unpin|mute|unmute|remove|reframe|spotlight|present|share|more options|turn on|turn off|stop|start)/i.test(s.trim());
+    }
+
+    function nameFromTile(el) {
+      const root = el.closest("[data-participant-id]") || el;
+
+      const nameEl = root.querySelector("[data-self-name]");
+      if (nameEl?.dataset.selfName) {
+        const n = nameEl.dataset.selfName.trim();
+        if (n && !isUIAction(n)) return n;
+      }
+      if (nameEl?.textContent) {
+        const n = nameEl.textContent.trim();
+        if (n && !isUIAction(n)) return n;
+      }
+
+      const rootLabel = root.getAttribute("aria-label") || "";
+      if (rootLabel) {
+        const m = rootLabel.match(/^(.+?)(?:'s\s+(?:video|screen|camera|tile)|(?:\s*\(you\)))/i);
+        if (m?.[1] && !isUIAction(m[1])) return m[1].trim();
+        if (rootLabel.length < 60 && !isUIAction(rootLabel) && !/\b(?:from|your|main|screen)\b/i.test(rootLabel))
+          return rootLabel.trim();
+      }
+
+      for (const sel of ["span[jsname='r8qRAd']", "div[jsname='Cpqoke']", "div[data-tooltip]"]) {
+        const candidate = root.querySelector(sel);
+        const text = (candidate?.textContent || candidate?.dataset?.tooltip || "").trim();
+        if (text && text.length < 60 && !isUIAction(text)) return text;
+      }
+
+      return null;
+    }
+
+    function emit(ev) {
+      window.__pgSpeakerEvent(JSON.stringify(ev));
+    }
+
+    // ── Captions-based speaker detection ───────────────────────────────────
+    //
+    // We look for a container whose aria-label mentions "caption" /
+    // "transcription" / "untertitel" etc. Meet renders each caption block as
+    // a group with the speaker name as the first text child and the spoken
+    // words as siblings. When a caption block updates (new words appended),
+    // that speaker is currently active.
+
+    const CAPTION_REGION_RE = /caption|transcript|untertitel|sous-titre|subtitulo|subtitle/i;
+
+    function findCaptionContainer() {
+      // 1. Strong preference: role=region with aria-label exactly "Captions"
+      //    (or localized equivalent matching the captions-word regex).
+      const regions = [...document.querySelectorAll('[role="region"]')];
+      for (const el of regions) {
+        const lbl = el.getAttribute("aria-label") || "";
+        if (CAPTION_REGION_RE.test(lbl)) return el;
+      }
+      // 2. Fallback: ANY element whose aria-label is exactly "Captions" etc.
+      for (const el of document.querySelectorAll('[aria-label]')) {
+        const lbl = (el.getAttribute("aria-label") || "").trim();
+        // Exact match on the WORD captions (not combobox "Caption type").
+        if (/^(captions?|transcript|untertitel|sous-titres)$/i.test(lbl)) return el;
+      }
+      // 3. Known jsnames.
+      for (const sel of ['div[jsname="dsyhDe"]', 'div[jsname="YSxPC"]', 'div[jsname="r5nxDd"]']) {
+        const el = document.querySelector(sel);
+        if (el) return el;
+      }
+      return null;
+    }
+
+    // Track active speakers: name → { lastUpdateMs, startMs }
+    // A speaker is "active" if a caption or legacy signal refreshed it in the last 1.5 s.
+    const activeSpeakers = new Map();
+    const SPEAKER_TIMEOUT_MS = 1500;
+
+    // Find the enclosing caption block for any DOM node. A caption block is
+    // the wrapper that contains one speaker's current utterance. Meet uses a
+    // class like `nMcdL` on this wrapper; if that rotates we fall back to
+    // structural heuristics (a div whose direct children include both a
+    // short "name" span and a larger "text" div).
+    function findCaptionBlock(node) {
+      let cur = node.nodeType === 1 ? node : node.parentElement;
+      const container = findCaptionContainer();
+      while (cur && cur !== container && cur !== document.body) {
+        if (cur.matches && cur.matches('[class*="nMcdL"]')) return cur;
+        cur = cur.parentElement;
+      }
+      // Structural fallback: direct child of container with children that look
+      // like a (name, text) pair.
+      if (container) {
+        let p = node.nodeType === 1 ? node : node.parentElement;
+        while (p && p.parentElement !== container) p = p.parentElement;
+        if (p && p.parentElement === container) return p;
+      }
+      return null;
+    }
+
+    function extractBlockSpeaker(block) {
+      // Try the known name span first.
+      const nameEl = block.querySelector('span.NWpY1d, [class*="NWpY1d"]');
+      if (nameEl) {
+        const name = (nameEl.textContent || "").trim();
+        if (name && name.length < 60 && !isUIAction(name) && /^[\p{L}]/u.test(name))
+          return name;
+      }
+      // Structural fallback: find a short-text descendant at the top of the
+      // block whose text is distinct from the long "spoken text" sibling.
+      const texts = [];
+      for (const el of block.querySelectorAll('*')) {
+        const t = (el.textContent || "").trim();
+        if (!t || t.length > 60) continue;
+        if (!/^[\p{L}][\p{L}\s.'-]+$/u.test(t)) continue;
+        if (isUIAction(t)) continue;
+        texts.push(t);
+      }
+      return texts[0] || null;
+    }
+
+    function onCaptionMutation(mutations) {
+      const now = Date.now();
+      const processedBlocks = new Set();
+
+      // Per-mutation block lookup — catches characterData updates.
+      for (const m of mutations || []) {
+        const block = findCaptionBlock(m.target);
+        if (!block || processedBlocks.has(block)) continue;
+        processedBlocks.add(block);
+        markSpeakerActive(extractBlockSpeaker(block), now);
+      }
+
+      // Whole-container re-scan — covers cases where Meet replaces entire
+      // caption blocks rather than appending characterData to existing ones.
+      // We use a data attribute to track each block's last-seen text length
+      // and treat any growth as active speech.
+      const container = findCaptionContainer();
+      if (!container) return;
+      for (const block of container.querySelectorAll('[class*="nMcdL"]')) {
+        if (processedBlocks.has(block)) continue;
+        const textEl = block.querySelector('[class*="ygicle"], [class*="VbkSUe"]');
+        const len = textEl ? (textEl.textContent || "").length : 0;
+        const prev = parseInt(block.getAttribute("data-pg-len") || "-1", 10);
+        if (len !== prev) {
+          block.setAttribute("data-pg-len", String(len));
+          markSpeakerActive(extractBlockSpeaker(block), now);
+        }
+      }
+    }
+
+    function markSpeakerActive(name, now) {
+      if (!name) return;
+      const existing = activeSpeakers.get(name);
+      if (!existing) {
+        activeSpeakers.set(name, { lastUpdateMs: now, startMs: now });
+        emit({ type: "speaker_start", name, wall_clock_ms: now });
+      } else {
+        existing.lastUpdateMs = now;
+      }
+    }
+
+    // Sweeper: close intervals for speakers whose captions haven't updated.
+    function sweepInactive() {
+      const now = Date.now();
+      for (const [name, info] of activeSpeakers) {
+        if (now - info.lastUpdateMs > SPEAKER_TIMEOUT_MS) {
+          emit({ type: "speaker_end", name, wall_clock_ms: now });
+          activeSpeakers.delete(name);
+        }
+      }
+    }
+
+    // ── Legacy DOM signals (fallback) ──────────────────────────────────────
+    //
+    // Feeds the same activeSpeakers map as the caption path, so a name seen by
+    // both sources yields one speaker_start / speaker_end pair, not two.
+    function checkLegacySpeaker() {
+      // Strategy 1: data-is-speaking="true"
+      const s1 = document.querySelector('[data-is-speaking="true"]');
+      let speaker = s1 ? nameFromTile(s1) : null;
+
+      if (!speaker) {
+        // Strategy 2: aria-label "X is speaking"
+        for (const el of document.querySelectorAll("[aria-label]")) {
+          const lbl = el.getAttribute("aria-label") || "";
+          const m = lbl.match(/^(.+?)\s+is speaking/i);
+          if (m?.[1] && !isUIAction(m[1])) { speaker = m[1].trim(); break; }
+        }
+      }
+
+      // Each poll that still sees the speaker refreshes lastUpdateMs; once the
+      // signal disappears, sweepInactive() emits speaker_end after the same
+      // SPEAKER_TIMEOUT_MS grace period the caption path uses.
+      markSpeakerActive(speaker, Date.now());
+    }
+
+    // ── Roster scraping ────────────────────────────────────────────────────
+
+    function scrapeRoster() {
+      const names = new Set();
+      for (const tile of document.querySelectorAll("[data-participant-id]")) {
+        const n = nameFromTile(tile);
+        if (n && n.length < 60 && !/^\(you\)$|^you$/i.test(n) && !isUIAction(n))
+          names.add(n);
+      }
+      return [...names];
+    }
+
+    let lastRosterKey = "";
+    function checkRoster() {
+      const roster = scrapeRoster();
+      const key = roster.slice().sort().join("|");
+      if (key === lastRosterKey) return;
+      lastRosterKey = key;
+      emit({ type: "roster_update", participants: roster, wall_clock_ms: Date.now() });
+    }
+
+    // ── Observers / scheduler ──────────────────────────────────────────────
+
+    // Caption observer: attach once the container appears; re-attach if Meet
+    // rerenders it.
+    let captionObserver = null;
+    let boundContainer = null;
+    function ensureCaptionObserver() {
+      const container = findCaptionContainer();
+      if (!container || container === boundContainer) return;
+      if (captionObserver) captionObserver.disconnect();
+      captionObserver = new MutationObserver(onCaptionMutation);
+      captionObserver.observe(container, { childList: true, subtree: true, characterData: true });
+      boundContainer = container;
+      // Immediate scan in case captions already exist.
+      onCaptionMutation();
+    }
+
+    // Roster observer — tiles come and go with gallery pagination.
+    const rosterObserver = new MutationObserver(() => checkRoster());
+    rosterObserver.observe(document.body, {
+      childList: true, subtree: true, attributes: true,
+      attributeFilter: ["data-self-name", "aria-label"],
+    });
+
+    // Periodic work: ensure observers attached, sweep inactive speakers, run
+    // legacy fallback, refresh roster.
+    setInterval(() => {
+      ensureCaptionObserver();
+      sweepInactive();
+      checkLegacySpeaker();
+      checkRoster();
+    }, 500);
+
+    setTimeout(() => {
+      ensureCaptionObserver();
+      checkRoster();
+    }, 2000);
+
+  });
+}
diff --git a/templates/admin.html b/templates/admin.html
index 6a44972..aab537a 100644
--- a/templates/admin.html
+++ b/templates/admin.html
@@ -771,10 +771,18 @@ <h3 style="margin-bottom: 10px; color: #8b5cf6; font-size: 14px; display: flex;
             <div class="panel" style="display: flex; flex-direction: column; max-height: none; height: 100%;">
                 <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; flex-shrink: 0;">
                     <h2 style="margin: 0;">📝 Transcript</h2>
-                    <div id="adminLagTimer" style="font-size: 12px; color: rgba(255, 255, 255, 0.6); padding: 6px 12px; background: rgba(0, 0, 0, 0.3); border-radius: 8px; display: none;">
-                        Lag: <span id="adminLagSeconds">0</span>s
+                    <div style="display: flex; gap: 8px; align-items: center;">
+                        <span id="botStatusBadge" style="font-size: 11px; padding: 3px 10px; border-radius: 12px; background: #374151; color: #9ca3af; display: none;">Bot disconnected</span>
+                        <div id="adminLagTimer" style="font-size: 12px; color: rgba(255, 255, 255, 0.6); padding: 6px 12px; background: rgba(0, 0, 0, 0.3); border-radius: 8px; display: none;">
+                            Lag: <span id="adminLagSeconds">0</span>s
+                        </div>
                     </div>
                 </div>
+                <!-- Meet participant roster (shown when bot is connected) -->
+                <div id="meetRosterPanel" style="display: none; margin-bottom: 8px; padding: 8px 12px; background: rgba(255,255,255,0.05); border-radius: 8px; border: 1px solid rgba(255,255,255,0.1); flex-shrink: 0;">
+                    <div style="font-size: 11px; color: #9ca3af; margin-bottom: 4px;">PARTICIPANTS</div>
+                    <ul id="meetRosterList" style="margin: 0; padding: 0; list-style: none; max-height: 80px; overflow-y: auto; font-size: 12px;"></ul>
+                </div>
                 <div id="transcriptContent" class="translation-content" style="font-size: 16px; line-height: 1.7; flex: 1; overflow-y: auto;">
                     <div class="empty-state">
                         <svg fill="none" stroke="currentColor" viewBox="0 0 24 24">
@@ -1840,6 +1848,42 @@ <h2>Settings</h2>
             }
         });
 
+        // ── Meet bot events ─────────────────────────────────────────────────
+
+        socket.on('bot_status', (data) => {
+            const badge = document.getElementById('botStatusBadge');
+            const rosterPanel = document.getElementById('meetRosterPanel');
+            if (!badge) return;
+            badge.style.display = 'inline-block';
+            badge.textContent = data.connected ? '🤖 Bot connected' : '🤖 Bot disconnected';
+            badge.style.background = data.connected ? 'rgba(34,197,94,0.2)' : 'rgba(239,68,68,0.2)';
+            badge.style.color = data.connected ? '#22c55e' : '#ef4444';
+            badge.style.border = `1px solid ${data.connected ? '#22c55e' : '#ef4444'}`;
+            if (rosterPanel) rosterPanel.style.display = data.connected ? 'block' : 'none';
+        });
+
+        socket.on('meet_roster', (data) => {
+            const el = document.getElementById('meetRosterList');
+            const rosterPanel = document.getElementById('meetRosterPanel');
+            if (!el) return;
+            const names = (data && data.participants) || [];
+            // Names are typed by Meet guests (untrusted): build <li> nodes via textContent, never innerHTML.
+            el.replaceChildren(...names.map((n) => Object.assign(document.createElement('li'), { textContent: n })));
+            for (const li of el.children) li.style.cssText = 'padding:2px 0;color:#ccc;';
+            if (rosterPanel && names.length > 0) rosterPanel.style.display = 'block';
+        });
+
+        socket.on('rename_speaker', (data) => {
+            // Retroactively replace SPEAKER_XX labels in the transcript panel; only text nodes are edited, so the name is never parsed as HTML.
+            const tc = document.getElementById('transcriptContent');
+            if (!tc || !data || !data.speaker_id || !data.name) return;
+            const walker = document.createTreeWalker(tc, NodeFilter.SHOW_TEXT);
+            for (let node = walker.nextNode(); node; node = walker.nextNode()) {
+                if (!node.nodeValue.includes(data.speaker_id)) continue;
+                node.nodeValue = node.nodeValue.split(data.speaker_id).join(data.name); // literal match: no regex or $-pattern expansion
+            }
+        });
+
         function updateStatus() {
             const statusEl = document.getElementById('statusIndicator');
             const startBtn = document.getElementById('startBtn');

From 26258b684dbab1825ccc241591286663d0b1c9da Mon Sep 17 00:00:00 2001
From: Chabert Etienne <etienne.chabert@gmail.com>
Date: Tue, 21 Apr 2026 12:06:36 +0200
Subject: [PATCH 3/3] Admin bot controls, speaker-turn batching, live UI polish

- Admin "Start bot" input + button: paste a Meet ID (or full URL) and
  spawn the Node bot as a subprocess from Polyglot; Stop button kills
  it. The admin page normalizes the input to a full URL; the backend
  handlers check the http(s) prefix and track one instance at a time.
- Buffer bar now displays seconds with a fixed 60 s cap (matches
  BOT_MAX_BATCH_SEC), computed as chunks * 1024 / 16000.
- Drop level-based silence detection in bot mode. The only flush
  triggers are (1) a NEW speaker starting and (2) the 60 s cap. A
  single speaker's natural pauses fire speaker_end/speaker_start
  toggles that we deliberately ignore, so their turn stays one batch.
  Mic-only mode keeps the original level-based silence heuristic as
  a fallback.
- Live "currently speaking" banner on admin + viewer, driven by a
  new active_speakers socket event broadcast whenever
  _active_speaker_starts changes.
- Viewer now renders the speaker name above each translated segment
  and handles rename_speaker retroactively, so late name resolutions
  update already-displayed rows in place.
- Admin UI fixes: removed the orphan header audio-visualizer strip
  (was rendering outside any container), restored the full AUDIO
  SIGNAL panel as the third column of the top row, forced
  minmax(0, 1fr) on the two system-stats grids so VRAM and System
  Resources keep equal widths, hid the SILENCE bar when the bot is
  connected since it no longer drives flushing there.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 app.py                | 113 +++++++++++++++++++++++++++++++++++---
 templates/admin.html  | 125 +++++++++++++++++++++++++++++-------------
 templates/viewer.html |  40 ++++++++++++++
 3 files changed, 231 insertions(+), 47 deletions(-)

diff --git a/app.py b/app.py
index 0b93f65..b314ca2 100644
--- a/app.py
+++ b/app.py
@@ -1789,20 +1789,25 @@ def process_audio():
 
             buffer.append(chunk)
 
-            # Speaker-switch batching (bot mode): when the bot reports a new active
-            # speaker, flush the current batch so each turn is transcribed as one unit.
-            # Extend max length to BOT_MAX_BATCH_SEC so one speaker can ramble up to 1 min.
+            # Bot mode: only flush on speaker switch or 60 s cap. No silence
+            # detection — a speaker's natural pauses emit speaker_end/start
+            # toggles and we explicitly do NOT flush on those. Mic-only mode
+            # falls back to the original level-based silence heuristic.
             global _pending_speaker_switch
-            bot_active = bot_connected and _current_bot_speaker is not None
-            if bot_active:
+            bot_mode = bot_connected
+            if bot_mode:
                 max_chunks = int(actual_sample_rate * BOT_MAX_BATCH_SEC / CHUNK_SIZE)
 
-            speaker_switched = bot_active and _pending_speaker_switch and len(buffer) >= min_chunks
+            speaker_switched = bot_mode and _pending_speaker_switch and len(buffer) >= min_chunks
             if speaker_switched:
                 _pending_speaker_switch = False
 
-            # Process when we detect end of sentence (silence after minimum audio) OR max buffer reached
-            silence_detected = len(buffer) >= min_chunks and silence_counter >= audio_thresholds["silence_chunks"]
+            # Level-based silence detection — only used when bot isn't driving batching.
+            silence_detected = (
+                not bot_mode
+                and len(buffer) >= min_chunks
+                and silence_counter >= audio_thresholds["silence_chunks"]
+            )
             max_length_reached = len(buffer) >= max_chunks
             should_process = silence_detected or max_length_reached or speaker_switched
 
@@ -2710,11 +2715,103 @@ def bot_speaker_event(ev):
                 _pending_speaker_switch = True
             _current_bot_speaker = name
             log(f"Speaking: {name}", "BOT")
+            _broadcast_active_speakers()
 
         elif ev_type == "speaker_end":
             start = _active_speaker_starts.pop(name, ts_ms - 1000)
             speaker_timeline.append((start, ts_ms, name))
             log(f"Segment: {name} {ts_ms - start} ms", "BOT")
+            # Don't flush here — a single speaker's short pauses emit
+            # speaker_end/speaker_start toggles, so flushing on end would
+            # chop their turn into tiny fragments. We only flush when a
+            # DIFFERENT speaker starts (speaker_switched) or the 60 s cap
+            # is reached. _current_bot_speaker stays as the last name so
+            # the same speaker resuming does not trigger a switch.
+            _broadcast_active_speakers()
+
+
+def _broadcast_active_speakers():
+    """Push the current set of speaking participants to admin + all viewers."""
+    names = list(_active_speaker_starts.keys())
+    payload = {"speakers": names, "wall_clock_ms": int(time.time() * 1000)}
+    socketio.emit("active_speakers", payload, room="admin")
+    for lang_code, count in active_language_viewers.items():
+        if count > 0:
+            socketio.emit("active_speakers", payload, room=f"lang_{lang_code}")
+
+
+# ── Admin-triggered bot spawning ─────────────────────────────────────────────
+# Tracks the currently running Meet bot subprocess so we can start/stop it
+# from the admin panel. Only one bot instance is supported at a time.
+_meet_bot_process = None
+_meet_bot_lock = threading.Lock()
+
+
+def _bot_script_path():
+    return Path(__file__).parent / "meet-bot" / "index.js"
+
+
+@socketio.on("start_meet_bot")
+def handle_start_meet_bot(data):
+    """Spawn the Meet bot pointing at the given URL. Admin-only."""
+    global _meet_bot_process
+
+    url = (data or {}).get("url", "").strip()
+    if not url.startswith("http"):
+        emit("meet_bot_control_result", {"ok": False, "error": "URL must start with http(s)://"})
+        return
+
+    with _meet_bot_lock:
+        # If a bot is already running, refuse to start another.
+        if _meet_bot_process is not None and _meet_bot_process.poll() is None:
+            emit("meet_bot_control_result", {"ok": False, "error": "Bot already running — stop it first"})
+            return
+
+        script = _bot_script_path()
+        if not script.exists():
+            emit("meet_bot_control_result", {"ok": False, "error": f"Bot script not found at {script}"})
+            return
+
+        import subprocess
+        try:
+            _meet_bot_process = subprocess.Popen(
+                ["node", str(script),
+                 "--url", url,
+                 "--polyglot-url", "http://localhost:5000",
+                 "--headful"],
+                cwd=str(script.parent),
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                stdin=subprocess.DEVNULL,
+                creationflags=subprocess.CREATE_NEW_PROCESS_GROUP if os.name == "nt" else 0,
+            )
+            log(f"Spawned Meet bot (pid={_meet_bot_process.pid}) → {url}", "BOT")
+            emit("meet_bot_control_result", {"ok": True, "pid": _meet_bot_process.pid})
+        except Exception as e:
+            log(f"Failed to spawn bot: {e}", "BOT")
+            emit("meet_bot_control_result", {"ok": False, "error": str(e)})
+
+
+@socketio.on("stop_meet_bot")
+def handle_stop_meet_bot():
+    """Terminate the running Meet bot subprocess."""
+    global _meet_bot_process
+    with _meet_bot_lock:
+        if _meet_bot_process is None or _meet_bot_process.poll() is not None:
+            emit("meet_bot_control_result", {"ok": False, "error": "No bot running"})
+            _meet_bot_process = None
+            return
+        try:
+            _meet_bot_process.terminate()
+            try:
+                _meet_bot_process.wait(timeout=5)
+            except Exception:
+                _meet_bot_process.kill()
+            log(f"Stopped Meet bot subprocess", "BOT")
+            _meet_bot_process = None
+            emit("meet_bot_control_result", {"ok": True})
+        except Exception as e:
+            emit("meet_bot_control_result", {"ok": False, "error": str(e)})
 
 
 if __name__ == "__main__":
diff --git a/templates/admin.html b/templates/admin.html
index aab537a..ada7950 100644
--- a/templates/admin.html
+++ b/templates/admin.html
@@ -580,16 +580,6 @@ <h1>🌍 Polyglot</h1>
                     <div class="status-dot listening"></div>
                     <span>🔴 LIVE</span>
                 </div>
-                <div class="audio-visualizer" id="audioVisualizer" style="display: none;">
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                    <div class="audio-bar"></div>
-                </div>
                 <button id="startBtn">▶️ Start Listening</button>
                 <button id="stopBtn" disabled>⏹️ Stop</button>
                 <button id="settingsBtn">⚙️ Settings</button>
@@ -597,8 +587,8 @@ <h1>🌍 Polyglot</h1>
             </div>
         </header>
 
-        <!-- Audio Visualization, Thresholds, and Viewer Stats - 3 Column Layout -->
-        <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px; margin-bottom: 8px;">
+        <!-- Audio Waveform + Thresholds + Viewer Stats - 3 Column Layout -->
+        <div style="display: grid; grid-template-columns: minmax(0, 1fr) minmax(0, 1fr) minmax(0, 1fr); gap: 10px; margin-bottom: 8px;">
             <!-- Column 1: Audio Waveform -->
             <div style="background: rgba(0, 0, 0, 0.95); color: #00ff41; padding: 12px; border-radius: 10px; border: 1px solid rgba(0, 255, 65, 0.2);">
                 <h3 style="color: #00ff41; margin-bottom: 10px; font-size: 12px; font-family: 'Courier New', monospace; letter-spacing: 2px; opacity: 0.8;">AUDIO SIGNAL</h3>
@@ -643,8 +633,8 @@ <h3 style="color: #667eea; margin-bottom: 10px; font-size: 14px;">📊 Threshold
                     </div>
                 </div>
 
-                <!-- Silence Counter Bar -->
-                <div style="margin-bottom: 10px;">
+                <!-- Silence Counter Bar (hidden in bot mode — only drives mic-mode flush) -->
+                <div id="silenceRow" style="margin-bottom: 10px;">
                     <div style="display: flex; justify-content: space-between; font-size: 10px; color: #999; margin-bottom: 4px;">
                         <span>SILENCE</span>
                         <span><span id="debugSilenceCounter">0</span> / <span id="debugSilenceChunksReq">-</span></span>
@@ -658,7 +648,7 @@ <h3 style="color: #667eea; margin-bottom: 10px; font-size: 14px;">📊 Threshold
                 <div>
                     <div style="display: flex; justify-content: space-between; font-size: 10px; color: #999; margin-bottom: 4px;">
                         <span>BUFFER</span>
-                        <span><span id="debugBufferChunks">0</span> (<span id="debugMinChunks">-</span>-<span id="debugMaxChunks">-</span>)</span>
+                        <span><span id="debugBufferSeconds">0.0</span>s / 60s</span>
                     </div>
                     <div style="height: 20px; background: #1a1a1a; border-radius: 10px; overflow: hidden; position: relative;">
                         <div id="bufferBar" style="height: 100%; background: linear-gradient(90deg, #8b5cf6 0%, #a78bfa 100%); width: 0%; transition: width 0.2s ease; box-shadow: 0 0 10px rgba(139, 92, 246, 0.5);"></div>
@@ -681,7 +671,7 @@ <h3 style="margin-bottom: 10px; color: #06b6d4; font-size: 14px; display: flex;
         </div>
 
         <!-- System Stats Row -->
-        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-bottom: 8px;">
+        <div style="display: grid; grid-template-columns: minmax(0, 1fr) minmax(0, 1fr); gap: 10px; margin-bottom: 8px;">
             <!-- VRAM Usage Chart -->
             <div style="background: rgba(0, 0, 0, 0.9); color: white; padding: 12px; border-radius: 10px;">
                 <h3 style="margin-bottom: 10px; color: #f59e0b; font-size: 14px; display: flex; justify-content: space-between; align-items: center;">
@@ -692,7 +682,7 @@ <h3 style="margin-bottom: 10px; color: #f59e0b; font-size: 14px; display: flex;
                     <div style="flex: 1; height: 140px;">
                         <canvas id="vramChart"></canvas>
                     </div>
-                    <div id="vramLegend" style="font-size: 10px; min-width: 180px;">
+                    <div id="vramLegend" style="font-size: 10px; min-width: 140px; flex-shrink: 0;">
                         <div style="display: flex; align-items: center; gap: 6px; margin-bottom: 4px;" title="Whisper model">
                             <span style="width: 10px; height: 10px; background: #3b82f6; border-radius: 2px; flex-shrink: 0;"></span>
                             <span style="flex: 1; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;" id="modelWhisperName">Whisper</span>
@@ -778,6 +768,21 @@ <h2 style="margin: 0;">📝 Transcript</h2>
                         </div>
                     </div>
                 </div>
+                <!-- Live "currently speaking" banner -->
+                <div id="activeSpeakerBanner" style="display: none; margin-bottom: 8px; padding: 8px 12px; background: linear-gradient(90deg, rgba(34,197,94,0.15), rgba(34,197,94,0.05)); border-radius: 8px; border: 1px solid rgba(34,197,94,0.4); flex-shrink: 0; font-size: 13px; color: #22c55e; font-weight: 600;">
+                    <span style="display: inline-block; width: 8px; height: 8px; background: #22c55e; border-radius: 50%; margin-right: 8px; animation: pulse 1.5s infinite;"></span>
+                    <span id="activeSpeakerText">—</span>
+                </div>
+
+                <!-- Meet bot control: enter a Meet URL/ID and start/stop the bot -->
+                <div id="meetBotControls" style="margin-bottom: 8px; padding: 10px 12px; background: rgba(255,255,255,0.04); border-radius: 8px; border: 1px solid rgba(255,255,255,0.1); flex-shrink: 0; display: flex; gap: 8px; align-items: center;">
+                    <input id="meetUrlInput" type="text" placeholder="meet.google.com/xxx-yyyy-zzz or just xxx-yyyy-zzz"
+                           style="flex: 1; padding: 6px 10px; font-size: 12px; background: rgba(0,0,0,0.4); color: #fff; border: 1px solid rgba(255,255,255,0.15); border-radius: 6px; outline: none;">
+                    <button id="startBotBtn" onclick="startBot()"
+                            style="padding: 6px 14px; font-size: 12px; background: linear-gradient(135deg, #22c55e 0%, #16a34a 100%); border: none; color: white; cursor: pointer; border-radius: 6px; font-weight: 600;">🤖 Start bot</button>
+                    <button id="stopBotBtn" onclick="stopBot()" style="display: none; padding: 6px 14px; font-size: 12px; background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%); border: none; color: white; cursor: pointer; border-radius: 6px; font-weight: 600;">⏹ Stop bot</button>
+                </div>
+
                 <!-- Meet participant roster (shown when bot is connected) -->
                 <div id="meetRosterPanel" style="display: none; margin-bottom: 8px; padding: 8px 12px; background: rgba(255,255,255,0.05); border-radius: 8px; border: 1px solid rgba(255,255,255,0.1); flex-shrink: 0;">
                     <div style="font-size: 11px; color: #9ca3af; margin-bottom: 4px;">PARTICIPANTS</div>
@@ -1809,26 +1814,25 @@ <h2>Settings</h2>
                     }
                 }
 
-                // Update buffer stats and bar
-                const debugBufferChunks = document.getElementById('debugBufferChunks');
-                const debugMinChunks = document.getElementById('debugMinChunks');
-                const debugMaxChunks = document.getElementById('debugMaxChunks');
-                if (debugBufferChunks) debugBufferChunks.textContent = data.buffer_chunks;
-                if (debugMinChunks) debugMinChunks.textContent = data.min_chunks;
-                if (debugMaxChunks) debugMaxChunks.textContent = data.max_chunks;
-
-                // Update buffer bar (scale between min and max)
-                const bufferPercent = data.max_chunks > 0
-                    ? Math.min(100, (data.buffer_chunks / data.max_chunks) * 100)
-                    : 0;
+                // Buffer: display in seconds, scale bar against a fixed 60 s cap
+                // (matches BOT_MAX_BATCH_SEC on the server). 1 chunk = 1024/16000 s.
+                const CHUNK_SEC = 1024 / 16000;
+                const BUFFER_MAX_SEC = 60;
+                const bufferSeconds = (data.buffer_chunks || 0) * CHUNK_SEC;
+                const minSeconds = (data.min_chunks || 0) * CHUNK_SEC;
+                const maxSeconds = Math.min(BUFFER_MAX_SEC, (data.max_chunks || 0) * CHUNK_SEC);
+
+                const debugBufferSeconds = document.getElementById('debugBufferSeconds');
+                if (debugBufferSeconds) debugBufferSeconds.textContent = bufferSeconds.toFixed(1);
+
+                const bufferPercent = Math.min(100, (bufferSeconds / BUFFER_MAX_SEC) * 100);
                 const bufferBar = document.getElementById('bufferBar');
                 if (bufferBar) {
                     bufferBar.style.width = bufferPercent + '%';
 
-                    // Change color based on buffer state
-                    if (data.buffer_chunks >= data.max_chunks) {
+                    if (bufferSeconds >= maxSeconds) {
                         bufferBar.style.background = 'linear-gradient(90deg, #ef4444 0%, #f87171 100%)'; // Red when full
-                    } else if (data.buffer_chunks >= data.min_chunks) {
+                    } else if (bufferSeconds >= minSeconds) {
                         bufferBar.style.background = 'linear-gradient(90deg, #f59e0b 0%, #fbbf24 100%)'; // Yellow when at min
                     } else {
                         bufferBar.style.background = 'linear-gradient(90deg, #8b5cf6 0%, #a78bfa 100%)'; // Purple otherwise
@@ -1853,13 +1857,56 @@ <h2>Settings</h2>
         socket.on('bot_status', (data) => {
             const badge = document.getElementById('botStatusBadge');
             const rosterPanel = document.getElementById('meetRosterPanel');
-            if (!badge) return;
-            badge.style.display = 'inline-block';
-            badge.textContent = data.connected ? '🤖 Bot connected' : '🤖 Bot disconnected';
-            badge.style.background = data.connected ? 'rgba(34,197,94,0.2)' : 'rgba(239,68,68,0.2)';
-            badge.style.color = data.connected ? '#22c55e' : '#ef4444';
-            badge.style.border = `1px solid ${data.connected ? '#22c55e' : '#ef4444'}`;
+            const startBtn = document.getElementById('startBotBtn');
+            const stopBtn = document.getElementById('stopBotBtn');
+            const silenceRow = document.getElementById('silenceRow');
+            if (badge) {
+                badge.style.display = 'inline-block';
+                badge.textContent = data.connected ? '🤖 Bot connected' : '🤖 Bot disconnected';
+                badge.style.background = data.connected ? 'rgba(34,197,94,0.2)' : 'rgba(239,68,68,0.2)';
+                badge.style.color = data.connected ? '#22c55e' : '#ef4444';
+                badge.style.border = `1px solid ${data.connected ? '#22c55e' : '#ef4444'}`;
+            }
             if (rosterPanel) rosterPanel.style.display = data.connected ? 'block' : 'none';
+            if (startBtn) startBtn.style.display = data.connected ? 'none' : 'inline-block';
+            if (stopBtn)  stopBtn.style.display  = data.connected ? 'inline-block' : 'none';
+            // Silence detection is only used in mic-mode — hide its bar when the bot is driving batching.
+            if (silenceRow) silenceRow.style.display = data.connected ? 'none' : 'block';
+        });
+
+        // Start / stop the Meet bot via backend subprocess.
+        function normalizeMeetUrl(raw) {
+            const s = (raw || '').trim();
+            if (!s) return '';
+            if (s.startsWith('http')) return s;
+            if (s.includes('meet.google.com')) return 'https://' + s.replace(/^\/+/, '');
+            // Just a meeting ID like "stu-tyen-aed"
+            if (/^[a-z]{3}-[a-z]{4}-[a-z]{3}$/i.test(s)) return 'https://meet.google.com/' + s;
+            return s;
+        }
+        function startBot() {
+            const input = document.getElementById('meetUrlInput');
+            const url = normalizeMeetUrl(input?.value || '');
+            if (!url) { alert('Enter a Meet URL or meeting ID (e.g. xxx-yyyy-zzz)'); return; }
+            socket.emit('start_meet_bot', { url });
+        }
+        function stopBot() { socket.emit('stop_meet_bot'); }
+
+        socket.on('meet_bot_control_result', (data) => {
+            if (!data.ok) alert(`Bot control failed: ${data.error || 'unknown'}`);
+        });
+
+        socket.on('active_speakers', (data) => {
+            const banner = document.getElementById('activeSpeakerBanner');
+            const text = document.getElementById('activeSpeakerText');
+            if (!banner || !text) return;
+            const names = (data && data.speakers) || [];
+            if (names.length === 0) {
+                banner.style.display = 'none';
+            } else {
+                banner.style.display = 'block';
+                text.textContent = `Speaking: ${names.join(', ')}`;
+            }
         });
 
         socket.on('meet_roster', (data) => {
@@ -1903,7 +1950,7 @@ <h2>Settings</h2>
                 statusEl.innerHTML = '<span class="status-dot"></span><span>Listening</span>';
                 startBtn.disabled = true;
                 stopBtn.disabled = false;
-                visualizer.style.display = 'inline-flex';
+                visualizer.style.display = 'flex';
                 console.log('[updateStatus] Set stopBtn.disabled = false');
             } else {
                 statusEl.className = 'status idle';
diff --git a/templates/viewer.html b/templates/viewer.html
index aa01b97..a299793 100644
--- a/templates/viewer.html
+++ b/templates/viewer.html
@@ -624,6 +624,11 @@ <h1>Select Your Language</h1>
                     <button class="change-language-btn" onclick="changeLanguage()">Change</button>
                 </div>
             </div>
+            <!-- Live "currently speaking" banner -->
+            <div id="activeSpeakerBanner" style="display: none; margin: 8px 12px 0; padding: 10px 14px; background: linear-gradient(90deg, rgba(139,92,246,0.18), rgba(139,92,246,0.05)); border-radius: 10px; border: 1px solid rgba(139,92,246,0.45); font-size: 14px; color: #a78bfa; font-weight: 600;">
+                <span style="display: inline-block; width: 9px; height: 9px; background: #a78bfa; border-radius: 50%; margin-right: 10px; animation: pulse 1.5s infinite;"></span>
+                <span id="activeSpeakerText">—</span>
+            </div>
             <div class="panel-content" id="translationsContainer">
                 <div class="no-data">
                     <div class="icon">...</div>
@@ -825,6 +830,7 @@ <h1>Select Your Language</h1>
                 // Apply new-message class to the newest items (at the start after reverse)
                 const isNew = index < newCount;
                 itemDiv.className = isNew ? 'translation-item new-message' : 'translation-item';
+                if (item.speaker) itemDiv.dataset.speaker = item.speaker;
 
                 const timeDiv = document.createElement('div');
                 timeDiv.className = 'time';
@@ -835,6 +841,15 @@ <h1>Select Your Language</h1>
                     hour12: false
                 });
 
+                // Speaker name line (prominent, colored)
+                if (item.speaker) {
+                    const speakerDiv = document.createElement('div');
+                    speakerDiv.className = 'speaker';
+                    speakerDiv.style.cssText = 'font-weight:600; color:#8b5cf6; font-size:13px; margin-bottom:2px;';
+                    speakerDiv.textContent = item.speaker;
+                    itemDiv.appendChild(speakerDiv);
+                }
+
                 const textDiv = document.createElement('div');
                 textDiv.className = 'text';
                 textDiv.textContent = item.text;
@@ -896,6 +911,7 @@ <h1>Select Your Language</h1>
                 newSegments.forEach(segment => {
                     displayedTranslations.push({
                         text: segment.text,
+                        speaker: segment.speaker || null,
                         timestamp: Date.now()
                     });
                 });
@@ -1046,6 +1062,30 @@ <h1>Select Your Language</h1>
             document.getElementById('connectionStatus').textContent = 'Disconnected';
         });
 
+        // Retroactively rename SPEAKER_XX labels to real names for already-displayed segments.
+        socket.on('rename_speaker', (data) => {
+            if (!data || !data.speaker_id || !data.name) return;
+            let dirty = false;
+            displayedTranslations.forEach(item => {
+                if (item.speaker === data.speaker_id) { item.speaker = data.name; dirty = true; }
+            });
+            if (dirty) renderTranslations(0);
+        });
+
+        // Live "who is currently speaking" banner.
+        socket.on('active_speakers', (data) => {
+            const banner = document.getElementById('activeSpeakerBanner');
+            const text = document.getElementById('activeSpeakerText');
+            if (!banner || !text) return;
+            const names = (data && data.speakers) || [];
+            if (names.length === 0) {
+                banner.style.display = 'none';
+            } else {
+                banner.style.display = 'block';
+                text.textContent = `Speaking: ${names.join(', ')}`;
+            }
+        });
+
         socket.io.on('reconnect_attempt', (attemptNumber) => {
             reconnectAttempts = attemptNumber;
             document.getElementById('connectionDot').className = 'status-dot reconnecting';