nmbrthirteen · nmbrthirteen · Jul 5, 2026 · Jul 5, 2026 · Jul 5, 2026
@@ -444,6 +444,8 @@ def cmd_process(args):
         config["crop_strategy"] = args.crop
     if getattr(args, "format", None):
         config["format"] = args.format
+    if getattr(args, "profile", None):
+        config["profile"] = args.profile
     if getattr(args, "thumbnails", None) is not None:
         config["generate_thumbnails"] = args.thumbnails
     if args.top:
@@ -694,6 +696,31 @@ def _transcribe_progress(pct, msg):
             clips = resumed
             resumed_from_session = True
 
+    # Saliency profiles (party/action) pick moments from the fused laughter/energy
+    # curve rather than the transcript, so they work on footage with no dialogue.
+    from services.profiles import get_profile
+
+    content_profile = get_profile(config.get("profile"))
+    if content_profile.candidate_source == "saliency" and not clips:
+        from services.saliency import detect_highlights
+        from services.formats import get_format
+
+        spec = get_format(config.get("format", "vertical"))
+        print(f"  [3/4] Detecting {content_profile.name} highlights (laughter + energy)...")
+        clips = detect_highlights(
+            video_path,
+            profile_name=content_profile.name,
+            top_n=top_n,
+            min_dur=min(8.0, float(spec.dur_min)),
+            max_dur=float(spec.dur_max),
+            progress_callback=lambda pct, msg: print(f"         {msg}") if msg else None,
+        )
+        if clips:
+            print(f"         ✓ {len(clips)} highlights found")
+            _save_suggestions_session(cache_hash, top_n, "saliency", clips, selection_sig)
+        else:
+            print("         ⚠ No highlights found, falling back to transcript selection")
+
     # Try an AI CLI first (uses PodStack knowledge base for intelligent selection)
     from services.claude_suggest import suggest_initial_with_claude, _engine_label, _find_ai_cli
 
@@ -3338,6 +3365,7 @@ def main():
     proc.add_argument("--caption-style", choices=["branded", "hormozi", "karaoke", "subtle"])
     proc.add_argument("--crop", choices=["center", "face", "speaker", "speaker-hardcut"])
     proc.add_argument("--format", choices=["vertical", "horizontal", "square"], help="Output aspect ratio (default: vertical)")
+    proc.add_argument("--profile", choices=["podcast", "party", "action"], help="Detection profile: podcast (transcript-first, default), party/action (laughter/energy highlights)")
     proc.add_argument("--logo", help="Logo image (asset name or path)")
     proc.add_argument("--outro", help="Outro video (asset name or path)")
     proc.add_argument("--time-adjust", type=float, help="Timestamp offset in seconds")

@@ -308,6 +308,37 @@ def handle_analyze_energy(task_id: str, params: dict):
     emit_result(task_id, "success", data=result)
 
 
+def handle_detect_highlights(task_id: str, params: dict):
+    """Detect highlight clips from a video's fused signal curve (party/action profiles).
+
+    Accepts a single `video_path`, or a list of `video_paths` to pool and rank
+    highlights across a whole folder of clips.
+    """
+    from services.saliency import detect_highlights, detect_highlights_pooled
+
+    video_paths = params.get("video_paths")
+    video_path = params.get("video_path", "")
+    if not video_paths and not video_path:
+        emit_result(task_id, "error", error="video_path or video_paths is required")
+        return
+
+    common = dict(
+        profile_name=params.get("profile", "party"),
+        min_dur=float(params.get("min_dur", 8.0)),
+        max_dur=float(params.get("max_dur", 60.0)),
+        progress_callback=lambda pct, msg: emit_progress(task_id, "detecting", pct, msg),
+    )
+    if video_paths:
+        clips = detect_highlights_pooled(
+            video_paths=video_paths, top_n=int(params.get("top_n", 15)), **common
+        )
+    else:
+        clips = detect_highlights(
+            video_path=video_path, top_n=int(params.get("top_n", 8)), **common
+        )
+    emit_result(task_id, "success", data={"clips": clips, "count": len(clips)})
+
+
 def handle_detect_encoder(task_id: str, params: dict):
     """Detect available hardware encoders."""
     from services.encoder import get_encoder_info
@@ -601,6 +632,7 @@ def handle_run_integration_tool(task_id: str, params: dict):
     "create_clip": handle_create_clip,
     "batch_clips": handle_batch_clips,
     "analyze_energy": handle_analyze_energy,
+    "detect_highlights": handle_detect_highlights,
     "pack_transcript": handle_pack_transcript,
     "detect_encoder": handle_detect_encoder,
     "presets": handle_presets,

@@ -0,0 +1,281 @@
+"""Multi-signal saliency engine — fuse channels, peak-pick, expand reactions, snap.
+
+For profiles whose candidate source is "saliency" (party, action), moments are
+generated from a fused interestingness curve instead of from an LLM reading the
+transcript. This is what lets podcli auto-cut highlights from footage with no useful
+transcript (party videos, action).
+
+Each channel is normalized against THIS video's own distribution (never a global
+scale) so different recordings, rooms and mic levels compare fairly, then combined by
+the active profile's weights. Peaks on the fused curve become candidate clips; a peak
+driven by a laugh or cheer is expanded backwards to capture the moment that caused the
+reaction, since the funny thing happens just before people react to it.
+"""
+
+from typing import Optional, Callable
+
+import numpy as np
+
+from services.profiles import get_profile, ContentProfile
+from services.audio_analyzer import extract_audio_energy
+from services.audio_events import extract_audio_events, is_available as audio_events_available
+
+GRID_HZ = 1.0  # common time grid; energy is per-second, so 1 Hz is the natural rate
+
+
+def _robust_z(x: np.ndarray) -> np.ndarray:
+    """Median/MAD normalization — resistant to the heavy tails of RMS and reactions."""
+    if x.size == 0:
+        return x
+    med = np.median(x)
+    mad = np.median(np.abs(x - med))
+    scale = 1.4826 * mad if mad > 1e-9 else (np.std(x) or 1.0)
+    return (x - med) / scale
+
+
+def _energy_curve(energy_data: list[dict], n_bins: int) -> np.ndarray:
+    """Per-second loudness onto the grid; silence (<= -60 dB) floored so it doesn't win."""
+    curve = np.full(n_bins, -60.0)
+    for e in energy_data:
+        b = int(e["time"] * GRID_HZ)
+        if 0 <= b < n_bins:
+            curve[b] = max(curve[b], e.get("rms_db", -60.0))
+    return curve
+
+
+def _reaction_curve(events_data: list[dict], n_bins: int) -> np.ndarray:
+    """Peak reaction (laugh/cheer/scream) per grid bin."""
+    curve = np.zeros(n_bins)
+    for e in events_data:
+        b = int(e["time"] * GRID_HZ)
+        if 0 <= b < n_bins:
+            level = max(e.get("laughter", 0), e.get("cheering", 0), e.get("screaming", 0))
+            curve[b] = max(curve[b], level)
+    return curve
+
+
+def _dilate(curve: np.ndarray, radius_bins: int) -> np.ndarray:
+    """Spread each spike to its neighbors (grayscale dilation) so a brief, narrow
+    reaction still aligns with, and can win, the fused peak near it."""
+    if radius_bins <= 0 or curve.size == 0:
+        return curve
+    out = curve.copy()
+    for r in range(1, radius_bins + 1):
+        out[r:] = np.maximum(out[r:], curve[:-r])
+        out[:-r] = np.maximum(out[:-r], curve[r:])
+    return out
+
+
+def fuse_channels(channels: dict[str, np.ndarray], profile: ContentProfile) -> np.ndarray:
+    """Weighted sum of per-video-normalized channels, weights renormalized over what exists."""
+    present = {k: v for k, v in channels.items() if profile.channel_weights.get(k, 0) > 0 and v.size}
+    if not present:
+        return np.zeros(next(iter(channels.values())).size if channels else 0)
+    total_w = sum(profile.channel_weights[k] for k in present)
+    fused = None
+    for k, curve in present.items():
+        w = profile.channel_weights[k] / total_w
+        contrib = w * _robust_z(curve)
+        fused = contrib if fused is None else fused + contrib
+    return fused
+
+
+def pick_peaks(curve: np.ndarray, height: float, min_gap_bins: int) -> list[int]:
+    """Local maxima above `height`, then greedy non-maximum suppression by min gap.
+
+    Peaks are taken in descending value order and a lower peak is dropped if it falls
+    within min_gap_bins of an already-chosen higher one (1-D NMS).
+    """
+    if curve.size == 0:
+        return []
+    candidates = [
+        i for i in range(1, len(curve) - 1)
+        if curve[i] >= curve[i - 1] and curve[i] >= curve[i + 1] and curve[i] >= height
+    ]
+    if curve[0] >= height and (len(curve) == 1 or curve[0] > curve[1]):
+        candidates.append(0)
+    if len(curve) > 1 and curve[-1] >= height and curve[-1] > curve[-2]:
+        candidates.append(len(curve) - 1)
+    candidates.sort(key=lambda i: curve[i], reverse=True)
+    chosen: list[int] = []
+    for i in candidates:
+        if all(abs(i - j) >= min_gap_bins for j in chosen):
+            chosen.append(i)
+    return sorted(chosen)
+
+
+def _snap_to_quiet(target_sec: float, energy_curve: np.ndarray, window_sec: float = 1.5) -> float:
+    """Nudge a boundary to the quietest second nearby, so cuts land in a lull not mid-action."""
+    if energy_curve.size == 0:
+        return target_sec
+    center = int(round(target_sec * GRID_HZ))
+    lo = max(0, center - int(window_sec * GRID_HZ))
+    hi = min(len(energy_curve), center + int(window_sec * GRID_HZ) + 1)
+    if lo >= hi:
+        return target_sec
+    local = energy_curve[lo:hi]
+    return (lo + int(np.argmin(local))) / GRID_HZ
+
+
+def _window_for_peak(
+    peak_sec: float,
+    reaction_level: float,
+    profile: ContentProfile,
+    duration: float,
+    energy_curve: np.ndarray,
+    min_dur: float,
+    max_dur: float,
+) -> tuple[float, float, bool]:
+    """Clip window for a peak. A reaction peak expands backwards from the reaction onset."""
+    is_reaction = reaction_level >= 0.15
+    if is_reaction:
+        # The funny thing happens BEFORE the laugh, so keep the run-up: expand backwards
+        # from the reaction, snap only the end to a lull, and grow the start (not the
+        # end) if we're under the minimum so the payoff stays put.
+        start = max(0.0, peak_sec - profile.reaction_lookback_sec)
+        end = _snap_to_quiet(min(duration, peak_sec + profile.reaction_payoff_sec), energy_curve)
+        if end - start < min_dur:
+            start = max(0.0, end - min_dur)
+        elif end - start > max_dur:
+            start = end - max_dur
+    else:
+        half = min_dur / 2.0
+        start = _snap_to_quiet(max(0.0, peak_sec - half), energy_curve)
+        end = _snap_to_quiet(min(duration, peak_sec + half), energy_curve)
+        if end - start < min_dur:
+            end = min(duration, start + min_dur)
+        elif end - start > max_dur:
+            end = start + max_dur
+    return round(max(0.0, start), 1), round(min(duration, end), 1), is_reaction
+
+
+def detect_highlights(
+    video_path: str,
+    profile_name: str = "party",
+    top_n: int = 8,
+    min_dur: float = 8.0,
+    max_dur: float = 60.0,
+    height_z: float = 1.0,
+    progress_callback: Optional[Callable] = None,
+) -> list[dict]:
+    """
+    Generate highlight clips from a video's fused signal curve (no transcript needed).
+
+    Returns clip dicts compatible with the render pipeline:
+    {title, start_second, end_second, duration, score, reasons, preview}.
+    """
+    profile = get_profile(profile_name)
+
+    if progress_callback:
+        progress_callback(10, "Analyzing audio energy...")
+    energy_data = extract_audio_energy(video_path)
+
+    events_data = []
+    if audio_events_available():
+        if progress_callback:
+            progress_callback(40, "Detecting laughter and reactions...")
+        events_data = extract_audio_events(video_path)
+
+    last_times = [e["time"] for e in energy_data] + [e["time"] for e in events_data]
+    if not last_times:
+        return []
+    duration = max(last_times) + 1.0
+    n_bins = int(duration * GRID_HZ) + 1
+
+    energy_curve = _energy_curve(energy_data, n_bins)
+    # Dilate reactions by ~2s so a single-frame laugh isn't suppressed by a louder
+    # energy neighbor and so the fused peak lands on the reaction, not next to it.
+    reaction_curve = _dilate(_reaction_curve(events_data, n_bins), int(2 * GRID_HZ))
+
+    fused = fuse_channels(
+        {"energy": energy_curve, "audio_event": reaction_curve}, profile
+    )
+    if fused.size == 0:
+        return []
+
+    if progress_callback:
+        progress_callback(70, "Selecting highlight moments...")
+    min_gap_bins = max(1, int(profile.peak_min_gap_sec * GRID_HZ))
+
+    # Reaction moments are primary candidates — a detected laugh/cheer is almost always
+    # worth a clip regardless of loudness, so they aren't made to out-compete energy in
+    # the blended curve. Energy peaks then fill the rest, minus any that collide with a
+    # reaction. Reaction score is offset above energy so reactions rank first.
+    reaction_peaks = pick_peaks(reaction_curve, 0.15, min_gap_bins)
+    energy_peaks = pick_peaks(fused, height_z, min_gap_bins)
+
+    candidates = [(i, float(reaction_curve[i]), True) for i in reaction_peaks]
+    reaction_bins = {i for i in reaction_peaks}
+    for i in energy_peaks:
+        if all(abs(i - j) >= min_gap_bins for j in reaction_bins):
+            candidates.append((i, float(fused[i]), False))
+
+    def rank_key(c):
+        i, val, is_reaction = c
+        return (1 if is_reaction else 0, val)
+
+    candidates.sort(key=rank_key, reverse=True)
+    candidates = candidates[:top_n]
+
+    clips = []
+    for i, val, want_reaction in candidates:
+        peak_sec = i / GRID_HZ
+        reaction_level = float(reaction_curve[i]) if want_reaction else 0.0
+        start, end, is_reaction = _window_for_peak(
+            peak_sec, reaction_level, profile, duration, energy_curve, min_dur, max_dur
+        )
+        if end - start < min_dur * 0.75:
+            continue
+        kind = "laugh/cheer" if is_reaction else "high energy"
+        score = round(10.0 + reaction_level * 10.0, 2) if is_reaction else round(float(val), 2)
+        clips.append({
+            "title": f"Highlight ({kind}) at {int(peak_sec // 60):d}:{int(peak_sec % 60):02d}",
+            "start_second": start,
+            "end_second": end,
+            "duration": round(end - start),
+            "score": score,
+            "reasons": ["reaction"] if is_reaction else ["energy_peak"],
+            "preview": "",
+            "content_type": "highlight",
+        })
+
+    clips.sort(key=lambda c: c["start_second"])
+    if progress_callback:
+        progress_callback(100, f"Found {len(clips)} highlights")
+    return clips
+
+
+def detect_highlights_pooled(
+    video_paths: list[str],
+    profile_name: str = "party",
+    top_n: int = 15,
+    min_dur: float = 8.0,
+    max_dur: float = 60.0,
+    progress_callback: Optional[Callable] = None,
+) -> list[dict]:
+    """
+    Detect highlights across many videos and rank them globally — "the best N bits
+    from tonight" across a folder of party clips.
+
+    Each returned clip carries a `source_file`. Ranking is reaction-first, then by
+    score, so a genuine laugh in any file outranks a merely loud moment in another.
+    """
+    pooled: list[dict] = []
+    n = len(video_paths) or 1
+    for idx, path in enumerate(video_paths):
+        clips = detect_highlights(
+            path, profile_name=profile_name, top_n=top_n, min_dur=min_dur, max_dur=max_dur
+        )
+        for c in clips:
+            c["source_file"] = path
+        pooled.extend(clips)
+        if progress_callback:
+            progress_callback(
+                int((idx + 1) / n * 100), f"{path}: {len(clips)} highlights"
+            )
+
+    pooled.sort(
+        key=lambda c: (1 if "reaction" in c.get("reasons", []) else 0, c.get("score", 0)),
+        reverse=True,
+    )
+    return pooled[:top_n]
@@ -95,9 +95,9 @@ A new `profile` param threads the **same ~12 hops the `format` field did**: `sug
 
 ## Phasing
 
-- **Phase 0 — profile scaffolding, zero behavior change.** Add `ContentProfile` abstraction; thread the `profile` param through the ~12 hops; `default = podcast` reproduces current selection exactly. **Gate: existing test suite green; same clips out for a fixed transcript.**
-- **Phase 1 — audio-event channel (the isolated valuable core).** YAMNet-ONNX laughter/cheer/applause/scream computed in the detect-once hub. Feeds podcast ranking as a labeled signal (laughs already spike energy; now they're *named*) and lays the party foundation. **Gate: laughter timestamps validated on a sample clip; podcast output unchanged unless the channel is given weight.**
-- **Phase 2 — fusion engine + saliency candidate source + party profile (audio-only).** `saliency.py` fusion + numpy peak-pick + reaction-expand (8 s) + boundary-snap. Party profile = energy + audio_event + prosody, no transcript, no motion. **Party videos auto-clip end to end. Gate: demo on real party footage.**
+- **Phase 0 — profile scaffolding, zero behavior change.** [DONE, PR #43 / Phase 2 branch] `ContentProfile` abstraction added (`profiles.py`); `profile` param threaded through the CLI (`--profile`, config, selection signature). Python-side default `podcast` reproduces current selection. TS-side threading (MCP/web `profile` param) still open — see below.
+- **Phase 1 — audio-event channel (the isolated valuable core).** [DONE, PR #43] YAMNet-ONNX laughter/cheer/applause/scream (`audio_events.py`) computed in the detect-once hub; surfaced in the packed transcript and CLI heuristic. Podcast output unchanged unless the channel is weighted. Validated on 32 real clips.
+- **Phase 2 — fusion engine + saliency candidate source + party profile (audio-only).** [DONE, this branch] `saliency.py`: per-video robust-z normalization, weighted fusion, numpy peak-pick (NMS), reaction dilation, reaction-first candidate generation, 8 s backward expansion, quiet-point boundary snap. Wired to `--profile party|action` in `podcli process`. Deterministic on clean input; reactions detected with correct run-up windows. **Gate remaining: tune thresholds/weights on real party footage (synthetic podcast-clip concat is not representative).**
 - **Phase 3 — visual channels + action profile + multi-file pooling.** Optical flow (OpenCV) + face-reaction channels; action profile; pool peaks across a *folder* of clips and rank globally ("best 15 bits from tonight" across 80 phone videos). **Gate: catches a silent visual gag; folder-level ranking works.**
 - **Phase 4 (optional) — highlight reel renderer.** Ordering, pacing, optional music-bed ducking, transitions — a thin renderer atop the detected moments, reusing the clip-render stack.