Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions backend/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,8 @@ def cmd_process(args):
config["crop_strategy"] = args.crop
if getattr(args, "format", None):
config["format"] = args.format
if getattr(args, "profile", None):
config["profile"] = args.profile
if getattr(args, "thumbnails", None) is not None:
config["generate_thumbnails"] = args.thumbnails
if args.top:
Expand Down Expand Up @@ -694,6 +696,31 @@ def _transcribe_progress(pct, msg):
clips = resumed
resumed_from_session = True

# Saliency profiles (party/action) pick moments from the fused laughter/energy
# curve rather than the transcript, so they work on footage with no dialogue.
from services.profiles import get_profile

content_profile = get_profile(config.get("profile"))
if content_profile.candidate_source == "saliency" and not clips:
from services.saliency import detect_highlights
from services.formats import get_format

spec = get_format(config.get("format", "vertical"))
print(f" [3/4] Detecting {content_profile.name} highlights (laughter + energy)...")
clips = detect_highlights(
video_path,
profile_name=content_profile.name,
top_n=top_n,
min_dur=min(8.0, float(spec.dur_min)),
max_dur=float(spec.dur_max),
progress_callback=lambda pct, msg: print(f" {msg}") if msg else None,
)
if clips:
print(f" ✓ {len(clips)} highlights found")
_save_suggestions_session(cache_hash, top_n, "saliency", clips, selection_sig)
else:
print(" ⚠ No highlights found, falling back to transcript selection")

# Try an AI CLI first (uses PodStack knowledge base for intelligent selection)
from services.claude_suggest import suggest_initial_with_claude, _engine_label, _find_ai_cli

Expand Down Expand Up @@ -3338,6 +3365,7 @@ def main():
proc.add_argument("--caption-style", choices=["branded", "hormozi", "karaoke", "subtle"])
proc.add_argument("--crop", choices=["center", "face", "speaker", "speaker-hardcut"])
proc.add_argument("--format", choices=["vertical", "horizontal", "square"], help="Output aspect ratio (default: vertical)")
proc.add_argument("--profile", choices=["podcast", "party", "action"], help="Detection profile: podcast (transcript-first, default), party/action (laughter/energy highlights)")
proc.add_argument("--logo", help="Logo image (asset name or path)")
proc.add_argument("--outro", help="Outro video (asset name or path)")
proc.add_argument("--time-adjust", type=float, help="Timestamp offset in seconds")
Expand Down
32 changes: 32 additions & 0 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,37 @@ def handle_analyze_energy(task_id: str, params: dict):
emit_result(task_id, "success", data=result)


def handle_detect_highlights(task_id: str, params: dict):
    """Task handler: run saliency highlight detection and emit the resulting clips.

    `params` may carry either `video_path` (one file) or `video_paths` (a list that
    is pooled and ranked together); `video_paths` wins when both are given. Optional
    keys: `profile` (default "party"), `min_dur` (8.0), `max_dur` (60.0) and `top_n`
    (default 15 when pooling, 8 for a single file).

    Emits an "error" result when neither path key is set, otherwise a "success"
    result whose data is {"clips": [...], "count": <number of clips>}.
    """
    from services.saliency import detect_highlights, detect_highlights_pooled

    many = params.get("video_paths")
    single = params.get("video_path", "")
    if not (many or single):
        emit_result(task_id, "error", error="video_path or video_paths is required")
        return

    def report(pct, msg):
        # Relay detector progress to the client under the "detecting" stage.
        return emit_progress(task_id, "detecting", pct, msg)

    # Keyword arguments shared by the single-file and pooled detectors.
    shared = {
        "profile_name": params.get("profile", "party"),
        "min_dur": float(params.get("min_dur", 8.0)),
        "max_dur": float(params.get("max_dur", 60.0)),
        "progress_callback": report,
    }

    if many:
        clips = detect_highlights_pooled(
            video_paths=many, top_n=int(params.get("top_n", 15)), **shared
        )
    else:
        clips = detect_highlights(
            video_path=single, top_n=int(params.get("top_n", 8)), **shared
        )

    emit_result(task_id, "success", data={"clips": clips, "count": len(clips)})


def handle_detect_encoder(task_id: str, params: dict):
"""Detect available hardware encoders."""
from services.encoder import get_encoder_info
Expand Down Expand Up @@ -601,6 +632,7 @@ def handle_run_integration_tool(task_id: str, params: dict):
"create_clip": handle_create_clip,
"batch_clips": handle_batch_clips,
"analyze_energy": handle_analyze_energy,
"detect_highlights": handle_detect_highlights,
"pack_transcript": handle_pack_transcript,
"detect_encoder": handle_detect_encoder,
"presets": handle_presets,
Expand Down
281 changes: 281 additions & 0 deletions backend/services/saliency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
"""Multi-signal saliency engine — fuse channels, peak-pick, expand reactions, snap.

For profiles whose candidate source is "saliency" (party, action), moments are
generated from a fused interestingness curve instead of from an LLM reading the
transcript. This is what lets podcli auto-cut highlights from footage with no useful
transcript (party videos, action).

Each channel is normalized against THIS video's own distribution (never a global
scale) so different recordings, rooms and mic levels compare fairly, then combined by
the active profile's weights. Peaks on the fused curve become candidate clips; a peak
driven by a laugh or cheer is expanded backwards to capture the moment that caused the
reaction, since the funny thing happens just before people react to it.
"""

from typing import Optional, Callable

import numpy as np

from services.profiles import get_profile, ContentProfile
from services.audio_analyzer import extract_audio_energy
from services.audio_events import extract_audio_events, is_available as audio_events_available

# Resolution of the shared time grid, in bins per second. Channel curves in this
# module are indexed as int(time_in_seconds * GRID_HZ).
GRID_HZ = 1.0  # common time grid; energy is per-second, so 1 Hz is the natural rate


def _robust_z(x: np.ndarray) -> np.ndarray:
"""Median/MAD normalization — resistant to the heavy tails of RMS and reactions."""
if x.size == 0:
return x
med = np.median(x)
mad = np.median(np.abs(x - med))
scale = 1.4826 * mad if mad > 1e-9 else (np.std(x) or 1.0)
return (x - med) / scale


def _energy_curve(energy_data: list[dict], n_bins: int) -> np.ndarray:
    """Rasterize loudness samples onto the time grid, keeping the loudest per bin.

    Each sample is a dict with a "time" (seconds) and an optional "rms_db". Every
    bin starts at -60 dB and only ever moves up, so missing or quieter-than-floor
    samples leave the bin at the floor. Samples that fall outside the grid are
    ignored.
    """
    floor_db = -60.0
    grid = np.full(n_bins, floor_db)
    for sample in energy_data:
        idx = int(sample["time"] * GRID_HZ)
        if idx < 0 or idx >= n_bins:
            continue
        grid[idx] = max(grid[idx], sample.get("rms_db", floor_db))
    return grid


def _reaction_curve(events_data: list[dict], n_bins: int) -> np.ndarray:
    """Rasterize audio-event samples onto the time grid as a single reaction level.

    For each sample the level is the largest of its "laughter", "cheering" and
    "screaming" values (each treated as 0 when absent); each bin keeps the highest
    level seen. Bins with no sample stay at 0, and samples outside the grid are
    ignored.
    """
    grid = np.zeros(n_bins)
    for event in events_data:
        idx = int(event["time"] * GRID_HZ)
        if not 0 <= idx < n_bins:
            continue
        strongest = max(
            event.get("laughter", 0),
            event.get("cheering", 0),
            event.get("screaming", 0),
        )
        grid[idx] = max(grid[idx], strongest)
    return grid


def _dilate(curve: np.ndarray, radius_bins: int) -> np.ndarray:
"""Spread each spike to its neighbors (grayscale dilation) so a brief, narrow
reaction still aligns with, and can win, the fused peak near it."""
if radius_bins <= 0 or curve.size == 0:
return curve
out = curve.copy()
for r in range(1, radius_bins + 1):
out[r:] = np.maximum(out[r:], curve[:-r])
out[:-r] = np.maximum(out[:-r], curve[r:])
return out


def fuse_channels(channels: dict[str, np.ndarray], profile: ContentProfile) -> np.ndarray:
    """Weighted sum of per-video-normalized channels, weights renormalized over what exists.

    A channel takes part only if the profile gives it a positive weight, it is
    non-empty, AND it actually varies. A constant channel (for example an all-zero
    reaction curve when no events were detected) normalizes to all zeros and carries
    no information; counting its weight would only shrink the other channels and
    silently raise any z-score threshold applied to the fused curve, so it is treated
    as absent.

    Args:
        channels: channel name -> curve. All curves are expected to share one length.
        profile: supplies `channel_weights`, a mapping of channel name -> weight.

    Returns:
        The fused curve. When no channel qualifies, zeros with the length of the
        first channel (or an empty array if `channels` is empty).
    """
    weights = profile.channel_weights
    informative = {
        name: curve
        for name, curve in channels.items()
        if weights.get(name, 0) > 0 and curve.size and curve.max() > curve.min()
    }
    if not informative:
        return np.zeros(next(iter(channels.values())).size if channels else 0)

    total_w = sum(weights[name] for name in informative)
    fused = None
    for name, curve in informative.items():
        contrib = (weights[name] / total_w) * _robust_z(curve)
        fused = contrib if fused is None else fused + contrib
    return fused


def pick_peaks(curve: np.ndarray, height: float, min_gap_bins: int) -> list[int]:
    """Local maxima above `height`, then greedy non-maximum suppression by min gap.

    A peak is a maximal run of equal values (a single bin or a flat plateau) that is
    at least `height` and whose neighbors on both sides are strictly lower; a run
    touching either end of the curve only needs its one existing neighbor to be
    lower. Each run yields exactly one candidate, at its center bin, so:

    * a plateau (such as the flat top left by dilating a spike) is reported once,
      centered on the original spike rather than on the plateau's left edge;
    * a flat shoulder on the way up to a higher peak is not mistaken for a peak.

    Candidates are then taken in descending value order and a lower peak is dropped
    if it falls within `min_gap_bins` of an already-chosen higher one (1-D NMS).

    Returns:
        The surviving peak indices in ascending order.
    """
    n = len(curve)
    if n == 0:
        return []

    candidates: list[int] = []
    run_start = 0
    while run_start < n:
        value = curve[run_start]
        run_end = run_start
        while run_end + 1 < n and curve[run_end + 1] == value:
            run_end += 1
        rises = run_start == 0 or curve[run_start - 1] < value
        falls = run_end == n - 1 or curve[run_end + 1] < value
        if value >= height and rises and falls:
            candidates.append((run_start + run_end) // 2)
        run_start = run_end + 1

    # Highest first; the sort is stable, so equal-valued peaks keep time order.
    candidates.sort(key=lambda i: curve[i], reverse=True)
    chosen: list[int] = []
    for i in candidates:
        if all(abs(i - j) >= min_gap_bins for j in chosen):
            chosen.append(i)
    return sorted(chosen)


def _snap_to_quiet(target_sec: float, energy_curve: np.ndarray, window_sec: float = 1.5) -> float:
    """Move a cut point to the quietest grid bin within +/- `window_sec` of it.

    The search window is clipped to the curve; ties go to the earliest bin. If the
    curve is empty, or the clipped window contains no bins, `target_sec` is returned
    unchanged. Otherwise the result is the chosen bin converted back to seconds.
    """
    if energy_curve.size == 0:
        return target_sec
    reach = int(window_sec * GRID_HZ)
    mid = int(round(target_sec * GRID_HZ))
    first = max(0, mid - reach)
    stop = min(len(energy_curve), mid + reach + 1)
    if first >= stop:
        return target_sec
    quietest = first + int(np.argmin(energy_curve[first:stop]))
    return quietest / GRID_HZ


def _window_for_peak(
    peak_sec: float,
    reaction_level: float,
    profile: ContentProfile,
    duration: float,
    energy_curve: np.ndarray,
    min_dur: float,
    max_dur: float,
) -> tuple[float, float, bool]:
    """Clip window for a peak. A reaction peak expands backwards from the reaction onset.

    Args:
        peak_sec: time of the peak, in seconds.
        reaction_level: reaction strength at the peak; >= 0.15 makes it a reaction.
        profile: supplies `reaction_lookback_sec` and `reaction_payoff_sec`.
        duration: length of the video in seconds; the window is clamped to it.
        energy_curve: loudness curve used to snap boundaries to a quiet bin.
        min_dur: minimum clip length to aim for, in seconds.
        max_dur: maximum clip length, in seconds.

    Returns:
        (start, end, is_reaction) with start/end rounded to 0.1 s and clamped to
        [0, duration]. The window can still be shorter than `min_dur` when the whole
        video is shorter than that.
    """
    is_reaction = reaction_level >= 0.15
    if is_reaction:
        # The funny thing happens BEFORE the laugh, so keep the run-up: expand backwards
        # from the reaction, snap only the end to a lull, and grow the start (not the
        # end) if we're under the minimum so the payoff stays put.
        start = max(0.0, peak_sec - profile.reaction_lookback_sec)
        end = _snap_to_quiet(min(duration, peak_sec + profile.reaction_payoff_sec), energy_curve)
        if end - start < min_dur:
            start = max(0.0, end - min_dur)
            if end - start < min_dur:
                # Start is pinned at the beginning of the video; a slightly late end
                # beats a clip that is too short to keep.
                end = min(duration, start + min_dur)
        elif end - start > max_dur:
            start = end - max_dur
    else:
        half = min_dur / 2.0
        start = _snap_to_quiet(max(0.0, peak_sec - half), energy_curve)
        end = _snap_to_quiet(min(duration, peak_sec + half), energy_curve)
        if end - start < min_dur:
            end = min(duration, start + min_dur)
            if end - start < min_dur:
                # End is pinned at the end of the video; make up the length by
                # starting earlier instead.
                start = max(0.0, end - min_dur)
        elif end - start > max_dur:
            end = start + max_dur
    return round(max(0.0, start), 1), round(min(duration, end), 1), is_reaction


def detect_highlights(
    video_path: str,
    profile_name: str = "party",
    top_n: int = 8,
    min_dur: float = 8.0,
    max_dur: float = 60.0,
    height_z: float = 1.0,
    progress_callback: Optional[Callable] = None,
) -> list[dict]:
    """
    Generate highlight clips from a video's fused signal curve (no transcript needed).

    Args:
        video_path: file to analyze.
        profile_name: content profile to look up via `get_profile`.
        top_n: maximum number of clips to return.
        min_dur: minimum clip length in seconds; windows that end up shorter than
            75% of it are discarded.
        max_dur: maximum clip length in seconds.
        height_z: minimum fused-curve value for an energy peak to be considered.
        progress_callback: optional `callback(percent, message)`.

    Returns:
        Up to `top_n` clip dicts, in chronological order, compatible with the render
        pipeline: {title, start_second, end_second, duration, score, reasons,
        preview, content_type}. Empty when the analyzers return no samples.
    """
    profile = get_profile(profile_name)

    if progress_callback:
        progress_callback(10, "Analyzing audio energy...")
    energy_data = extract_audio_energy(video_path)

    events_data = []
    if audio_events_available():
        if progress_callback:
            progress_callback(40, "Detecting laughter and reactions...")
        events_data = extract_audio_events(video_path)

    last_times = [e["time"] for e in energy_data] + [e["time"] for e in events_data]
    if not last_times:
        return []
    duration = max(last_times) + 1.0
    n_bins = int(duration * GRID_HZ) + 1

    energy_curve = _energy_curve(energy_data, n_bins)
    # Dilate reactions by ~2s so a single-frame laugh isn't suppressed by a louder
    # energy neighbor and so the fused peak lands on the reaction, not next to it.
    reaction_curve = _dilate(_reaction_curve(events_data, n_bins), int(2 * GRID_HZ))

    fused = fuse_channels(
        {"energy": energy_curve, "audio_event": reaction_curve}, profile
    )
    if fused.size == 0:
        return []

    if progress_callback:
        progress_callback(70, "Selecting highlight moments...")
    min_gap_bins = max(1, int(profile.peak_min_gap_sec * GRID_HZ))

    # Reaction moments are primary candidates — a detected laugh/cheer is almost always
    # worth a clip regardless of loudness, so they aren't made to out-compete energy in
    # the blended curve. Energy peaks then fill the rest, minus any that collide with a
    # reaction. The 0.15 floor matches the reaction threshold in _window_for_peak.
    reaction_peaks = pick_peaks(reaction_curve, 0.15, min_gap_bins)
    energy_peaks = pick_peaks(fused, height_z, min_gap_bins)

    # Candidate tuples are (bin, value, is_reaction).
    candidates = [(i, float(reaction_curve[i]), True) for i in reaction_peaks]
    candidates.extend(
        (i, float(fused[i]), False)
        for i in energy_peaks
        if all(abs(i - j) >= min_gap_bins for j in reaction_peaks)
    )
    # Reactions first, then by value, best to worst.
    candidates.sort(key=lambda c: (c[2], c[1]), reverse=True)

    clips = []
    for i, val, want_reaction in candidates:
        # Walk the whole ranked list and stop once top_n clips are accepted, so a
        # candidate rejected below for being too short doesn't cost a slot that a
        # lower-ranked but usable candidate could fill.
        if len(clips) >= top_n:
            break
        peak_sec = i / GRID_HZ
        reaction_level = float(reaction_curve[i]) if want_reaction else 0.0
        start, end, is_reaction = _window_for_peak(
            peak_sec, reaction_level, profile, duration, energy_curve, min_dur, max_dur
        )
        if end - start < min_dur * 0.75:
            continue
        kind = "laugh/cheer" if is_reaction else "high energy"
        # Reaction scores are offset above energy scores so reactions rank first.
        score = round(10.0 + reaction_level * 10.0, 2) if is_reaction else round(float(val), 2)
        clips.append({
            "title": f"Highlight ({kind}) at {int(peak_sec // 60):d}:{int(peak_sec % 60):02d}",
            "start_second": start,
            "end_second": end,
            "duration": round(end - start),
            "score": score,
            "reasons": ["reaction"] if is_reaction else ["energy_peak"],
            "preview": "",
            "content_type": "highlight",
        })

    clips.sort(key=lambda c: c["start_second"])
    if progress_callback:
        progress_callback(100, f"Found {len(clips)} highlights")
    return clips


def detect_highlights_pooled(
    video_paths: list[str],
    profile_name: str = "party",
    top_n: int = 15,
    min_dur: float = 8.0,
    max_dur: float = 60.0,
    progress_callback: Optional[Callable] = None,
) -> list[dict]:
    """
    Detect highlights across many videos and rank them globally — "the best N bits
    from tonight" across a folder of party clips.

    Args:
        video_paths: files to analyze. A single path given as a plain string is
            treated as a one-item list (rather than iterated character by
            character), and None as an empty list.
        profile_name: content profile name passed through to `detect_highlights`.
        top_n: per-file candidate limit and the size of the final pooled result.
        min_dur: minimum clip length in seconds.
        max_dur: maximum clip length in seconds.
        progress_callback: optional `callback(percent, message)`, called once per
            finished file.

    Returns:
        At most `top_n` clips, best first. Each returned clip carries a
        `source_file`. Ranking is reaction-first, then by score, so a genuine laugh
        in any file outranks a merely loud moment in another.
    """
    if isinstance(video_paths, str):
        video_paths = [video_paths]
    paths = list(video_paths or [])

    pooled: list[dict] = []
    n = len(paths) or 1
    for idx, path in enumerate(paths):
        clips = detect_highlights(
            path, profile_name=profile_name, top_n=top_n, min_dur=min_dur, max_dur=max_dur
        )
        for c in clips:
            c["source_file"] = path
        pooled.extend(clips)
        if progress_callback:
            progress_callback(
                int((idx + 1) / n * 100), f"{path}: {len(clips)} highlights"
            )

    pooled.sort(
        key=lambda c: (1 if "reaction" in c.get("reasons", []) else 0, c.get("score", 0)),
        reverse=True,
    )
    return pooled[:top_n]
6 changes: 3 additions & 3 deletions plans/moment-detection.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ A new `profile` param threads the **same ~12 hops the `format` field did**: `sug

## Phasing

- **Phase 0 — profile scaffolding, zero behavior change.** Add `ContentProfile` abstraction; thread the `profile` param through the ~12 hops; `default = podcast` reproduces current selection exactly. **Gate: existing test suite green; same clips out for a fixed transcript.**
- **Phase 1 — audio-event channel (the isolated valuable core).** YAMNet-ONNX laughter/cheer/applause/scream computed in the detect-once hub. Feeds podcast ranking as a labeled signal (laughs already spike energy; now they're *named*) and lays the party foundation. **Gate: laughter timestamps validated on a sample clip; podcast output unchanged unless the channel is given weight.**
- **Phase 2 — fusion engine + saliency candidate source + party profile (audio-only).** `saliency.py` fusion + numpy peak-pick + reaction-expand (8 s) + boundary-snap. Party profile = energy + audio_event + prosody, no transcript, no motion. **Party videos auto-clip end to end. Gate: demo on real party footage.**
- **Phase 0 — profile scaffolding, zero behavior change.** [DONE, PR #43 / Phase 2 branch] `ContentProfile` abstraction added (`profiles.py`); `profile` param threaded through the CLI (`--profile`, config, selection signature). Python-side default `podcast` reproduces current selection. TS-side threading (MCP/web `profile` param) still open — see below.
- **Phase 1 — audio-event channel (the isolated valuable core).** [DONE, PR #43] YAMNet-ONNX laughter/cheer/applause/scream (`audio_events.py`) computed in the detect-once hub; surfaced in the packed transcript and CLI heuristic. Podcast output unchanged unless the channel is weighted. Validated on 32 real clips.
- **Phase 2 — fusion engine + saliency candidate source + party profile (audio-only).** [DONE, this branch] `saliency.py`: per-video robust-z normalization, weighted fusion, numpy peak-pick (NMS), reaction dilation, reaction-first candidate generation, 8 s backward expansion, quiet-point boundary snap. Wired to `--profile party|action` in `podcli process`. Deterministic on clean input; reactions detected with correct run-up windows. **Gate remaining: tune thresholds/weights on real party footage (synthetic podcast-clip concat is not representative).**
- **Phase 3 — visual channels + action profile + multi-file pooling.** Optical flow (OpenCV) + face-reaction channels; action profile; pool peaks across a *folder* of clips and rank globally ("best 15 bits from tonight" across 80 phone videos). **Gate: catches a silent visual gag; folder-level ranking works.**
- **Phase 4 (optional) — highlight reel renderer.** Ordering, pacing, optional music-bed ducking, transitions — a thin renderer atop the detected moments, reusing the clip-render stack.

Expand Down
Loading