diff --git a/sound/install_make_tts/batch_collector.py b/sound/install_make_tts/batch_collector.py index a2a3e7c..20edc54 100644 --- a/sound/install_make_tts/batch_collector.py +++ b/sound/install_make_tts/batch_collector.py @@ -29,8 +29,8 @@ DOWNLOAD_DIR = os.path.join(_HERE, "downloads") COUNTER_FILE = os.path.join(DOWNLOAD_DIR, ".batch_counter") -BATCH_SIZE = 200 -POLL_INTERVAL = 30 # seconds +BATCH_SIZE = 1000 +POLL_INTERVAL = 600 # seconds # --------------------------------------------------------------------------- diff --git a/sound/install_make_tts/install_make_coqui.py b/sound/install_make_tts/install_make_coqui.py new file mode 100644 index 0000000..9d90a38 --- /dev/null +++ b/sound/install_make_tts/install_make_coqui.py @@ -0,0 +1,397 @@ +""" +Coqui VITS TTS batch runner. + +Reads all CSVs from the `input_csvs/` folder next to this script, generates +WAV files with Coqui TTS (VCTK multi-speaker VITS), and maintains the same +`have_barked.csv` deduplication log so already-processed image_ids are skipped +across runs and across Bark/Coqui jobs. 
+ +Score range: 0.6 <= topic_fit < 0.65 +Speaker: random VCTK speaker picked per line (109 available) +Output: tts_bark_out/ (same dir as Bark — picked up by batch_collector.py unchanged) +""" + +from __future__ import annotations + +import argparse +import contextlib +import csv +import logging +import os +import random +import time +from dataclasses import dataclass, field +from typing import Iterable, Optional, Set + +import torch + + +# ── Logging noise suppression ───────────────────────────────────────────────── + +class _SuppressCoquiNoise(logging.Filter): + _PATTERNS = ("coqpit", "config", "model", "loading", "setting") + def filter(self, record: logging.LogRecord) -> bool: + msg = record.getMessage().lower() + return not any(p in msg for p in self._PATTERNS) + +for _logger_name in ("TTS", "TTS.tts", "TTS.utils", "coqpit"): + logging.getLogger(_logger_name).addFilter(_SuppressCoquiNoise()) + + +# ── Paths & constants ───────────────────────────────────────────────────────── + +_HERE = os.path.dirname(os.path.abspath(__file__)) + +IN_CSV_DIR = os.path.join(_HERE, "input_csvs") +OUT_DIR = os.path.join(_HERE, "tts_bark_out") # shared with Bark +HAVE_BARKED_CSV = os.path.join(_HERE, "have_barked.csv") + +TOPIC_FIT_FIELD = "topic_fit" +TOPIC_FIT_MIN = 0.6 +TOPIC_FIT_MAX = 0.7 + +MAX_PROCESSED = 0 # 0 = no limit + +# Full VCTK speaker list for tts_models/en/vctk/vits +# Used as fallback if tts.speakers is unavailable +VCTK_SPEAKERS = [ + "p225","p226","p227","p228","p229","p230","p231","p232","p233","p234", + "p236","p237","p238","p239","p240","p241","p243","p244","p245","p246", + "p247","p248","p249","p250","p251","p252","p253","p254","p255","p256", + "p257","p258","p259","p260","p261","p262","p263","p264","p265","p266", + "p267","p268","p269","p270","p271","p272","p273","p274","p275","p276", + "p277","p278","p279","p280","p281","p282","p283","p284","p285","p286", + "p287","p288","p292","p293","p294","p295","p297","p298","p299","p300", + 
"p301","p302","p303","p304","p305","p306","p307","p308","p310","p311", + "p312","p313","p314","p316","p317","p318","p323","p326","p329","p330", + "p333","p334","p335","p336","p339","p340","p341","p343","p345","p347", + "p351","p360","p361","p362","p363","p364","p374","p376", +] + + +# ── CSV helpers (identical to Bark script) ──────────────────────────────────── + +def _safe_int(value: object) -> Optional[int]: + try: + if value is None: + return None + s = str(value).strip() + if s == "": + return None + return int(float(s)) + except Exception: + return None + + +def _load_have_barked_ids(have_barked_csv: str) -> Set[int]: + if not os.path.exists(have_barked_csv): + return set() + ids: Set[int] = set() + with open(have_barked_csv, "r", encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames and "image_id" in reader.fieldnames: + for row in reader: + image_id = _safe_int(row.get("image_id")) + if image_id is not None: + ids.add(image_id) + else: + f.seek(0) + raw = csv.reader(f) + for r in raw: + if not r: + continue + image_id = _safe_int(r[0]) + if image_id is not None: + ids.add(image_id) + return ids + + +def _append_have_barked_id(have_barked_csv: str, image_id: int) -> None: + exists = os.path.exists(have_barked_csv) + os.makedirs(os.path.dirname(os.path.abspath(have_barked_csv)) or ".", exist_ok=True) + with open(have_barked_csv, "a", encoding="utf-8", newline="") as f: + writer = csv.DictWriter(f, fieldnames=["image_id"]) + if not exists: + writer.writeheader() + writer.writerow({"image_id": image_id}) + + +def _collect_input_csvs(csv_dir: str) -> list[str]: + if not os.path.isdir(csv_dir): + raise FileNotFoundError( + f"input_csvs folder not found: {csv_dir}\n" + "Create it and place your CSV files inside before running." 
+ ) + paths = sorted( + os.path.join(csv_dir, f) + for f in os.listdir(csv_dir) + if f.lower().endswith(".csv") + ) + if not paths: + raise FileNotFoundError(f"No .csv files found in {csv_dir}") + return paths + + +def _iter_rows(input_csv: str) -> Iterable[dict]: + with open(input_csv, "r", encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f) + for row in reader: + yield row + + +def _prescan_csvs(input_csvs: list[str], image_id_field: str) -> tuple[int, int]: + """Return (total_rows, total_in_topic_fit) across all input CSVs.""" + total_rows = 0 + total_in_topic_fit = 0 + for path in input_csvs: + for row in _iter_rows(path): + if _safe_int(row.get(image_id_field)) is None: + continue + total_rows += 1 + fit_raw = row.get(TOPIC_FIT_FIELD) + try: + fit = float(fit_raw) if fit_raw is not None and str(fit_raw).strip() != "" else None + except Exception: + fit = None + if fit is not None and TOPIC_FIT_MIN <= fit < TOPIC_FIT_MAX: + total_in_topic_fit += 1 + return total_rows, total_in_topic_fit + + +# ── CoquiVITS wrapper ───────────────────────────────────────────────────────── + +@dataclass +class CoquiVITS: + """ + Thin wrapper around Coqui TTS VCTK-VITS. + + VITS is non-autoregressive — inference on short texts is fast (~50-150ms + per line on a 4090). Each line gets a freshly random speaker from the + full 109-speaker VCTK set. 
+ """ + _tts: object # TTS instance — untyped to avoid import-time dep + sample_rate: int + speaker_list: list[str] = field(default_factory=list) + + @classmethod + def load(cls, device: Optional[str] = None) -> "CoquiVITS": + from TTS.api import TTS # pip install TTS + + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + + print(f"Loading Coqui VCTK-VITS on {device} …") + tts = TTS( + model_name="tts_models/en/vctk/vits", + progress_bar=False, + gpu=(device == "cuda"), + ) + + # Prefer the live speaker list from the loaded model + try: + speakers = list(tts.speakers) if tts.speakers else VCTK_SPEAKERS + except Exception: + speakers = VCTK_SPEAKERS + + sample_rate = 22050 + try: + sample_rate = tts.synthesizer.output_sample_rate + except Exception: + pass + + print(f"Coqui VCTK-VITS ready. {len(speakers)} speakers. " + f"Sample rate: {sample_rate} Hz") + return cls(_tts=tts, sample_rate=sample_rate, speaker_list=speakers) + + def synthesize_to_wav(self, text: str, out_wav_path: str, speaker: str) -> str: + os.makedirs(os.path.dirname(os.path.abspath(out_wav_path)) or ".", exist_ok=True) + with open(os.devnull, "w") as _devnull, contextlib.redirect_stdout(_devnull): + self._tts.tts_to_file(text=text, speaker=speaker, file_path=out_wav_path) + return out_wav_path + + def random_speaker(self) -> str: + return random.choice(self.speaker_list) + + +# ── Output path ─────────────────────────────────────────────────────────────── + +def _build_out_path(out_dir: str, image_id: int, speaker: str) -> str: + filename = f"{image_id}_coqui_{speaker}.wav" + return os.path.join(out_dir, filename) + + +# ── Pending item ────────────────────────────────────────────────────────────── + +@dataclass +class _PendingItem: + image_id: int + text: str + out_path: str + speaker: str + + +# ── Flush ───────────────────────────────────────────────────────────────────── + +def _flush_batch( + tts: CoquiVITS, + pending: list[_PendingItem], + already: Set[int], +) -> 
tuple[int, list[str]]: + if not pending: + return 0, [] + + written: list[str] = [] + for item in pending: + try: + tts.synthesize_to_wav(item.text, item.out_path, speaker=item.speaker) + _append_have_barked_id(HAVE_BARKED_CSV, item.image_id) + already.add(item.image_id) + written.append(item.out_path) + except Exception as e: + print(f" Failed image_id={item.image_id} speaker={item.speaker}: " + f"{type(e).__name__}: {e}") + + return len(written), written + + +# ── Argparser ───────────────────────────────────────────────────────────────── + +def _build_argparser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="Batch-generate WAV files using Coqui VCTK-VITS TTS." + ) + p.add_argument( + "--text-field", default="description", + help="CSV column name containing text to synthesize (default: description).", + ) + p.add_argument( + "--image-id-field", default="image_id", + help="CSV column name for image_id (default: image_id).", + ) + p.add_argument( + "--device", default=None, + help="Force device (cuda/cpu). Defaults to auto-detect.", + ) + p.add_argument( + "--batch-size", type=int, default=32, + help=( + "Items to accumulate before flushing progress log (default: 32). " + "VITS processes items individually so this controls log frequency only." 
+ ), + ) + return p + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main() -> None: + args = _build_argparser().parse_args() + + os.makedirs(OUT_DIR, exist_ok=True) + already = _load_have_barked_ids(HAVE_BARKED_CSV) + print(f"Loaded {len(already)} already-processed image_ids from have_barked.csv") + + input_csvs = _collect_input_csvs(IN_CSV_DIR) + print(f"Found {len(input_csvs)} input CSV(s): " + f"{[os.path.basename(p) for p in input_csvs]}") + + print("Pre-scanning CSVs …") + total_rows, total_in_topic_fit = _prescan_csvs(input_csvs, args.image_id_field) + pct = (total_in_topic_fit / total_rows * 100.0) if total_rows else 0.0 + print(f" total_rows={total_rows} " + f"in_topic_fit={total_in_topic_fit} ({pct:.1f}%)") + + start_time = time.time() + tts = CoquiVITS.load(device=args.device) + + successes = 0 + skipped_already = 0 + skipped_topic_fit = 0 + done = False + pending: list[_PendingItem] = [] + + def flush() -> None: + nonlocal successes + n, _ = _flush_batch(tts, pending, already) + successes += n + pending.clear() + + def _log_progress() -> None: + rows_touched = successes + skipped_already + skipped_topic_fit + pct_rows = (rows_touched / total_rows * 100.0) if total_rows else 0.0 + topic_done = successes + skipped_already + pct_topic = (topic_done / total_in_topic_fit * 100.0) if total_in_topic_fit else 0.0 + elapsed = time.time() - start_time + h, rem = divmod(int(elapsed), 3600) + m, s = divmod(rem, 60) + rate = successes / elapsed if elapsed > 0 else 0.0 + print( + f"[{h:02d}:{m:02d}:{s:02d}]", + "Progress:", + f"processed={successes} ({rate:.2f}/s)", + f"skipped_already={skipped_already}", + f"skipped_topic_fit={skipped_topic_fit}", + f"rows_touched={rows_touched}/{total_rows} ({pct_rows:.1f}%)", + f"topic_fit_range=[{TOPIC_FIT_MIN},{TOPIC_FIT_MAX})", + f"done_of_topic_fit={topic_done}/{total_in_topic_fit} ({pct_topic:.1f}%)", + ) + + for input_csv in input_csvs: + if done: + break + print(f"\n--- Processing 
{os.path.basename(input_csv)} ---") + for row in _iter_rows(input_csv): + image_id = _safe_int(row.get(args.image_id_field)) + if image_id is None: + continue + if image_id in already: + skipped_already += 1 + continue + + fit_raw = row.get(TOPIC_FIT_FIELD) + try: + fit = ( + float(fit_raw) + if fit_raw is not None and str(fit_raw).strip() != "" + else None + ) + except Exception: + fit = None + if fit is None or fit < TOPIC_FIT_MIN or fit >= TOPIC_FIT_MAX: + skipped_topic_fit += 1 + continue + + text = str(row.get(args.text_field, "")).strip() + if not text: + continue + + speaker = tts.random_speaker() + out_path = _build_out_path(OUT_DIR, image_id=image_id, speaker=speaker) + pending.append(_PendingItem( + image_id=image_id, text=text, + out_path=out_path, speaker=speaker, + )) + + if len(pending) >= args.batch_size: + flush() + _log_progress() + + if MAX_PROCESSED and successes >= MAX_PROCESSED: + done = True + break + + if pending and not done: + flush() + + _log_progress() + + elapsed = time.time() - start_time + h, rem = divmod(int(elapsed), 3600) + m, s = divmod(rem, 60) + rate = successes / elapsed if elapsed > 0 else 0.0 + print(f"\n[{h:02d}:{m:02d}:{s:02d}] Final: " + f"processed={successes} ({rate:.2f}/s) output_dir={OUT_DIR}") + + +if __name__ == "__main__": + main() diff --git a/sound/install_make_tts/setup_runpod_coqui.py b/sound/install_make_tts/setup_runpod_coqui.py new file mode 100644 index 0000000..8849d76 --- /dev/null +++ b/sound/install_make_tts/setup_runpod_coqui.py @@ -0,0 +1,85 @@ +""" +RunPod dependency installer for install_make_coqui.py. 
"""
RunPod dependency installer for install_make_coqui.py.

Before running this script, install PyTorch manually (do this once per session):

    pip install --upgrade --force-reinstall torch torchvision torchaudio \\
        --index-url https://download.pytorch.org/whl/cu124

Then run this script:

    python setup_runpod_coqui.py

Then launch the TTS job:

    python install_make_coqui.py --batch-size 32
"""

import shutil
import subprocess
import sys


def _pip_command(*args: str) -> list:
    """Return the ``pip install`` command line used for *args*."""
    # NOTE(review): --ignore-installed makes pip reinstall every dependency it
    # resolves, which presumably includes torch when installing TTS. Confirm
    # that the manually installed CUDA build of torch survives this step.
    return [
        sys.executable, "-m", "pip", "install",
        "--upgrade",
        "--ignore-installed",  # skip distutils-managed system packages (e.g. blinker)
        *args,
    ]


def pip(*args: str) -> None:
    """Install or upgrade the given packages with the running interpreter's pip.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call(_pip_command(*args))


def _install_python_packages() -> None:
    """Install Coqui TTS and the supporting audio/numeric packages."""
    print("=== Installing Coqui TTS dependencies ===")

    # Core Coqui TTS package. Pulls in coqpit, librosa, inflect,
    # anyascii, phonemizer, trainer, etc.
    pip("TTS")

    print("\n=== Installing audio / numeric support packages ===")
    pip(
        "scipy",
        "numpy",
        "soundfile",  # used internally by Coqui for WAV I/O
        "pick",       # batch_collector.py dependency
    )


def _install_espeak_ng() -> bool:
    """Install espeak-ng at OS level; return True when it is available.

    Failure is reported as a warning rather than raised, so a missing
    ``apt-get`` binary or a failed install does not abort the whole setup.
    """
    # espeak-ng is required by the phonemizer backend that VCTK-VITS uses.
    # Must be installed at OS level, not via pip.
    print("\n=== Installing espeak-ng (required for VITS phonemizer) ===")
    if shutil.which("espeak-ng"):
        print("espeak-ng already installed.")
        return True
    try:
        # Best-effort refresh: fresh containers often ship with empty package
        # lists, in which case `apt-get install` cannot locate the package.
        subprocess.call(["apt-get", "update"])
        subprocess.check_call(["apt-get", "install", "-y", "espeak-ng"])
    except (subprocess.CalledProcessError, OSError) as exc:
        # OSError covers a missing apt-get executable (non-Debian images).
        print(
            f"WARNING: apt-get install espeak-ng failed ({exc}).\n"
            "If you see phonemizer errors at runtime, install manually:\n"
            "    apt-get install -y espeak-ng"
        )
        return False
    print("espeak-ng installed.")
    return True


def _report_gpu() -> None:
    """Print whether PyTorch can see a CUDA device."""
    print("\n=== Verifying GPU visibility ===")
    try:
        import torch  # noqa: PLC0415
    except ImportError as exc:
        print(f"WARNING: could not import torch ({exc}). Install PyTorch first.")
        return
    if torch.cuda.is_available():
        name = torch.cuda.get_device_name(0)
        vram = torch.cuda.get_device_properties(0).total_memory / 1024 ** 3
        print(f"GPU detected: {name}  ({vram:.1f} GB VRAM)")
        print(f"CUDA version: {torch.version.cuda}")
    else:
        print("WARNING: No CUDA GPU detected. Coqui will run on CPU (slower).")


def _predownload_model() -> None:
    """Load the VCTK-VITS model once on CPU so its weights get downloaded."""
    print("\n=== Pre-downloading VCTK-VITS model weights ===")
    print("Downloads ~150 MB on first run, cached to ~/.local/share/tts/")
    try:
        from TTS.api import TTS
        tts = TTS(model_name="tts_models/en/vctk/vits", progress_bar=True, gpu=False)
        speakers = tts.speakers if tts.speakers else []
        print(f"Model ready. {len(speakers)} speakers available.")
        del tts
    except Exception as e:
        # Deliberately non-fatal: the runner downloads the model on demand.
        print(f"Pre-download failed (non-fatal): {e}")
        print("Model will be downloaded on first run of install_make_coqui.py instead.")


def main() -> None:
    """Run every setup step in order."""
    _install_python_packages()
    _install_espeak_ng()
    _report_gpu()
    _predownload_model()
    print("\nSetup complete. Run: python install_make_coqui.py")


if __name__ == "__main__":
    main()
"""Move audio files into a two-level MD5 hash folder tree."""

import argparse
import hashlib
import os
import shutil

# ── SET YOUR OUTPUT DIRECTORY HERE ──────────────────────────────────────────
OUTPUT_DIR = "/Users/tenchc/Desktop/Hashing_Test"
# ────────────────────────────────────────────────────────────────────────────

AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg", ".aac", ".m4a", ".aiff", ".aif"}
HASH_ALPHABET = list("ABCDEF0123456789")


def get_hash_folders(hash_key):
    """Return (level1, level2) folder names derived from MD5 of hash_key.

    Mirrors DataIO.get_hash_folders() in mp_db_io.py.
      level1 → first hex char uppercased       e.g. '3'
      level2 → first two hex chars uppercased  e.g. '3B'
    """
    digest = hashlib.md5(hash_key.encode("utf-8")).hexdigest().upper()
    return digest[0], digest[:2]


def make_hash_folders(path):
    """Create the full two-level (16×16 = 256 leaf) hash folder tree under path.

    Mirrors DataIO.make_hash_folders() in mp_db_io.py.
    Structure: path/<level1>/<level2>/   e.g. path/3/3B/
    """
    for letter in HASH_ALPHABET:
        for letter2 in HASH_ALPHABET:
            os.makedirs(os.path.join(path, letter, letter + letter2), exist_ok=True)


def extract_hash_key(filename):
    """Split filename at the first '_' and return the prefix as the hash key.

    Example: '14692993_coqui_p336.wav' → '14692993'
    If there is no '_', the full stem is used.
    """
    stem = os.path.splitext(filename)[0]
    return stem.partition("_")[0]


def main(argv=None):
    """Move every audio file in the input directory into the hash tree.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.

    Exits with status 1 when the input directory is invalid, when it equals
    the output directory, or when at least one file could not be moved.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Move audio files from INPUT_DIR into a two-level MD5 hash folder "
            "structure under OUTPUT_DIR. The hash key is the portion of the "
            "filename before the first '_'."
        )
    )
    parser.add_argument(
        "input_dir",
        help="Directory containing audio files to move.",
    )
    parser.add_argument(
        "--output-dir",
        default=OUTPUT_DIR,
        help="Root of the hash folder tree (default: %(default)s).",
    )
    args = parser.parse_args(argv)

    input_dir = os.path.abspath(args.input_dir)
    output_dir = os.path.abspath(args.output_dir)

    if not os.path.isdir(input_dir):
        print(f"ERROR: input_dir does not exist or is not a directory: {input_dir}")
        raise SystemExit(1)

    if output_dir == input_dir:
        print("ERROR: OUTPUT_DIR and input_dir must not be the same path.")
        raise SystemExit(1)

    print(f"Input  : {input_dir}")
    print(f"Output : {output_dir}")
    print("Building hash folder tree…")
    make_hash_folders(output_dir)
    print("Hash folder tree ready.")

    moved = 0
    skipped = 0
    failed = 0

    # Snapshot the listing first: the context manager closes the scandir
    # handle, and files are moved only after iteration has finished.
    with os.scandir(input_dir) as it:
        entries = sorted((e for e in it if e.is_file()), key=lambda e: e.name)

    for entry in entries:
        filename = entry.name

        if filename.startswith("."):
            continue

        ext = os.path.splitext(filename)[1].lower()
        if ext not in AUDIO_EXTENSIONS:
            print(f"  SKIP (not audio): {filename}")
            skipped += 1
            continue

        level1, level2 = get_hash_folders(extract_hash_key(filename))
        dest_path = os.path.join(output_dir, level1, level2, filename)

        if os.path.exists(dest_path):
            print(f"  SKIP (already exists): {filename}")
            skipped += 1
            continue

        try:
            shutil.move(entry.path, dest_path)
        except OSError as exc:
            # One unreadable or locked file should not abort the whole run.
            print(f"  FAILED: {filename} ({exc})")
            failed += 1
            continue
        print(f"  MOVED: {filename}  →  {level1}/{level2}/")
        moved += 1

    summary = f"\nDone. Moved: {moved}  |  Skipped: {skipped}"
    if failed:
        summary += f"  |  Failed: {failed}"
    print(summary)
    if failed:
        raise SystemExit(1)


if __name__ == "__main__":
    main()
+ +Folder structure: + / + / + high/ or medium/ + / ← contains 2 jpgs + 1 sql + imageA.jpg + imageB.jpg + dupe_*.sql + +Usage: + python remove_matched_pairs.py + +Tracks every unique (frozenset of jpg filenames) seen globally across all +clusters and all high/medium tiers. If the exact same pair is encountered +again anywhere, the duplicate folder is deleted and a message is printed. +""" + +import os +import sys +import shutil + + +def get_jpg_pair(score_dir: str) -> frozenset | None: + """Return a frozenset of jpg basenames found in a score-rating folder.""" + try: + names = [f for f in os.listdir(score_dir) if f.lower().endswith(".jpg")] + except NotADirectoryError: + return None + if len(names) != 2: + return None + return frozenset(names) + + +def main(): + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + root = sys.argv[1] + if not os.path.isdir(root): + print(f"Error: '{root}' is not a directory.") + sys.exit(1) + + seen: dict[frozenset, str] = {} # pair → first folder path that had it + total_deleted = 0 + + for cluster_name in sorted(os.listdir(root)): + cluster_path = os.path.join(root, cluster_name) + if not os.path.isdir(cluster_path) or cluster_name.startswith("."): + continue + + for tier in sorted(os.listdir(cluster_path)): + tier_path = os.path.join(cluster_path, tier) + if not os.path.isdir(tier_path) or tier.startswith("."): + continue + + for score_dir_name in sorted(os.listdir(tier_path)): + score_path = os.path.join(tier_path, score_dir_name) + if not os.path.isdir(score_path) or score_dir_name.startswith("."): + continue + + pair = get_jpg_pair(score_path) + if pair is None: + continue + + if pair in seen: + images = sorted(pair) + print( + f"PERFECT MATCH — deleting duplicate:\n" + f" kept: {seen[pair]}\n" + f" deleted: {score_path}\n" + f" images: {images[0]} & {images[1]}\n" + ) + shutil.rmtree(score_path) + total_deleted += 1 + else: + seen[pair] = score_path + + print(f"Done. 
{total_deleted} duplicate folder(s) removed.") + + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/utilities/dedupe/web_dedupe/dedupe_viewer.html b/utilities/dedupe/web_dedupe/dedupe_viewer.html new file mode 100644 index 0000000..b8419dd --- /dev/null +++ b/utilities/dedupe/web_dedupe/dedupe_viewer.html @@ -0,0 +1,598 @@ + + + + + + Dedupe Viewer + + + + + +
+
+
+
+ + +
+ + +
+ Enterdupe — move on + Tabnot a dupe — delete SQL & move on + Zundo (up to 10) + + Spaceflicker mode + + sec + +
+ + +
+

Dedupe Viewer

+

+ Start the local server, then open this page via the server URL.

+ node server.js /path/to/root/folder

+ Then visit http://localhost:3000 +

+
+ + +
+
+
+ + +
+
+ + +
+
+ + +
+ +
+ FLICKER + +
+ + +
+ All pairs reviewed. +
#!/usr/bin/env node
/**
 * Dedupe Viewer local server.
 * Usage: node server.js <root_folder> [port]
 *
 * Serves dedupe_viewer.html and provides these endpoints:
 *   GET    /api/pairs           → JSON array of all image pairs
 *   GET    /image?p=<relPath>   → serve an image file (path relative to root)
 *   POST   /api/delete-sql      → delete an SQL file  { "sqlPath": "<relPath>" }
 *   GET    /api/load-progress   → saved progress for this root, or null
 *   POST   /api/save-progress   → persist progress (JSON body)
 *   DELETE /api/clear-progress  → remove the progress file
 */

const http = require('http');
const fs   = require('fs');
const path = require('path');

const rootArg = process.argv[2];
if (!rootArg) {
  console.error('Usage: node server.js <root_folder> [port]');
  process.exit(1);
}

const ROOT = path.resolve(rootArg);
const PORT = parseInt(process.argv[3] || '3000', 10);

if (!fs.existsSync(ROOT) || !fs.statSync(ROOT).isDirectory()) {
  console.error(`Error: '${ROOT}' is not a directory.`);
  process.exit(1);
}

if (!Number.isInteger(PORT) || PORT < 0 || PORT > 65535) {
  console.error(`Error: '${process.argv[3]}' is not a valid port.`);
  process.exit(1);
}

// Upper bound for JSON request bodies; payloads here are tiny.
const MAX_BODY_CHARS = 1024 * 1024;

// ── Directory crawl ─────────────────────────────────────────────────────────

function sorted(arr) { return [...arr].sort(); }

/** True when p exists and is a directory; false on any stat error. */
function isDirectory(p) {
  try { return fs.statSync(p).isDirectory(); } catch { return false; }
}

/** Sorted names of the non-hidden sub-directories of dir. */
function listSubdirs(dir) {
  return sorted(fs.readdirSync(dir))
    .filter(name => !name.startsWith('.') && isDirectory(path.join(dir, name)));
}

/** Walk ROOT/<cluster>/<tier>/<score> and collect every folder holding a pair. */
function crawlPairs() {
  const pairs = [];
  for (const clusterName of listSubdirs(ROOT)) {
    const clusterPath = path.join(ROOT, clusterName);
    for (const tierName of listSubdirs(clusterPath)) {
      const tierPath = path.join(clusterPath, tierName);
      for (const scoreName of listSubdirs(tierPath)) {
        const scorePath = path.join(tierPath, scoreName);
        const pair = collectPair(scorePath, clusterName, tierName, scoreName);
        if (pair) pairs.push(pair);
      }
    }
  }
  return pairs;
}

/** Describe one score folder, or return null unless it holds exactly two JPEGs. */
function collectPair(scorePath, clusterName, tierName, scoreName) {
  const entries = fs.readdirSync(scorePath);
  const jpgs = entries.filter(n => /\.(jpg|jpeg)$/i.test(n)).sort();
  const sql  = entries.find(n => /\.sql$/i.test(n)) || null;
  if (jpgs.length !== 2) return null;
  const rel = (name) => path.join(clusterName, tierName, scoreName, name);
  return {
    label:   `${clusterName}/${tierName}/${scoreName}`,
    imgA:    rel(jpgs[0]),
    imgB:    rel(jpgs[1]),
    sqlPath: sql ? rel(sql) : null,
  };
}

// ── Path safety ──────────────────────────────────────────────────────────────

/**
 * Resolve a client-supplied relative path against ROOT.
 * Returns the absolute path, or null when it is not strictly inside ROOT.
 *
 * A plain `absPath.startsWith(ROOT)` test is not enough: with ROOT=/data/set
 * it also accepts the sibling /data/set-private/x. Comparing through
 * path.relative() rejects anything that has to climb out of ROOT.
 */
function resolveInsideRoot(relPath) {
  if (typeof relPath !== 'string' || relPath === '' || relPath.includes('\0')) return null;
  const absPath = path.resolve(ROOT, relPath);
  const rel = path.relative(ROOT, absPath);
  const escapes = rel === '' || rel === '..' ||
                  rel.startsWith('..' + path.sep) || path.isAbsolute(rel);
  return escapes ? null : absPath;
}

// ── Progress file ────────────────────────────────────────────────────────────

const PROGRESS_FILE = path.join(__dirname, 'progress.json');

/** Saved progress for the current ROOT, or null if absent/unreadable/other root. */
function loadProgress() {
  if (!fs.existsSync(PROGRESS_FILE)) return null;
  try {
    const data = JSON.parse(fs.readFileSync(PROGRESS_FILE, 'utf8'));
    return data.rootDir === ROOT ? data : null;
  } catch { return null; }
}

function saveProgress(data) {
  // rootDir goes last so a client-supplied "rootDir" key cannot override the
  // server's ROOT and make loadProgress() reject the file later.
  fs.writeFileSync(PROGRESS_FILE, JSON.stringify({ ...data, rootDir: ROOT }, null, 2));
}

function clearProgress() {
  if (fs.existsSync(PROGRESS_FILE)) fs.unlinkSync(PROGRESS_FILE);
}

// ── MIME types ───────────────────────────────────────────────────────────────

const MIME = { '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png', '.html': 'text/html' };

// ── Response / request helpers ───────────────────────────────────────────────

function json(res, status, obj) {
  res.writeHead(status, { 'Content-Type': 'application/json' });
  res.end(JSON.stringify(obj));
}

function serveFile(res, filePath, contentType) {
  fs.readFile(filePath, (err, data) => {
    if (err) { json(res, 404, { error: 'File not found' }); return; }
    res.writeHead(200, { 'Content-Type': contentType });
    res.end(data);
  });
}

/**
 * Read the request body, parse it as a JSON object and pass it to handler.
 * Responds 413 for oversized bodies, 400 for invalid JSON or non-object
 * payloads, and 500 when handler throws.
 */
function readJsonBody(req, res, handler) {
  let body = '';
  let rejected = false;
  req.on('data', chunk => {
    if (rejected) return;
    body += chunk;
    if (body.length > MAX_BODY_CHARS) {
      rejected = true;
      json(res, 413, { error: 'Request body too large' });
      req.destroy();
    }
  });
  req.on('end', () => {
    if (rejected) return;
    let payload;
    try {
      payload = JSON.parse(body);
    } catch {
      json(res, 400, { error: 'Invalid JSON body' });
      return;
    }
    if (payload === null || typeof payload !== 'object' || Array.isArray(payload)) {
      json(res, 400, { error: 'JSON body must be an object' });
      return;
    }
    try {
      handler(payload);
    } catch (e) {
      json(res, 500, { error: e.message });
    }
  });
}

// ── Request handler ──────────────────────────────────────────────────────────

const VIEWER_PATH = path.join(__dirname, 'dedupe_viewer.html');

const server = http.createServer((req, res) => {
  let parsed;
  try {
    // The base is only needed to parse the relative request target.
    parsed = new URL(req.url, 'http://localhost');
  } catch {
    json(res, 400, { error: 'Bad request URL' });
    return;
  }
  const pathname = parsed.pathname;

  // CORS for local dev
  // NOTE(review): a wildcard origin combined with state-changing endpoints
  // means any page open in the browser can call them. Consider validating
  // the Origin header if the viewer is always served from this server.
  res.setHeader('Access-Control-Allow-Origin', '*');
  res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
  res.setHeader('Access-Control-Allow-Headers', 'Content-Type');

  if (req.method === 'OPTIONS') { res.writeHead(204); res.end(); return; }

  // Serve the viewer page
  if (req.method === 'GET' && (pathname === '/' || pathname === '/index.html')) {
    serveFile(res, VIEWER_PATH, 'text/html');
    return;
  }

  // List all pairs
  if (req.method === 'GET' && pathname === '/api/pairs') {
    try {
      json(res, 200, crawlPairs());
    } catch (e) {
      json(res, 500, { error: e.message });
    }
    return;
  }

  // Serve an image
  if (req.method === 'GET' && pathname === '/image') {
    const relPath = parsed.searchParams.get('p');
    if (!relPath) { json(res, 400, { error: 'Missing ?p= parameter' }); return; }
    const absPath = resolveInsideRoot(relPath);
    if (!absPath) { json(res, 403, { error: 'Forbidden' }); return; }
    serveFile(res, absPath, MIME[path.extname(absPath).toLowerCase()] || 'application/octet-stream');
    return;
  }

  // Delete an SQL file
  if (req.method === 'POST' && pathname === '/api/delete-sql') {
    readJsonBody(req, res, ({ sqlPath }) => {
      if (!sqlPath) { json(res, 400, { error: 'Missing sqlPath' }); return; }
      const absPath = resolveInsideRoot(sqlPath);
      if (!absPath) { json(res, 403, { error: 'Forbidden' }); return; }
      // Case-insensitive, matching the /\.sql$/i test used when listing pairs.
      if (path.extname(absPath).toLowerCase() !== '.sql') {
        json(res, 400, { error: 'Not an SQL file' });
        return;
      }
      if (fs.existsSync(absPath)) fs.unlinkSync(absPath);
      json(res, 200, { ok: true });
    });
    return;
  }

  // Load saved progress
  if (req.method === 'GET' && pathname === '/api/load-progress') {
    json(res, 200, loadProgress());
    return;
  }

  // Save progress
  if (req.method === 'POST' && pathname === '/api/save-progress') {
    readJsonBody(req, res, (data) => {
      saveProgress(data);
      json(res, 200, { ok: true });
    });
    return;
  }

  // Clear progress (called when all pairs are reviewed)
  if (req.method === 'DELETE' && pathname === '/api/clear-progress') {
    try {
      clearProgress();
    } catch (e) {
      json(res, 500, { error: e.message });
      return;
    }
    console.log('Progress cleared — all pairs reviewed.');
    json(res, 200, { ok: true });
    return;
  }

  json(res, 404, { error: 'Not found' });
});

server.listen(PORT, '127.0.0.1', () => {
  console.log(`Dedupe Viewer running at http://localhost:${PORT}`);
  console.log(`Root directory: ${ROOT}`);
});
#!/usr/bin/env python3
"""
Normalize video dimensions in an installation folder by cropping videos that
are exactly 2px wider or taller than a paired dimension down to match the
smaller size, then updating installation.csv accordingly.

Usage:
    python install_video_crop.py <folder> [--dry-run]
"""

import argparse
import csv
import os
import subprocess
import sys
from pathlib import Path


def load_csv(csv_path):
    """Read *csv_path* and return (rows as dicts, header field names)."""
    with open(csv_path, newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        fieldnames = reader.fieldnames
    return rows, fieldnames


def find_2px_pairs(rows):
    """
    Scan unique (width, height) pairs in the CSV and return a mapping of
    larger_dim -> smaller_dim for every pair that differs by exactly 2px on
    one axis while the other axis is identical.

    When a size has both a 2px-narrower and a 2px-shorter partner, the
    height crop is the one recorded.
    """
    # NOTE(review): a chain such as 100 / 102 / 104 maps 104→102 and 102→100,
    # so one run does not bring every size to the smallest one. Confirm
    # whether that is intended.
    dims = {(int(row["width"]), int(row["height"])) for row in rows}

    pairs = {}  # (larger_w, larger_h) -> (target_w, target_h)
    for w, h in dims:
        if (w - 2, h) in dims:
            pairs[(w, h)] = (w - 2, h)
        if (w, h - 2) in dims:  # assigned last so the height crop wins
            pairs[(w, h)] = (w, h - 2)
    return pairs


def crop_video(input_path, output_path, src_w, src_h, target_w, target_h):
    """
    Crop input_path to target dimensions, centering the crop window, and
    write the result to output_path.  Audio is stream-copied unchanged.
    Returns (success: bool, stderr: str).

    A failure to launch ffmpeg (for example when it is not installed) is
    reported as (False, message) instead of raising.
    """
    x_offset = (src_w - target_w) // 2
    y_offset = (src_h - target_h) // 2

    cmd = [
        "ffmpeg", "-y",
        "-i", str(input_path),
        "-vf", f"crop={target_w}:{target_h}:{x_offset}:{y_offset}",
        "-c:a", "copy",
        str(output_path),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        return False, f"could not run ffmpeg: {exc}"
    return result.returncode == 0, result.stderr


def _temp_output_path(video_path):
    """Return a sibling temp path that keeps the video's own extension.

    ffmpeg picks the container from the output extension, so a .mov source
    must not be written through a hard-coded .mp4 temp file.
    """
    return video_path.with_name(f"{video_path.stem}.tmp{video_path.suffix}")


def _write_csv(csv_path, fieldnames, rows):
    """Rewrite installation.csv with the given rows."""
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main(argv=None):
    """Crop every video whose size is the larger half of a 2px pair.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.

    The CSV is rewritten whenever at least one video was cropped, even if
    other videos failed: a cropped file has already replaced the original on
    disk, so leaving its old dimensions in the CSV would make the two
    disagree. The process still exits with status 1 when any crop failed.
    """
    parser = argparse.ArgumentParser(
        description="Crop videos with 2px dimension mismatches to normalize them."
    )
    parser.add_argument(
        "folder",
        help="Folder containing installation.csv and video files",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be done without modifying any files",
    )
    args = parser.parse_args(argv)

    folder = Path(args.folder)
    csv_path = folder / "installation.csv"

    if not csv_path.exists():
        print(f"Error: {csv_path} not found", file=sys.stderr)
        sys.exit(1)

    rows, fieldnames = load_csv(csv_path)

    pairs = find_2px_pairs(rows)
    if not pairs:
        print("No 2px dimension pairs found. Nothing to do.")
        return

    print("2px dimension pairs that will be normalized (larger → smaller):")
    for larger, smaller in sorted(pairs.items()):
        print(f"  {larger[0]}x{larger[1]}  →  {smaller[0]}x{smaller[1]}")
    print()

    updated_rows = []
    errors = []
    cropped = 0

    for row in rows:
        w, h = int(row["width"]), int(row["height"])

        if (w, h) not in pairs:
            updated_rows.append(row)
            continue

        target_w, target_h = pairs[(w, h)]
        file_name = row["file_name"]
        video_path = folder / file_name

        if not video_path.exists():
            print(f"  WARNING: {file_name} not found in folder — skipping")
            updated_rows.append(row)
            continue

        print(f"  {'[dry-run] ' if args.dry_run else ''}Cropping {file_name}")
        print(f"    {w}x{h}  →  {target_w}x{target_h}")

        if args.dry_run:
            updated_rows.append(row)
            continue

        tmp_path = _temp_output_path(video_path)
        success, stderr = crop_video(video_path, tmp_path, w, h, target_w, target_h)

        if success:
            os.replace(tmp_path, video_path)
            new_ratio = round(target_w / target_h, 3)
            row = dict(row)
            row["width"] = target_w
            row["height"] = target_h
            row["ratio"] = new_ratio
            cropped += 1
            print(f"    Done — new ratio {new_ratio}")
        else:
            print(f"    ERROR: ffmpeg failed:\n{stderr[-400:]}", file=sys.stderr)
            errors.append(file_name)
            if tmp_path.exists():
                tmp_path.unlink()

        updated_rows.append(row)

    if args.dry_run:
        print("\nDry run complete — no files modified.")
        return

    if cropped:
        _write_csv(csv_path, fieldnames, updated_rows)

    if errors:
        print(
            f"\nFinished with {len(errors)} error(s): {', '.join(errors)}. "
            f"{cropped} video(s) were cropped and are recorded in installation.csv; "
            "failed videos keep their original dimensions.",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"\nDone. {cropped} video(s) cropped, installation.csv updated.")


if __name__ == "__main__":
    main()