diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..4e43979a9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +* text=auto + +*.sh text eol=lf +*.py text eol=lf +Dockerfile text eol=lf +*.dockerignore text eol=lf diff --git a/.gitignore b/.gitignore index 6123a9ff2..8984478cd 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,4 @@ cookies.txt test_episodes.json /models /tmp +/preprocessor/scripts/scribe_compare diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4ca8a0f5..8947f5279 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ fail_fast: false -exclude: '^(bot/RANCZO-WIDEO/|bot/RANCZO-TRANSKRYPCJE/)' +exclude: '^(bot/RANCZO-WIDEO/|bot/RANCZO-TRANSKRYPCJE/|scripts/)' repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 @@ -37,6 +37,7 @@ repos: - id: chmod args: ["755"] files: (.*scripts\/.*.py$|\.sh$) + exclude: (^preprocessor/entrypoint\.sh$|^preprocessor/scripts/) - id: remove-tabs args: [--whitespaces-count, '4'] - repo: https://github.com/PyCQA/isort diff --git a/VERSION b/VERSION new file mode 100644 index 000000000..1454f6ed4 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +4.0.1 diff --git a/bot/services/reindex/reindex_service.py b/bot/services/reindex/reindex_service.py index 310bc8434..1c8d67b45 100644 --- a/bot/services/reindex/reindex_service.py +++ b/bot/services/reindex/reindex_service.py @@ -22,7 +22,7 @@ from bot.services.reindex.video_path_transformer import VideoPathTransformer from bot.services.reindex.zip_extractor import ZipExtractor from bot.settings import settings -from preprocessor.search.elastic_manager import ElasticSearchManager +from preprocessor.search.elastic_manager import ElasticSearchManager # pylint: disable=no-name-in-module @dataclass diff --git a/preprocessor/Dockerfile b/preprocessor/Dockerfile index 07731cad3..fcc5e1271 100644 --- a/preprocessor/Dockerfile +++ b/preprocessor/Dockerfile @@ -38,12 +38,18 @@ RUN 
--mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir --upgrade pip setuptools wheel \ && pip install --no-cache-dir \ -r /app/requirements.txt \ - vllm==0.13.0 \ --extra-index-url https://pypi.nvidia.com \ + && pip install --no-cache-dir --pre vllm \ + --extra-index-url https://wheels.vllm.ai/nightly \ + && pip install --no-cache-dir \ + git+https://github.com/huggingface/transformers.git@main \ + && pip uninstall -y flashinfer \ && pip uninstall -y onnxruntime \ && pip install --no-cache-dir \ onnxruntime-gpu==1.21.0 \ --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ \ + && pip uninstall -y nvidia-cudnn-cu11 || true \ + && pip install --no-cache-dir --force-reinstall --no-deps nvidia-cudnn-cu12 \ && pip uninstall -y nvidia-nccl-cu11 || true \ && pip install --no-cache-dir --force-reinstall --no-deps nvidia-nccl-cu12 @@ -58,10 +64,7 @@ RUN mkdir -p \ /models/whisper \ /models/insightface \ /models/ultralytics \ - /models/emotion_model \ - /app/output_data/characters \ - /app/output_data/scraped_pages \ - /app/output_data/processing_metadata + /models/emotion_model COPY bot /app/bot COPY preprocessor /app/preprocessor diff --git a/preprocessor/README.md b/preprocessor/README.md index fe3306875..b81960a98 100644 --- a/preprocessor/README.md +++ b/preprocessor/README.md @@ -10,134 +10,252 @@ Docker pipeline do przetwarzania wideo z GPU: transkodowanie, transkrypcja, dete ```bash cd preprocessor -mkdir -p input_data/videos output_data -cp /twoje/wideo/*.mp4 input_data/videos/ +mkdir -p input_data output_data docker compose build -# Pełny pipeline z scrapingiem -./run-preprocessor.sh run-all /input_data/videos \ - --scrape-urls https://example.com/wiki/Seria \ - --character-urls https://example.com/wiki/Postacie \ - --series-name nazwa_serii - -# Z gotowymi metadanymi -./run-preprocessor.sh run-all /input_data/videos \ - --episodes-info-json /input_data/episodes.json \ - --series-name 
nazwa_serii - -# Pomiń transkodowanie i transkrypcję (użyj istniejących) -./run-preprocessor.sh run-all /input_data/videos \ - --episodes-info-json /input_data/episodes.json \ - --series-name nazwa_serii \ - --skip-transcode \ - --skip-transcribe - -# Tryb premium (Gemini + ElevenLabs + Google Images) -./run-preprocessor.sh run-all /input_data/videos \ - --series-name nazwa_serii \ - --parser-mode premium \ - --transcription-mode premium \ - --search-mode premium +# Podstawowe użycie - pełny pipeline +./run-preprocessor.sh run-all --series ranczo + +# Z pomijaniem konkretnych kroków +./run-preprocessor.sh run-all --series kiepscy --skip transcode --skip transcribe + +# Wymuszenie ponownego przetworzenia (ignoruje cache) +./run-preprocessor.sh run-all --series ranczo --force-rerun + +# Pojedynczy krok +./run-preprocessor.sh transcode --series ranczo +./run-preprocessor.sh detect-scenes --series ranczo + +# Analiza rozdzielczości (sprawdź przed uruchomieniem pipeline!) +./run-preprocessor.sh analyze-resolution --series kiepscy + +# Search +./run-preprocessor.sh search --series ranczo --text "Lucy Wilska" +./run-preprocessor.sh search --series kiepscy --stats ``` +**Konfiguracja:** Wszystkie parametry (URLs do scrapingu, tryby transkrypcji, bitrate, etc.) 
są w plikach `series_configs/*.json` + --- -## Pipeline (13 kroków) +## Konfiguracja per-seria +Pipeline używa plików JSON w `series_configs/` do konfiguracji każdego serialu: + +**Struktura:** ``` -SCRAPING PROCESSING INDEXING -─────────────────────────────────────────────────────────────────────── -[0a] episodes ─┬→ [1] transcode → [2] transcribe → [3] separate sounds -[0b] characters │ [4] analyze text -[0c] download │ [5] detect scenes → [6] export frames -[0d] process ─┘ [7] text embeddings - [8] frame processing (8a-8f) - [9] elastic docs → [10] archives → [11] index → [12] validate +series_configs/ +├── defaults.json # Domyślne ustawienia dla wszystkich seriali +├── ranczo.json # Nadpisuje defaults tylko dla Ranczo +└── kiepscy.json # Nadpisuje defaults tylko dla Kiepskich ``` ---- +**Przykład `kiepscy.json`:** +```json +{ + "display_name": "Świat według Kiepskich", + "series_name": "kiepscy", + "pipeline_mode": "full", + "indexing": { + "elasticsearch": { + "index_name": "kiepscy_clips" + } + }, + "processing": { + "transcode": { + "force_deinterlace": true, + "video_bitrate_mbps": 2.5 + }, + "transcription": { + "mode": "whisper", + "model": "large-v3-turbo" + } + }, + "scraping": { + "episodes": { + "parser_mode": "premium", + "urls": ["https://pl.wikipedia.org/wiki/Lista_odcinków..."] + }, + "characters": { + "parser_mode": "premium", + "urls": ["https://pl.wikipedia.org/wiki/Lista_postaci..."] + }, + "character_references": { + "search_engine": "google" + } + }, + "skip_steps": [] +} +``` -## Flagi Skip +**Tryby pipeline:** +- `"pipeline_mode": "full"` - uruchamia wszystkie kroki +- `"pipeline_mode": "selective"` - pomija kroki z `skip_steps` automatycznie -| Flaga | Krok | -|-------|------| -| `--skip-transcode` | 1: Transkodowanie | -| `--skip-transcribe` | 2-3: Transkrypcja + separacja | -| `--skip-text-analysis` | 4: Analiza tekstu | -| `--skip-scenes` | 5: Detekcja scen | -| `--skip-frame-export` | 6: Eksport klatek | -| `--skip-embeddings` | 7: 
Text embeddings | -| `--skip-character-reference-processing` | 0d: Przetwarzanie referencji postaci | -| `--skip-elastic-documents` | 9: Dokumenty ES | -| `--skip-archives` | 10: Archiwizacja ZIP | -| `--skip-index` | 11: Indeksowanie | -| `--skip-validation` | 12: Walidacja | +**Dostępne parametry:** Zobacz `defaults.json` dla pełnej listy opcji konfiguracyjnych. -
-Flagi frame processing (8a-8f) +--- -| Flaga | Krok | -|-------|------| -| `--skip-image-hashing` | 8a: Image hashing | -| `--skip-video-embeddings` | 8b: Video embeddings | -| `--skip-character-detection` | 8c: Character detection | -| `--skip-emotion-detection` | 8d: Emotion detection | -| `--skip-face-clustering` | 8e: Face clustering | -| `--skip-object-detection` | 8f: Object detection | +## Pipeline (21 kroków) -**Uwaga:** Wizualizacje są domyślnie wyłączone. Użyj `--debug-visualizations` aby je włączyć. +``` +SCRAPING PROCESSING INDEXING VALIDATION +──────────────────────────────────────────────────────────────────────────────────────────────────────────── +[1] scrape_episodes ──┬─→ [4] resolution_analysis ─→ [5] transcode ─→ [6] transcribe ─→ [7] separate_sounds +[2] scrape_characters │ [8] analyze_text ────┐ +[3] process_references─┘ [9] detect_scenes ─→ [10] export_frames │ + [11] text_embeddings │ + [12] video_embeddings ├─→ [21] validate + [13] image_hashing │ + [14] detect_characters │ + [15] detect_emotions │ + [16] cluster_faces │ + [17] detect_objects │ + [18] generate_elastic_docs ─→ [19] generate_archives ─→ [20] index_to_elasticsearch ─────┘ +``` -
+**Kroki są automatycznie wykonywane w poprawnej kolejności** - pipeline rozwiązuje zależności i tworzy plan wykonania. -**Premium modes:** `--parser-mode premium` (Gemini 2.5 Flash) • `--transcription-mode premium` (ElevenLabs) • `--search-mode premium` (Google Images) +**Resolution analysis (krok 4)** - analizuje rozdzielczości materiałów źródłowych przed transkodowaniem, ostrzega jeśli >50% wymaga upscalingu. + +**Validation (krok 21)** - uruchamiany na końcu, weryfikuje poprawność wszystkich poprzednich kroków pipeline. --- -## Główne komendy +## Dostępne komendy ```bash -# Pełny pipeline -./run-preprocessor.sh run-all /input_data/videos --series-name nazwa_serii [OPTIONS] - -# Pojedyncze kroki -./run-preprocessor.sh scrape-episodes --urls URL --output-file /input_data/episodes.json -./run-preprocessor.sh transcode /input_data/videos [--episodes-info-json FILE] [--resolution 720p] -./run-preprocessor.sh transcribe /input_data/videos --name series --episodes-info-json FILE -./run-preprocessor.sh transcribe-elevenlabs /input_data/videos --name series --episodes-info-json FILE -./run-preprocessor.sh separate-sounds --transcription-jsons /app/output_data/transcriptions -./run-preprocessor.sh analyze-text --season S10 --language pl -./run-preprocessor.sh detect-scenes /input_data/videos [--threshold 0.5] -./run-preprocessor.sh export-frames /input_data/videos -./run-preprocessor.sh process-character-references --name series -./run-preprocessor.sh image-hashing --frames-dir /app/output_data/exported_frames -./run-preprocessor.sh generate-embeddings --transcription-jsons /app/output_data/transcriptions -./run-preprocessor.sh generate-elastic-documents --transcription-jsons /app/output_data/transcriptions -./run-preprocessor.sh generate-archives --series-name nazwa_serii -./run-preprocessor.sh index --name nazwa_serii -./run-preprocessor.sh validate --season S01 --series-name nazwa_serii - -# Narzędzia -./run-preprocessor.sh search --text "query" -./run-preprocessor.sh 
search --text-semantic "query" -./run-preprocessor.sh search --image /path/to/image.jpg -./run-preprocessor.sh search --character "Nazwa" -./run-preprocessor.sh search --emotion "happiness" -./run-preprocessor.sh search --stats -./run-preprocessor.sh fix-unicode --transcription-jsons DIR --episodes-info-json FILE --name series -./run-preprocessor.sh import-transcriptions --input-dir DIR --episodes-info-json FILE --name series +# Pipeline +./run-preprocessor.sh run-all --series NAZWA [--skip STEP_ID ...] [--force-rerun] + +# Scraping +./run-preprocessor.sh scrape-episodes --series NAZWA +./run-preprocessor.sh scrape-characters --series NAZWA +./run-preprocessor.sh process-references --series NAZWA + +# Video processing +./run-preprocessor.sh transcode --series NAZWA +./run-preprocessor.sh detect-scenes --series NAZWA +./run-preprocessor.sh export-frames --series NAZWA + +# Audio/Text processing +./run-preprocessor.sh transcribe --series NAZWA +./run-preprocessor.sh separate-sounds --series NAZWA +./run-preprocessor.sh analyze-text --series NAZWA + +# Embeddings +./run-preprocessor.sh text-embeddings --series NAZWA +./run-preprocessor.sh video-embeddings --series NAZWA + +# Visual analysis +./run-preprocessor.sh image-hashing --series NAZWA +./run-preprocessor.sh detect-characters --series NAZWA +./run-preprocessor.sh detect-emotions --series NAZWA +./run-preprocessor.sh cluster-faces --series NAZWA +./run-preprocessor.sh detect-objects --series NAZWA + +# Indexing +./run-preprocessor.sh generate-elastic-docs --series NAZWA +./run-preprocessor.sh generate-archives --series NAZWA +./run-preprocessor.sh index-to-elasticsearch --series NAZWA + +# Validation +./run-preprocessor.sh validate --series NAZWA + +# Search (wymaga uruchomionego Elasticsearch) +./run-preprocessor.sh search --series NAZWA --text "query" +./run-preprocessor.sh search --series NAZWA --text-semantic "query" +./run-preprocessor.sh search --series NAZWA --image /input_data/screenshot.jpg 
+./run-preprocessor.sh search --series NAZWA --character "Postać" +./run-preprocessor.sh search --series NAZWA --emotion "happiness" +./run-preprocessor.sh search --series NAZWA --object "person:5+" +./run-preprocessor.sh search --series NAZWA --stats +./run-preprocessor.sh search --series NAZWA --list-characters + +# Utilities +./run-preprocessor.sh analyze-resolution --series NAZWA # Analiza rozdzielczości i rekomendacje +./run-preprocessor.sh visualize --series NAZWA # Wizualizacja grafu zależności +./run-preprocessor.sh bash # Shell w kontenerze +``` + +**Parametry:** +- `--series NAZWA` - **WYMAGANY** dla wszystkich komend poza `bash` (np. `ranczo`, `kiepscy`) +- `--force-rerun` - Ignoruje cache i przetwarza ponownie +- `--skip STEP_ID` - Pomija konkretny krok (można użyć wielokrotnie) + +**Step IDs do --skip:** +``` +scrape_episodes, scrape_characters, process_references, +resolution_analysis, transcode, transcribe, separate_sounds, analyze_text, +detect_scenes, export_frames, text_embeddings, video_embeddings, +image_hashing, detect_characters, detect_emotions, cluster_faces, detect_objects, +generate_elastic_docs, generate_archives, index_to_elasticsearch, +validate ``` --- -## Struktura output +## Multi-Series Support + +Pipeline wspiera przetwarzanie wielu seriali jednocześnie. Każdy serial ma dedykowany folder i konfigurację. +**Input struktura:** +``` +input_data/ +├── ranczo/ +│ ├── S01/ +│ │ ├── S01E01.mp4 +│ │ └── S01E02.mp4 +│ ├── S02/ +│ └── S03/ +└── kiepscy/ + ├── S01/ + └── S02/ +``` + +**Output struktura:** ``` output_data/ -├── transcoded_videos/ # MP4 h264_nvenc (720p) +├── ranczo/ +│ ├── transcoded_videos/ +│ ├── transcriptions/ +│ ├── scene_timestamps/ +│ ├── exported_frames/ +│ ├── embeddings/ +│ ├── elastic_documents/ +│ ├── .preprocessing_state_ranczo.json +│ └── ... +└── kiepscy/ + ├── transcoded_videos/ + ├── .preprocessing_state_kiepscy.json + └── ...
+``` + +**Config struktura:** +``` +series_configs/ +├── defaults.json # Domyślne dla wszystkich +├── ranczo.json # Overrides dla Ranczo +└── kiepscy.json # Overrides dla Kiepskich +``` + +**Migracja ze starej struktury:** +```bash +mkdir -p input_data/nazwa_serii +mv input_data/S* input_data/nazwa_serii/ +``` + +--- + +## Struktura output (per serial) + +``` +output_data/{series_name}/ +├── transcoded_videos/ # MP4 h264_nvenc (720p domyślnie) ├── transcriptions/ # raw/ • clean/ • sound_events/ ├── scene_timestamps/ # JSON z timestampami scen -├── exported_frames/ # JPG 1080p (domyślnie) -├── embeddings/ # text • video • sound_events • full_episode +├── exported_frames/ # PNG (1080p domyślnie) +├── embeddings/ # text/ • video/ • sound_events/ • full_episode/ ├── image_hashes/ # perceptual hashes klatek ├── character_detections/ # detections.json + visualizations/ (opcjonalne) ├── character_references_processed/ # face vectors postaci @@ -145,10 +263,17 @@ output_data/ ├── face_clusters/ # HDBSCAN clusters ├── object_detections/ # D-FINE detections + visualizations/ (opcjonalne) ├── elastic_documents/ # JSONL per typ dokumentu +│ ├── text_segments/ +│ ├── text_embeddings/ +│ ├── video_frames/ +│ └── episode_names/ ├── archives/ # ZIP per odcinek ├── validation_reports/ # JSON raporty walidacji ├── processing_metadata/ # metadata kroków pipeline -└── scraped_pages/ # zapisane strony wiki +├── scraped_pages/ # zapisane strony wiki +├── {series}_episodes.json # metadane odcinków +├── {series}_characters.json # lista postaci +└── .preprocessing_state_{series}.json # stan pipeline (cache) ``` --- @@ -158,6 +283,7 @@ output_data/ | Komponent | Stack | |-----------|-------| | Transkodowanie | FFmpeg + h264_nvenc (GPU) | +| Deinterlacing | bwdif (opcjonalnie, auto-detect lub force) | | Transkrypcja | Whisper large-v3-turbo / ElevenLabs Scribe v1 | | Sceny | TransNetV2 | | Embeddingi | Qwen/Qwen3-VL-Embedding-8B (4096-dim) | @@ -171,13 +297,92 @@ output_data/ --- +## 
Parametry konfiguracyjne + +**Wszystkie parametry są w `series_configs/*.json`**. Poniżej wartości domyślne z `defaults.json`: + +**Transkodowanie (`processing.transcode`):** +```json +{ + "codec": "h264_nvenc", + "resolution": "720p", + "video_bitrate_mbps": 2.5, + "minrate_mbps": 1.5, + "maxrate_mbps": 3.5, + "bufsize_mbps": 5.0, + "audio_bitrate_kbps": 128, + "gop_size": 2.0, + "force_deinterlace": false +} +``` + +**Detekcja scen (`processing.scene_detection`):** +```json +{ + "threshold": 0.5, + "min_scene_len": 10 +} +``` + +**Eksport klatek (`processing.frame_export`):** +```json +{ + "frames_per_scene": 3 +} +``` + +**Transkrypcja (`processing.transcription`):** +```json +{ + "mode": "whisper", + "model": "large-v3-turbo", + "language": "pl", + "device": "cuda" +} +``` + +**Scraping (`scraping`):** +```json +{ + "episodes": { + "parser_mode": "normal", + "urls": ["https://..."] + }, + "characters": { + "parser_mode": "normal", + "urls": ["https://..."] + }, + "character_references": { + "search_engine": "duckduckgo", + "images_per_character": 5 + } +} +``` + +**Elasticsearch (`indexing.elasticsearch`):** +```json +{ + "index_name": "nazwa_clips", + "host": "localhost:9200", + "dry_run": false, + "append": false +} +``` + +**Tryby:** +- `parser_mode`: `"normal"` (Qwen2.5-Coder) | `"premium"` (Gemini 2.5 Flash) +- `transcription.mode`: `"whisper"` | `"elevenlabs"` +- `search_engine`: `"duckduckgo"` | `"google"` (wymaga SERPAPI_API_KEY) + +--- + ## Użycie VRAM **Target:** ~21GB VRAM (85% z 24GB dla modelu embeddingowego) -**Batch sizes:** -- Video embeddings: 32 (domyślnie), progress sub-batch: 100 -- Text embeddings: 64 (domyślnie) +**Batch sizes (domyślne):** +- Video embeddings: 32, progress sub-batch: 100 +- Text embeddings: 64 - Object detection: 8 - Emotion detection: 32 @@ -213,70 +418,27 @@ Faktyczne użycie VRAM zależy od: ## Formaty plików -**Input:** `.mp4` `.avi` `.mkv` `.mov` `.flv` `.wmv` `.webm` +**Input wideo:** `.mp4` `.avi` `.mkv` `.mov` 
`.flv` `.wmv` `.webm` **Output wideo:** `.mp4` (h264_nvenc, 720p domyślnie) -**Output klatki:** `.jpg` (1080p domyślnie) +**Output klatki:** `.png` (1080p domyślnie) **Nazewnictwo odcinków:** `S01E01`, `s01e12`, `S10E05` (case-insensitive) **Nazewnictwo folderów:** `S01`, `Sezon 1`, `Season 10` → autonormalizacja do `SXX` -**Metadane:** JSON (episodes.json, characters.json) -**Elastic docs:** JSONL per typ (text_segments, video_frames, etc.) +**Metadane:** JSON (`{series}_episodes.json`, `{series}_characters.json`) +**Elastic docs:** JSONL per typ (`text_segments`, `video_frames`, `text_embeddings`, `episode_names`) --- -## Parametry konfiguracyjne - -**Transkodowanie:** -- Target file size: 50MB per 100s -- Audio bitrate: 128 kbps -- GOP size: 0.5s +## State Management -**Scene detection:** -- Threshold: 0.5 -- Min scene length: 10 frames - -**Text chunking:** -- Segments per embedding: 5 -- Sentences per chunk: 8 -- Chunk overlap: 3 - -**Character detection:** -- Reference images per character: 3 -- Normalized face size: 112x112 -- Face detection threshold: 0.2 -- Reference matching threshold: 0.50 -- Frame detection threshold: 0.55 - -**Object detection:** -- Confidence threshold: 0.30 - -**Embeddings:** -- Dimension: 4096 -- Max model length: 8192 tokens -- Chunked prefill: enabled - ---- +Pipeline automatycznie zapisuje stan przetwarzania w `.preprocessing_state_{series}.json`: +- Śledzi które kroki zostały ukończone dla każdego odcinka +- Pozwala na wznowienie po przerwaniu (Ctrl+C) +- Pomija już przetworzone odcinki (chyba że `--force-rerun`) -## Dodatkowe opcje - -**State management:** -- `--no-state` - wyłącz zapisywanie stanu (brak wznowienia po przerwaniu) -- Domyślnie pipeline zapisuje stan i można wznowić po Ctrl+C - -**Ramdisk:** -- `--ramdisk-path /mnt/ramdisk` - użyj RAMdisk dla tymczasowych plików (szybsze przetwarzanie) -- Domyślnie: `/dev/shm` (shared memory, 4GB z docker-compose) -- RAMdisk używany do: kopiowania klatek podczas frame processing, 
tymczasowych plików transkrypcji - -**Interaktywny tryb:** -- `--interactive-character-processing` - manualna selekcja twarzy przy przetwarzaniu referencji postaci - -**Debug:** -- `--debug-visualizations` - włącz wizualizacje dla detekcji postaci i obiektów (wyłączone domyślnie) -- `--dry-run` - test indeksowania bez wysyłania do Elasticsearch - -**Embeddingi:** -- `--skip-full-episode` - pomiń generowanie embeddingów całych odcinków (tylko text, video, sound events) -- `--batch-size N` - rozmiar batcha dla embeddingów (domyślnie 32 dla video, 64 dla text) +**Resetowanie stanu:** +```bash +rm output_data/ranczo/.preprocessing_state_ranczo.json +``` --- @@ -301,26 +463,124 @@ docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi ```bash # Logi -docker logs ranchbot-preprocessing-app -f +docker logs -f preprocessor-preprocessor-run-XXX # GPU check nvidia-smi docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -# OOM na GPU → zmniejsz batch size -./run-preprocessor.sh generate-embeddings --batch-size 16 # domyślnie 32 +# OOM na GPU +# Zmniejsz batch_size w series_configs/{series}.json # Brak miejsca na dysku docker system prune -a docker volume prune -du -sh output_data/* # sprawdź co zajmuje miejsce +du -sh output_data/* # Wznów pipeline po przerwaniu -./run-preprocessor.sh run-all /input_data/videos --series-name nazwa_serii --name nazwa_serii +./run-preprocessor.sh run-all --series nazwa_serii +# Stan jest automatycznie przywracany z .preprocessing_state_{series}.json + +# Reset stanu dla konkretnego serialu +rm output_data/nazwa_serii/.preprocessing_state_nazwa_serii.json # Reset całego named volume z modelami docker volume rm ranchbot-ai-models # Shell w kontenerze ./run-preprocessor.sh bash + +# Debug - wizualizacja grafu pipeline +./run-preprocessor.sh visualize --series nazwa_serii +``` + +--- + +## Search Guide + +Szczegółowy opis funkcjonalności search znajduje się w `SEARCH_GUIDE.md`. 
+ +**Quick examples:** +```bash +# Statystyki +./run-preprocessor.sh search --series ranczo --stats + +# Full-text search +./run-preprocessor.sh search --series ranczo --text "Lucy Wilska" --season 10 + +# Semantic search +./run-preprocessor.sh search --series ranczo --text-semantic "wesele" + +# Visual search +./run-preprocessor.sh search --series ranczo --image /input_data/screenshot.jpg + +# Search by character/emotion/object +./run-preprocessor.sh search --series ranczo --character "Lucy Wilska" --emotion "happiness" +./run-preprocessor.sh search --series ranczo --object "person:5+" + +# Lista postaci +./run-preprocessor.sh search --series ranczo --list-characters +``` + +--- + +## Tworzenie nowego serialu + +1. **Przygotuj dane:** + ```bash + mkdir -p input_data/nowy_serial/S01 + cp /path/to/videos/*.mp4 input_data/nowy_serial/S01/ + ``` + +2. **Stwórz config:** + ```bash + cp series_configs/defaults.json series_configs/nowy_serial.json + ``` + +3. **Edytuj config:** + ```json + { + "series_name": "nowy_serial", + "display_name": "Nowy Serial", + "indexing": { + "elasticsearch": { + "index_name": "nowy_serial_clips" + } + }, + "scraping": { + "episodes": { + "urls": ["https://..."] + }, + "characters": { + "urls": ["https://..."] + } + } + } + ``` + +4. 
**Uruchom pipeline:** + ```bash + ./run-preprocessor.sh run-all --series nowy_serial + ``` + +--- + +## API Keys (opcjonalne) + +Ustaw w `.env` lub docker-compose environment: + +```bash +# ElevenLabs (dla transcription.mode = "elevenlabs") +ELEVEN_API_KEY=your_key + +# Google Images przez SerpAPI (dla search_engine = "google") +SERPAPI_API_KEY=your_key + +# Gemini (dla parser_mode = "premium") +GEMINI_API_KEY=your_key + +# Elasticsearch (jeśli wymaga auth) +ES_HOST=localhost:9200 +ES_USER=elastic +ES_PASS=password ``` diff --git a/preprocessor/SEARCH_GUIDE.md b/preprocessor/SEARCH_GUIDE.md index 06caa66b7..9b2aeaa67 100644 --- a/preprocessor/SEARCH_GUIDE.md +++ b/preprocessor/SEARCH_GUIDE.md @@ -1,13 +1,17 @@ -# Ranczo Search +# Search CLI do przeszukiwania Elasticsearch. Wymaga ES na `localhost:9200` (lub inny `--host`) z zaindeksowanymi danymi. -**Indeksy:** `ranczo_segments` • `ranczo_text_embeddings` • `ranczo_video_frames` • `ranczo_episode_names` +**Multi-series:** Każdy serial ma własne indeksy (np. `ranczo_clips_*`, `kiepscy_clips_*`). Użyj `--series nazwa_serii`, aby wybrać, który serial przeszukać. + +**Indeksy (przykład dla ranczo):** `ranczo_clips_text_segments` • `ranczo_clips_text_embeddings` • `ranczo_clips_video_frames` • `ranczo_clips_episode_names` --- ## Tryby wyszukiwania +**WAŻNE:** Wszystkie komendy wymagają parametru `--series nazwa_serii` (np. `--series ranczo`, `--series kiepscy`) + | Flaga | Opis | |-------|------| | `--text` | Full-text BM25, dokładne słowa | @@ -29,34 +33,39 @@ CLI do przeszukiwania Elasticsearch.
Wymaga ES na `localhost:9200` (lub inny `-- ```bash # Meta -./run-preprocessor.sh search --stats -./run-preprocessor.sh search --list-characters +./run-preprocessor.sh search --series ranczo --stats +./run-preprocessor.sh search --series ranczo --list-characters +./run-preprocessor.sh search --series kiepscy --stats # dla innego serialu # Text -./run-preprocessor.sh search --text "Kto tu rządzi" --limit 5 -./run-preprocessor.sh search --text-semantic "wesele" --season 10 +./run-preprocessor.sh search --series ranczo --text "Kto tu rządzi" --limit 5 +./run-preprocessor.sh search --series ranczo --text-semantic "wesele" --season 10 # Visual -./run-preprocessor.sh search --text-to-video "pocałunek" -./run-preprocessor.sh search --image /input_data/screenshot.jpg -./run-preprocessor.sh search --hash /input_data/frame.jpg # znajdź duplikaty -./run-preprocessor.sh search --hash "a1b2c3d4e5f6" # lub podaj hash bezpośrednio +./run-preprocessor.sh search --series ranczo --text-to-video "pocałunek" +./run-preprocessor.sh search --series ranczo --image /input_data/screenshot.jpg +./run-preprocessor.sh search --series ranczo --hash /input_data/frame.jpg # znajdź duplikaty +./run-preprocessor.sh search --series ranczo --hash "a1b2c3d4e5f6" # lub podaj hash bezpośrednio # Filtry i kombinacje -./run-preprocessor.sh search --character "Lucy Wilska" --season 10 -./run-preprocessor.sh search --emotion "happiness" --character "Lucy Wilska" -./run-preprocessor.sh search --emotion "sadness" --season 1 --episode 5 -./run-preprocessor.sh search --object "person:5+" # 5+ osób -./run-preprocessor.sh search --object "dog" --season 10 -./run-preprocessor.sh search --text-to-video "pocałunek" --character "Lucy Wilska" -./run-preprocessor.sh search --image /input_data/frame.jpg --season 10 --episode 1 +./run-preprocessor.sh search --series ranczo --character "Lucy Wilska" --season 10 +./run-preprocessor.sh search --series ranczo --emotion "happiness" --character "Lucy Wilska" 
+./run-preprocessor.sh search --series ranczo --emotion "sadness" --season 1 --episode 5 +./run-preprocessor.sh search --series ranczo --object "person:5+" # 5+ osób +./run-preprocessor.sh search --series ranczo --object "dog" --season 10 +./run-preprocessor.sh search --series ranczo --text-to-video "pocałunek" --character "Lucy Wilska" +./run-preprocessor.sh search --series ranczo --image /input_data/frame.jpg --season 10 --episode 1 # Episode -./run-preprocessor.sh search --episode-name "Spadek" -./run-preprocessor.sh search --episode-name-semantic "wesele" +./run-preprocessor.sh search --series ranczo --episode-name "Spadek" +./run-preprocessor.sh search --series ranczo --episode-name-semantic "wesele" # Output -./run-preprocessor.sh search --text "Lucy" --json-output | jq '.hits[]' +./run-preprocessor.sh search --series ranczo --text "Lucy" --json-output | jq '.hits[]' + +# Inne seriale +./run-preprocessor.sh search --series kiepscy --text "Ferdek" +./run-preprocessor.sh search --series kiepscy --character "Halina Kiepska" --emotion "anger" ``` --- @@ -65,6 +74,7 @@ CLI do przeszukiwania Elasticsearch. Wymaga ES na `localhost:9200` (lub inny `-- | Filtr | Użycie | |-------|--------| +| `--series NAME` | **WYMAGANY:** Nazwa serialu (np. ranczo, kiepscy) | | `--season N` | Sezon | | `--episode N` | Odcinek | | `--character NAME` | Postać (case-sensitive) | @@ -164,11 +174,11 @@ curl http://localhost:9200 # Oczekiwany output: {"name": "...", "cluster_name": "...", ...} # Test indeksów -./run-preprocessor.sh search --stats +./run-preprocessor.sh search --series ranczo --stats # Powinno pokazać liczby dokumentów w każdym indeksie # Brak wyników dla postaci (case-sensitive!) 
-./run-preprocessor.sh search --list-characters | grep -i "lucy" +./run-preprocessor.sh search --series ranczo --list-characters | grep -i "lucy" # Użyj dokładnej nazwy: "Lucy Wilska" nie "lucy wilska" # Błąd "Cannot connect to Elasticsearch" @@ -181,8 +191,12 @@ nvidia-smi # sprawdź dostępność GPU # Plik obrazka nie znaleziony # Ścieżki w kontenerze: /input_data/ nie ./input_data/ -./run-preprocessor.sh search --image /input_data/screenshot.jpg # ✓ -./run-preprocessor.sh search --image ./input_data/screenshot.jpg # ✗ +./run-preprocessor.sh search --series ranczo --image /input_data/screenshot.jpg # ✓ +./run-preprocessor.sh search --series ranczo --image ./input_data/screenshot.jpg # ✗ + +# Brak parametru --series +./run-preprocessor.sh search --text "Lucy" # ✗ Błąd: --series jest wymagany +./run-preprocessor.sh search --series ranczo --text "Lucy" # ✓ ``` **Wymagania:** diff --git a/preprocessor/__main__.py b/preprocessor/__main__.py index 5b961d0ed..a3072a518 100644 --- a/preprocessor/__main__.py +++ b/preprocessor/__main__.py @@ -2,14 +2,13 @@ import sys from preprocessor.cli import cli -from preprocessor.utils.console import console +from preprocessor.services.ui.console import console logging.getLogger('matplotlib').setLevel(logging.ERROR) logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) - -if __name__ == "__main__": +if __name__ == '__main__': try: cli() except KeyboardInterrupt: - console.print("\n[yellow]Operation cancelled by user[/yellow]") + console.print('\n[yellow]Operation cancelled by user[/yellow]') sys.exit(130) diff --git a/preprocessor/app/__init__.py b/preprocessor/app/__init__.py new file mode 100644 index 000000000..6d5ad1b34 --- /dev/null +++ b/preprocessor/app/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.app.pipeline_factory import ( + build_pipeline, + visualize, +) + +__all__ = ["build_pipeline", "visualize"] diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py new file mode 100644 index 
class PipelineDefinition:
    """Named registry of pipeline steps plus the dependency graph linking them.

    Steps are added with ``register()``. ``validate()`` then builds a networkx
    ``DiGraph`` with one node per step and an edge from each dependency to the
    step that needs it, and checks that the graph is acyclic. Ordering queries
    require a successfully validated pipeline.
    """

    # Phases rendered first by to_ascii_art(), in this order. Any other phase
    # follows in registration order so that no registered step is hidden.
    _PHASE_DISPLAY_ORDER = ("SCRAPING", "PROCESSING", "INDEXING", "VALIDATION")

    def __init__(self, name: str) -> None:
        self.__name = name
        self.__steps: Dict[str, StepBuilder] = {}
        # None until validate() succeeds; reset whenever the step set changes.
        self.__graph: Optional[nx.DiGraph] = None

    @property
    def name(self) -> str:
        """Pipeline name given at construction."""
        return self.__name

    def register(self, step: StepBuilder) -> None:
        """Add ``step`` to the pipeline.

        Raises:
            ValueError: if a step with the same id is already registered.
        """
        if step.id in self.__steps:
            raise ValueError(
                f"DUPLICATE STEP:\n"
                f"  Step '{step.id}' is already registered in the pipeline!\n"
                f"  Check build_pipeline() in pipeline_factory.py",
            )
        self.__steps[step.id] = step
        # A previously validated graph no longer describes the step set.
        self.__graph = None

    def get_step(self, step_id: str) -> StepBuilder:
        """Return the step registered under ``step_id``.

        Raises:
            KeyError: if no such step is registered.
        """
        if step_id not in self.__steps:
            raise KeyError(
                f"Step '{step_id}' not found. Available: {list(self.__steps.keys())}",
            )
        return self.__steps[step_id]

    def get_all_steps(self) -> Dict[str, StepBuilder]:
        """Return a shallow copy of the id -> step mapping."""
        return dict(self.__steps)

    def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None:
        """Build the dependency graph and verify that it is a DAG.

        The graph is stored on the instance only after every check has passed,
        so a failed validation never leaves a partial or cyclic graph behind.

        Args:
            logger: receives the success message via ``info``; when omitted
                the message is printed instead.

        Raises:
            ValueError: if a step needs an unregistered step, or if the
                dependencies form a cycle.
        """
        self.__graph = None
        graph = nx.DiGraph()

        for step_id, step in self.__steps.items():
            graph.add_node(step_id, step=step)

        for step_id, step in self.__steps.items():
            for dep_id in step.dependency_ids:
                if dep_id not in self.__steps:
                    self.__raise_missing_dependency_error(step_id, dep_id)
                graph.add_edge(dep_id, step_id)

        if not nx.is_directed_acyclic_graph(graph):
            self.__raise_cycle_error(graph)

        self.__graph = graph

        message = (
            f"Pipeline '{self.__name}' validated successfully:\n"
            f"  - {len(self.__steps)} steps registered\n"
            f"  - DAG structure confirmed\n"
            f"  - No cyclic dependencies"
        )

        if logger:
            logger.info(message)
        else:
            print(message)

    def get_execution_order(
        self, targets: Optional[List[str]] = None, skip: Optional[List[str]] = None,
    ) -> List[str]:
        """Return step ids in a topological order of the validated graph.

        Args:
            targets: when given (and non-empty), only these steps and their
                ancestors are kept.
            skip: step ids removed from the final order.

        Raises:
            RuntimeError: if the pipeline has not been validated.
            ValueError: if a target is not a registered step.
        """
        # Explicit None check: an empty DiGraph has length 0 and is falsy,
        # which would misreport a validated empty pipeline as unvalidated.
        if self.__graph is None:
            raise RuntimeError(
                "Pipeline not validated! Call pipeline.validate() first.",
            )

        full_order: List[str] = list(nx.topological_sort(self.__graph))

        if targets:
            required: Set[str] = set()
            for target in targets:
                if target not in self.__steps:
                    raise ValueError(f"Target step '{target}' does not exist in pipeline")
                required.add(target)
                required.update(nx.ancestors(self.__graph, target))
            full_order = [s for s in full_order if s in required]

        skip_set: Set[str] = set(skip or [])
        return [s for s in full_order if s not in skip_set]

    def to_ascii_art(self) -> str:
        """Render the pipeline as text, grouped by phase.

        Validates the pipeline first if that has not been done yet.
        """
        if self.__graph is None:
            self.validate()

        lines: List[str] = [
            "=" * 80,
            f"PIPELINE: {self.__name}",
            "=" * 80,
            "",
        ]

        phases: Dict[str, List[StepBuilder]] = self.__group_steps_by_phase()

        # Known phases first, then every remaining phase in registration order.
        ordered_phases = [p for p in self._PHASE_DISPLAY_ORDER if p in phases]
        ordered_phases += [p for p in phases if p not in self._PHASE_DISPLAY_ORDER]

        for phase_name in ordered_phases:
            lines.append(f"[{phase_name}]")
            lines.append("-" * 80)

            for step in phases[phase_name]:
                deps_str = f" <- needs: {', '.join(step.dependency_ids)}" if step.dependency_ids else ""
                # produces may hold descriptor objects, which str.join rejects.
                produces_str = ', '.join(str(output) for output in step.produces)
                lines.append(f"  {step.id}{deps_str}")
                lines.append(f"    -> produces: {produces_str}")
                lines.append(f"    -> {step.description}\n")

        lines.append("=" * 80)
        return "\n".join(lines)

    def __group_steps_by_phase(self) -> Dict[str, List[StepBuilder]]:
        """Group steps by ``step.phase.name``, keeping registration order."""
        phases: Dict[str, List[StepBuilder]] = {}
        for step in self.__steps.values():
            phases.setdefault(step.phase.name, []).append(step)
        return phases

    def __raise_cycle_error(self, graph: nx.DiGraph) -> None:
        """Raise ValueError describing the first cycle found in ``graph``."""
        # simple_cycles is a generator; take one cycle instead of listing all.
        cycle = next(nx.simple_cycles(graph))
        cycle_path = " -> ".join(cycle) + f" -> {cycle[0]}"

        raise ValueError(
            f"\n{'=' * 80}\n"
            f"PIPELINE DEPENDENCY CYCLE DETECTED\n"
            f"{'=' * 80}\n\n"
            f"Cyclic dependency detected:\n"
            f"  {cycle_path}\n\n"
            f"Steps in cycle: {', '.join(cycle)}\n\n"
            f"Pipeline must be a DAG (Directed Acyclic Graph).\n"
            f"Remove one of the dependencies to break the cycle.\n"
            f"\n{'=' * 80}\n",
        )

    def __raise_missing_dependency_error(self, step_id: str, missing_dep_id: str) -> None:
        """Raise ValueError for a step that needs an unregistered step."""
        raise ValueError(
            f"\n{'=' * 80}\n"
            f"PIPELINE DEPENDENCY ERROR\n"
            f"{'=' * 80}\n\n"
            f"Step:     '{step_id}'\n"
            f"Needs:    '{missing_dep_id}'\n"
            f"Issue:    Step '{missing_dep_id}' is not registered!\n\n"
            f"Solution:\n"
            f"  1. Check build_pipeline() in preprocessor/app/pipeline_factory.py\n"
            f"  2. Ensure '{missing_dep_id}' is added via pipeline.register()\n"
            f"  3. Or remove '{missing_dep_id}' from needs=[...] in definition of '{step_id}'\n"
            f"\n{'=' * 80}\n",
        )

    def __repr__(self) -> str:
        return f"PipelineDefinition(name='{self.__name}', steps={len(self.__steps)})"
class PipelineExecutor:
    """Runs pipeline steps over the discovered source videos.

    Global steps are executed once with no artifact. Episode steps are
    executed per artifact, either one by one or through the step's batch
    API, and their outputs become the inputs of the following step.
    """

    def __init__(self, context: ExecutionContext) -> None:
        self.__context = context
        self.__steps: List[PipelineStep] = []

    def add_step(self, step: PipelineStep) -> "PipelineExecutor":
        """Queue ``step`` for ``run()``; returns self so calls can be chained."""
        self.__steps.append(step)
        return self

    def cleanup(self) -> None:
        """Call ``cleanup()`` on every queued step that defines it.

        Best effort: a failing cleanup is logged and the remaining steps are
        still cleaned up.
        """
        for step in self.__steps:
            if hasattr(step, "cleanup"):
                try:
                    step.cleanup()
                except Exception as e:
                    self.__context.logger.error(f"Cleanup failed for step {step.name}: {e}")

    def __discover_source_videos(
        self, source_path: Path, episode_manager: EpisodeManager,
    ) -> List[SourceVideo]:
        """Build the initial artifacts from files, else from episodes.json."""
        video_files = VideoDiscovery.discover(source_path)
        self.__context.logger.info(
            f"Discovered {len(video_files)} video files in {source_path}",
        )

        if video_files:
            return self.__build_source_videos_from_files(video_files, episode_manager)

        self.__context.logger.info(
            "No input files found — building episode list from episodes.json",
        )
        return self.__build_source_videos_from_episodes(episode_manager)

    def __build_source_videos_from_files(
        self, video_files: List[Path], episode_manager: EpisodeManager,
    ) -> List[SourceVideo]:
        """Create one SourceVideo per file whose name can be parsed.

        Files that ``episode_manager.parse_filename`` cannot parse are logged
        and skipped.
        """
        source_videos: List[SourceVideo] = []
        for video_file in video_files:
            episode_info = episode_manager.parse_filename(video_file)
            if not episode_info:
                self.__context.logger.warning(f"Cannot parse: {video_file}")
                continue
            episode_id = episode_manager.get_episode_id_for_state(episode_info)
            source_videos.append(
                SourceVideo(
                    path=video_file,
                    episode_id=episode_id,
                    episode_info=episode_info,
                ),
            )
        return source_videos

    def __build_source_videos_from_episodes(
        self, episode_manager: EpisodeManager,
    ) -> List[SourceVideo]:
        """Create path-less SourceVideo entries for every known episode."""
        all_episodes = episode_manager.get_all_episodes()
        if not all_episodes:
            self.__context.logger.warning(
                "No episodes in episodes.json and no input files — nothing to process",
            )
            return []

        self.__context.logger.info(
            f"Building source from {len(all_episodes)} episodes in episodes.json",
        )
        return [
            SourceVideo(
                path=Path(''),  # placeholder: there is no input file
                episode_id=episode_manager.get_episode_id_for_state(ep),
                episode_info=ep,
            )
            for ep in all_episodes
        ]

    def __execute_step_with_registry(
        self,
        pipeline: "PipelineDefinition",
        step_id: str,
        artifact_registry: Dict[str, List[Any]],
    ) -> None:
        """Instantiate and run one step, recording its outputs by step id."""
        step_def = pipeline.get_step(step_id)
        self.__context.logger.info(f"Step: {step_id}")
        self.__context.logger.info(f"{step_def.description}")

        instance = step_def.step_class(step_def.config)

        if instance.is_global:
            self.__run_global_step(instance)
        else:
            input_artifacts = self.__get_input_artifacts(step_def, artifact_registry)
            output_artifacts = self.__run_episode_step(instance, input_artifacts)
            artifact_registry[step_id] = output_artifacts

        self.__context.logger.info(f"Step '{step_id}' completed")

    @staticmethod
    def __get_input_artifacts(
        step_def,
        artifact_registry: Dict[str, List[Any]],
    ) -> List[Any]:
        """Pick the input artifacts for ``step_def``.

        Uses the outputs recorded for the step's first dependency and falls
        back to the ``'__source__'`` entry when the step has no dependencies
        or that dependency recorded nothing.
        """
        if not step_def.dependency_ids:
            return artifact_registry.get('__source__', [])

        input_source_id = step_def.dependency_ids[0]
        artifacts = artifact_registry.get(input_source_id, [])

        if not artifacts:
            return artifact_registry.get('__source__', [])

        return artifacts

    def execute_step(
        self,
        pipeline: "PipelineDefinition",
        step_id: str,
        source_path: Path,
        episode_manager: EpisodeManager,
    ) -> None:
        """Run a single step of ``pipeline`` in a fresh executor."""
        step_def = pipeline.get_step(step_id)
        self.__context.logger.info(f"Step: {step_id}")
        self.__context.logger.info(f"{step_def.description}")

        instance = step_def.step_class(step_def.config)

        runner = PipelineExecutor(self.__context)
        runner.add_step(instance)
        runner.run(source_path, episode_manager)

        self.__context.logger.info(f"Step '{step_id}' completed")

    def execute_steps(
        self,
        pipeline: "PipelineDefinition",
        step_ids: List[str],
        source_path: Path,
        episode_manager: EpisodeManager,
    ) -> None:
        """Run ``step_ids`` in order, passing artifacts between steps."""
        artifact_registry: Dict[str, List[Any]] = {}
        source_artifacts = self.__discover_source_videos(source_path, episode_manager)
        artifact_registry['__source__'] = source_artifacts

        for step_id in step_ids:
            self.__context.logger.info(f"{'=' * 80}")
            self.__execute_step_with_registry(
                pipeline, step_id, artifact_registry,
            )

    def run(self, source_path: Path, episode_manager: EpisodeManager) -> None:
        """Run every queued step, chaining episode artifacts step to step."""
        current_artifacts = self.__discover_source_videos(source_path, episode_manager)

        for step in self.__steps:
            if step.is_global:
                self.__run_global_step(step)
            else:
                current_artifacts = self.__run_episode_step(step, current_artifacts)

    def __run_global_step(self, step: PipelineStep) -> None:
        """Execute a global step once, tracked under the episode id 'all'."""
        self.__context.logger.info(f"=== Running Global Step: {step.name} ===")

        if step.uses_global_completion and self.__should_skip_global_step(step.name):
            self.__context.logger.info(f"Skipping {step.name} (already completed)")
            return

        try:
            if step.uses_global_completion:
                self.__mark_step_in_progress(step.name, 'all')
            step.execute(None, self.__context)
            if step.uses_global_completion:
                self.__mark_step_completed(step.name, 'all')
        except Exception as e:
            self.__context.logger.error(f"Global step {step.name} failed: {e}")
            raise

    def __should_skip_global_step(self, step_name: str) -> bool:
        """True when the state manager reports the step done and no rerun is forced."""
        if self.__context.force_rerun:
            return False

        if self.__context.state_manager is None:
            return False

        return self.__context.state_manager.is_step_completed(step_name, 'all')

    def __run_episode_step(
        self, step: PipelineStep, current_artifacts: List[Any],
    ) -> List[Any]:
        """Dispatch an episode step to the batch or sequential runner."""
        self.__context.logger.info(f"=== Running Step: {step.name} ===")

        if self.__should_use_batch_processing(step):
            return self.__run_episode_step_batch(step, current_artifacts)
        return self.__run_episode_step_sequential(step, current_artifacts)

    def __should_use_batch_processing(self, step: PipelineStep) -> bool:
        """Decide whether ``step`` may run through its batch API.

        Batch mode is off when disabled on the context, when the step config
        sets ``enable_parallel`` to a falsy value, or when the step does not
        support batch processing.
        """
        if self.__context.disable_parallel:
            self.__context.logger.info(
                f"Batch processing disabled globally for {step.name}",
            )
            return False

        if hasattr(step.config, 'enable_parallel'):
            if not step.config.enable_parallel:
                self.__context.logger.info(
                    f"Batch processing disabled by config for {step.name}",
                )
                return False

        if not step.supports_batch_processing:
            return False

        return True

    def __all_episodes_completed(
        self, step: PipelineStep, artifacts: List[Any],
    ) -> bool:
        """True when every artifact is already done for a caching step.

        Checks the state manager first (when present) and otherwise asks the
        step whether all outputs exist. Always False on forced rerun or for
        steps that do not use caching.
        """
        if self.__context.force_rerun or not step.uses_caching:
            return False
        if self.__context.state_manager is not None:
            if all(
                self.__context.state_manager.is_step_completed(step.name, art.episode_id)
                for art in artifacts
            ):
                return True
        return step.all_outputs_exist(artifacts, self.__context)

    def __run_episode_step_sequential(
        self, step: PipelineStep, current_artifacts: List[Any],
    ) -> List[Any]:
        """Execute ``step`` for each artifact in turn.

        Returns the step results; an artifact whose result is falsy is passed
        through unchanged. Any failure is logged and re-raised.
        """
        # Same guard as the batch path: with nothing to process, do not
        # report "all 0 episodes already completed" or consult the cache.
        if not current_artifacts:
            return []

        if self.__all_episodes_completed(step, current_artifacts):
            self.__context.logger.info(
                f'Step {step.name}: all {len(current_artifacts)} episodes already completed',
            )
            return step.load_all_from_cache(current_artifacts, self.__context)

        next_artifacts = []

        for artifact in current_artifacts:
            episode_id = artifact.episode_id

            try:
                if not step.uses_caching:
                    self.__mark_step_in_progress(step.name, episode_id)
                result = step.execute(artifact, self.__context)
                if not step.uses_caching:
                    self.__mark_step_completed(step.name, episode_id)

                next_artifacts.append(result or artifact)
            except Exception as e:
                self.__context.logger.error(
                    f"Step {step.name} failed for {artifact.episode_id}: {e}",
                )
                raise

        return next_artifacts

    def __run_episode_step_batch(
        self, step: PipelineStep, current_artifacts: List[Any],
    ) -> List[Any]:
        """Execute ``step`` for all artifacts through ``execute_batch``.

        Optional ``setup_resources`` / ``teardown_resources`` hooks on the
        step are called around the batch; teardown runs even on failure.
        """
        if not current_artifacts:
            return []

        if self.__all_episodes_completed(step, current_artifacts):
            self.__context.logger.info(
                f'Step {step.name}: all {len(current_artifacts)} episodes already completed',
            )
            return step.load_all_from_cache(current_artifacts, self.__context)

        workers = (
            step.config.max_parallel_episodes
            if hasattr(step.config, 'max_parallel_episodes')
            else 'N'
        )
        self.__context.logger.info(
            f"Batch processing {len(current_artifacts)} episodes with {workers} workers",
        )

        try:
            if hasattr(step, 'setup_resources'):
                step.setup_resources(self.__context)

            if not step.uses_caching:
                for artifact in current_artifacts:
                    self.__mark_step_in_progress(step.name, artifact.episode_id)

            results = list(step.execute_batch(current_artifacts, self.__context))

            # zip() below stops at the shorter sequence; make a count
            # mismatch visible instead of silently dropping episodes.
            if len(results) != len(current_artifacts):
                self.__context.logger.warning(
                    f"Step {step.name}: execute_batch returned {len(results)} results "
                    f"for {len(current_artifacts)} episodes",
                )

            next_artifacts = []
            for artifact, result in zip(current_artifacts, results):
                if not step.uses_caching:
                    self.__mark_step_completed(step.name, artifact.episode_id)
                next_artifacts.append(result or artifact)

            return next_artifacts

        finally:
            if hasattr(step, 'teardown_resources'):
                step.teardown_resources(self.__context)

    def __mark_step_completed(self, step_name: str, episode_id: str) -> None:
        """Record completion in the state manager, if one is configured."""
        if self.__context.state_manager is None:
            return
        self.__context.state_manager.mark_step_completed(step_name, episode_id)

    def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None:
        """Record the step start in the state manager, if one is configured."""
        if self.__context.state_manager is None:
            return
        self.__context.state_manager.mark_step_started(step_name, episode_id)
VideoEmbeddingConfig, +) +from preprocessor.core.output_descriptors import ( + DirectoryOutput, + FileOutput, + JsonFileOutput, + create_frames_output, +) +from preprocessor.services.media.resolution import Resolution +from preprocessor.services.text.import_step import TranscriptionImportStep +from preprocessor.steps.analysis.resolution_analysis_step import ResolutionAnalysisStep +from preprocessor.steps.audio.separation_step import SoundSeparationStep +from preprocessor.steps.packaging.archives_step import ArchiveGenerationStep +from preprocessor.steps.scraping.character_scraper_step import CharacterScraperStep +from preprocessor.steps.scraping.episode_scraper_step import EpisodeScraperStep +from preprocessor.steps.scraping.reference_processor_step import CharacterReferenceStep +from preprocessor.steps.search.document_generation_step import DocumentGeneratorStep +from preprocessor.steps.search.indexing_step import ElasticsearchIndexerStep +from preprocessor.steps.text.analysis_step import TextAnalysisStep +from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.episode_name_embedding_step import EpisodeNameEmbeddingStep +from preprocessor.steps.text.full_episode_embedding_step import FullEpisodeEmbeddingStep +from preprocessor.steps.text.sound_event_embedding_step import SoundEventEmbeddingStep +from preprocessor.steps.text.sound_events_step import SoundEventsStep +from preprocessor.steps.text.text_cleaning_step import TextCleaningStep +from preprocessor.steps.text.transcription_step import TranscriptionStep +from preprocessor.steps.validation.validator_step import ValidationStep +from preprocessor.steps.video.frame_export_step import FrameExporterStep +from preprocessor.steps.video.scene_detection_step import SceneDetectorStep +from preprocessor.steps.video.transcoding_step import VideoTranscoderStep +from preprocessor.steps.vision.character_detection_step import CharacterDetectorStep +from 
# Pipeline phases; every StepBuilder below is tagged with exactly one of them.
SCRAPING = Phase("SCRAPING", color="blue")
PROCESSING = Phase("PROCESSING", color="green")
INDEXING = Phase("INDEXING", color="yellow")
VALIDATION = Phase("VALIDATION", color="magenta")


def build_pipeline(series_name: str) -> PipelineDefinition:  # pylint: disable=too-many-locals,too-many-statements
    """Declare, register and validate every step of the series pipeline.

    Loads the SeriesConfig for ``series_name``, builds one StepBuilder per
    step (outputs, dependencies, config), registers them in a fixed order
    and validates the resulting PipelineDefinition before returning it.
    """
    series_config = SeriesConfig.load(series_name)
    scraping_cfg = series_config.scraping
    processing_cfg = series_config.processing
    reference_cfg = scraping_cfg.character_references
    reference_source = reference_cfg.source

    # Values shared by many step declarations.
    episode_json = "{season}/{episode}.json"
    embedding_model = "Qwen/Qwen3-VL-Embedding-8B"

    # ---------------------------------------------------------
    # Scraping
    # ---------------------------------------------------------
    episodes_cfg = scraping_cfg.episodes
    episodes_metadata = StepBuilder(
        step_class=EpisodeScraperStep,
        phase=SCRAPING,
        description="Scrapes episode metadata from wiki",
        needs=[],
        produces=[
            JsonFileOutput(
                pattern=f"{series_name}_episodes.json",
                subdir="",
                min_size_bytes=100,
            ),
        ],
        config=EpisodeScraperConfig(
            urls=episodes_cfg.urls,
            headless=True,
            merge_sources=True,
            scraper_method="crawl4ai",
            parser_mode=episodes_cfg.parser_mode,
        ),
    )

    characters_cfg = scraping_cfg.characters
    characters_metadata = StepBuilder(
        step_class=CharacterScraperStep,
        phase=SCRAPING,
        description="Scrapes character data from wiki",
        needs=[],
        produces=[
            JsonFileOutput(
                pattern=f"{series_name}_characters.json",
                subdir="",
                min_size_bytes=50,
            ),
        ],
        config=CharacterScraperConfig(
            urls=characters_cfg.urls,
            headless=True,
            scraper_method="crawl4ai",
            parser_mode=characters_cfg.parser_mode,
        ),
    )

    # ---------------------------------------------------------
    # Processing: video
    # ---------------------------------------------------------
    transcode_cfg = processing_cfg.transcode
    resolution_analysis = StepBuilder(
        step_class=ResolutionAnalysisStep,
        phase=PROCESSING,
        description="Analyze source video resolutions and warn if upscaling required",
        needs=[],
        produces=[],
        config=ResolutionAnalysisConfig(
            resolution=Resolution.from_string(transcode_cfg.resolution),
        ),
    )

    transcoded_videos = StepBuilder(
        step_class=VideoTranscoderStep,
        phase=PROCESSING,
        description=f"Conversion to h264_nvenc {transcode_cfg.resolution} with adaptive bitrate",
        needs=[resolution_analysis],
        produces=[
            FileOutput(
                pattern="{season}/{episode}.mp4",
                min_size_bytes=1024 * 1024,
            ),
        ],
        config=TranscodeConfig(
            max_bitrate_file_size_mb=transcode_cfg.max_bitrate_file_size_mb,
            max_bitrate_duration_seconds=transcode_cfg.max_bitrate_duration_seconds,
            keyframe_interval_seconds=transcode_cfg.keyframe_interval_seconds,
            min_bitrate_mbps=transcode_cfg.min_bitrate_mbps,
            bitrate_boost_ratio=transcode_cfg.bitrate_boost_ratio,
            force_deinterlace=transcode_cfg.force_deinterlace,
        ),
    )

    scene_cfg = processing_cfg.scene_detection
    scene_data = StepBuilder(
        step_class=SceneDetectorStep,
        phase=PROCESSING,
        description="Detects scene changes using TransNetV2",
        needs=[transcoded_videos],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                min_size_bytes=10,
            ),
        ],
        config=SceneDetectionConfig(
            threshold=scene_cfg.threshold,
            min_scene_len=scene_cfg.min_scene_len,
        ),
    )

    # The frames descriptor comes from create_frames_output() so the pipeline
    # can be validated before any step instance exists.
    # NOTE(review): it is meant to match FrameExporterStep.get_output_descriptors() - confirm.
    exported_frames = StepBuilder(
        step_class=FrameExporterStep,
        phase=PROCESSING,
        description="Exports frames (PNG) at scene boundaries",
        needs=[scene_data],
        produces=[create_frames_output()],
        config=FrameExportConfig(
            frames_per_scene=processing_cfg.frame_export.frames_per_scene,
        ),
    )

    # ---------------------------------------------------------
    # Processing: text & audio
    # ---------------------------------------------------------
    # Both transcription variants produce the same raw transcription files.
    raw_transcription_output = [
        JsonFileOutput(
            pattern="{season}/{episode_num}/{episode}.json",
            subdir="transcriptions/raw",
            min_size_bytes=50,
        ),
    ]
    import_cfg = processing_cfg.transcription_import
    if import_cfg:
        # Import existing transcriptions; this variant does not need the video.
        transcription_data = StepBuilder(
            step_class=TranscriptionImportStep,
            phase=PROCESSING,
            description=f"Import pre-existing {import_cfg.format_type} transcriptions",
            needs=[],
            produces=raw_transcription_output,
            config=TranscriptionImportConfig(
                source_dir=Path(import_cfg.source_dir),
                format_type=import_cfg.format_type,
                season_remap=import_cfg.season_remap,
            ),
        )
    else:
        transcription_cfg = processing_cfg.transcription
        transcription_data = StepBuilder(
            step_class=TranscriptionStep,
            phase=PROCESSING,
            description=f"Audio transcription using {transcription_cfg.mode}",
            needs=[transcoded_videos],
            produces=raw_transcription_output,
            config=TranscriptionConfig(
                mode=transcription_cfg.mode,
                model=transcription_cfg.model,
                language=transcription_cfg.language,
                device=transcription_cfg.device,
                max_parallel_episodes=1,
            ),
        )

    text_cleaning = StepBuilder(
        step_class=TextCleaningStep,
        phase=PROCESSING,
        description="Removes sound events from transcription segments",
        needs=[transcription_data],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                subdir="transcriptions/clean",
                min_size_bytes=10,
            ),
        ],
        config=TextCleaningConfig(),
    )

    sound_events = StepBuilder(
        step_class=SoundEventsStep,
        phase=PROCESSING,
        description="Extracts sound event segments from transcription",
        needs=[transcription_data],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                subdir="transcriptions/sound_events",
                min_size_bytes=10,
            ),
        ],
        config=SoundEventsConfig(),
    )

    separated_audio = StepBuilder(
        step_class=SoundSeparationStep,
        phase=PROCESSING,
        description="Separates dialogue from sound effects",
        needs=[transcription_data],
        produces=[
            DirectoryOutput(
                pattern="{season}/{episode}",
                expected_file_pattern="*.wav",
                min_files=1,
                min_size_per_file_bytes=1024,
            ),
        ],
        config=SoundSeparationConfig(),
    )

    text_stats = StepBuilder(
        step_class=TextAnalysisStep,
        phase=PROCESSING,
        description="Analyzes text statistics (word frequency, sentiment)",
        needs=[text_cleaning],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                min_size_bytes=50,
            ),
        ],
        config=TextAnalysisConfig(language=processing_cfg.transcription.language),
    )

    text_embeddings = StepBuilder(
        step_class=TextEmbeddingStep,
        phase=PROCESSING,
        description="Generates text embeddings using Qwen3-VL-Embedding",
        needs=[text_cleaning],
        produces=[
            FileOutput(
                pattern=episode_json,
                subdir="embeddings/text",
                min_size_bytes=1024,
            ),
        ],
        config=TextEmbeddingConfig(
            model_name=embedding_model,
            batch_size=8,
            device="cuda",
            text_sentences_per_chunk=8,
            text_chunk_overlap=3,
        ),
    )

    sound_event_embeddings = StepBuilder(
        step_class=SoundEventEmbeddingStep,
        phase=PROCESSING,
        description="Generates sound event embeddings using Qwen3-VL-Embedding",
        needs=[sound_events],
        produces=[
            FileOutput(
                pattern=episode_json,
                min_size_bytes=1024,
            ),
        ],
        config=SoundEventEmbeddingConfig(
            model_name=embedding_model,
            batch_size=64,
            device="cuda",
        ),
    )

    full_episode_embeddings = StepBuilder(
        step_class=FullEpisodeEmbeddingStep,
        phase=PROCESSING,
        description="Generates full episode embedding using Qwen3-VL-Embedding",
        needs=[text_cleaning],
        produces=[
            FileOutput(
                pattern=episode_json,
                min_size_bytes=1024,
            ),
        ],
        config=FullEpisodeEmbeddingConfig(
            model_name=embedding_model,
            device="cuda",
        ),
    )

    episode_name_embeddings = StepBuilder(
        step_class=EpisodeNameEmbeddingStep,
        phase=PROCESSING,
        description="Generates episode title embedding using Qwen3-VL-Embedding",
        needs=[text_cleaning],
        produces=[
            FileOutput(
                pattern=episode_json,
                min_size_bytes=1024,
            ),
        ],
        config=EpisodeNameEmbeddingConfig(
            model_name=embedding_model,
            device="cuda",
        ),
    )

    # ---------------------------------------------------------
    # Processing: vision
    # ---------------------------------------------------------
    series_face_clusters = StepBuilder(
        step_class=SeriesFaceClusteringStep,
        phase=PROCESSING,
        description="Clusters all faces across the series into numbered folders for manual labeling",
        needs=[exported_frames],
        produces=[
            JsonFileOutput(
                pattern="_cluster_index.json",
                subdir="character_clusters",
                min_size_bytes=10,
            ),
        ],
        config=SeriesFaceClusteringConfig(),
    )

    # Both reference sources end in the same processed-vectors directory.
    reference_vectors_output = DirectoryOutput(
        pattern="character_references_processed",
        subdir="",
        expected_file_pattern="**/face_vector.npy",
        min_files=1,
        min_size_per_file_bytes=100,
    )

    if reference_source == "clusters":
        character_reference_vectors = StepBuilder(
            step_class=CharacterReferenceProcessorStep,
            phase=PROCESSING,
            description="Builds character face vectors from manually labeled cluster frames",
            needs=[series_face_clusters],
            produces=[reference_vectors_output],
            config=CharacterReferenceProcessorConfig(reference_source="clusters"),
        )
        character_ref_steps = [character_reference_vectors]
    else:
        character_references = StepBuilder(
            step_class=CharacterReferenceStep,
            phase=SCRAPING,
            description="Downloads character reference images from the web",
            needs=[characters_metadata],
            produces=[
                DirectoryOutput(
                    pattern="character_faces",
                    subdir="",
                    expected_file_pattern="**/*.jpg",
                    min_files=1,
                    min_size_per_file_bytes=1024,
                ),
            ],
            config=CharacterReferenceConfig(
                search_engine=reference_cfg.search_engine,
                images_per_character=reference_cfg.images_per_character,
                search_query_template=reference_cfg.search_query_template,
            ),
        )
        character_reference_vectors = StepBuilder(
            step_class=CharacterReferenceProcessorStep,
            phase=SCRAPING,
            description="Processes character reference images into face embedding vectors",
            needs=[character_references],
            produces=[reference_vectors_output],
            config=CharacterReferenceProcessorConfig(reference_source="web"),
        )
        character_ref_steps = [character_references, character_reference_vectors]

    image_hashes = StepBuilder(
        step_class=ImageHashStep,
        phase=PROCESSING,
        description="Perceptual frame hashing (phash, dhash, wavelet)",
        needs=[exported_frames],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                min_size_bytes=50,
            ),
        ],
        config=ImageHashConfig(batch_size=32),
    )

    video_embeddings = StepBuilder(
        step_class=VideoEmbeddingStep,
        phase=PROCESSING,
        description="Visual embeddings using Qwen3-VL-Embedding",
        needs=[exported_frames, image_hashes],
        produces=[
            FileOutput(
                pattern="{season}/{episode}.npy",
                min_size_bytes=1024,
            ),
        ],
        config=VideoEmbeddingConfig(
            model_name=embedding_model,
            batch_size=8,
            device="cuda",
        ),
    )

    character_detections = StepBuilder(
        step_class=CharacterDetectorStep,
        phase=PROCESSING,
        description="Recognizes characters in frames using InsightFace",
        needs=[exported_frames, character_reference_vectors],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                min_size_bytes=10,
            ),
        ],
        config=CharacterDetectionConfig(threshold=0.45, max_parallel_episodes=4),
    )

    emotion_data = StepBuilder(
        step_class=EmotionDetectionStep,
        phase=PROCESSING,
        description="Detects emotions on faces using EmoNet",
        needs=[exported_frames],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                min_size_bytes=10,
            ),
        ],
        config=EmotionDetectionConfig(),
    )

    object_detections = StepBuilder(
        step_class=ObjectDetectionStep,
        phase=PROCESSING,
        description="General object detection using D-FINE",
        needs=[exported_frames],
        produces=[
            JsonFileOutput(
                pattern=episode_json,
                min_size_bytes=10,
            ),
        ],
        config=ObjectDetectionConfig(),
    )

    # ---------------------------------------------------------
    # Indexing
    # ---------------------------------------------------------
    elastic_documents = StepBuilder(
        step_class=DocumentGeneratorStep,
        phase=INDEXING,
        description="Combines all data into Elasticsearch documents",
        needs=[
            text_embeddings,
            video_embeddings,
            character_detections,
            emotion_data,
            object_detections,
        ],
        produces=[
            FileOutput(
                pattern="{season}/{episode}_text_segments.jsonl",
                subdir="elastic_documents/text_segments",
                min_size_bytes=0,
            ),
        ],
        config=DocumentGenerationConfig(),
    )

    episode_archives = StepBuilder(
        step_class=ArchiveGenerationStep,
        phase=INDEXING,
        description="Creates ZIP archives per episode (all artifacts)",
        needs=[elastic_documents],
        produces=[
            FileOutput(
                pattern="{season}/{episode}.zip",
                subdir="archives",
                min_size_bytes=1024 * 100,
            ),
        ],
        config=ArchiveConfig(),
    )

    elasticsearch_cfg = series_config.indexing.elasticsearch
    indexed_data = StepBuilder(
        step_class=ElasticsearchIndexerStep,
        phase=INDEXING,
        description="Indexes documents into Elasticsearch",
        needs=[elastic_documents],
        produces=[],
        config=ElasticsearchConfig(
            index_name=elasticsearch_cfg.index_name,
            host=elasticsearch_cfg.host,
            dry_run=elasticsearch_cfg.dry_run,
            append=elasticsearch_cfg.append,
        ),
    )

    # ---------------------------------------------------------
    # Validation
    # ---------------------------------------------------------
    validation = StepBuilder(
        step_class=ValidationStep,
        phase=VALIDATION,
        description="Validates all processed data and generates reports",
        needs=[indexed_data, episode_archives],
        produces=[
            DirectoryOutput(
                pattern="{season}",
                expected_file_pattern="*.json",
                min_files=1,
                min_size_per_file_bytes=50,
            ),
        ],
        config=ValidationConfig(),
    )

    # ---------------------------------------------------------
    # Registration (order is kept exactly as declared here)
    # ---------------------------------------------------------
    pipeline = PipelineDefinition(name=f"{series_name}_processing")

    registration_order = [
        episodes_metadata,
        characters_metadata,
        resolution_analysis,
        transcoded_videos,
        scene_data,
        exported_frames,
        transcription_data,
        text_cleaning,
        sound_events,
        separated_audio,
        text_stats,
        text_embeddings,
        sound_event_embeddings,
        full_episode_embeddings,
        episode_name_embeddings,
        image_hashes,
        video_embeddings,
        series_face_clusters,
        *character_ref_steps,
        object_detections,
        character_detections,
        emotion_data,
        elastic_documents,
        episode_archives,
        indexed_data,
        validation,
    ]
    for step in registration_order:
        pipeline.register(step)

    pipeline.validate()

    return pipeline
@dataclass(frozen=True)
class Phase:
    """Immutable pipeline phase: a name plus a display colour."""

    name: str
    color: str


@dataclass
class StepBuilder:
    """Declarative description of one pipeline step.

    Holds the step class, its config, the outputs it produces and the steps
    it depends on. When ``id`` is not given it is derived from the step class
    name. Equality and hashing are based on ``id`` only.
    """

    description: str
    step_class: Type[PipelineStep]
    phase: Phase
    produces: Union[List[str], List[OutputDescriptor]]
    id: Optional[str] = None
    config: Any = None
    needs: List[StepBuilder] = field(default_factory=list)

    @property
    def dependency_ids(self) -> List[str]:
        """Ids of the steps listed in ``needs``, in declaration order."""
        return [step.id for step in self.needs]

    def get_output_descriptors(self) -> List[OutputDescriptor]:
        """Return the OutputDescriptor entries of ``produces`` as a new list.

        Plain-string entries are ignored, so a mixed list never leaks strings
        to callers that expect descriptors.
        """
        if not self.produces:
            return []
        return [
            output for output in self.produces
            if isinstance(output, OutputDescriptor)
        ]

    def validate_outputs(
        self,
        base_dir: Path,
        context_vars: Optional[Dict[str, str]] = None,
    ) -> Dict[str, ValidationResult]:
        """Validate every output descriptor against ``base_dir``.

        Returns:
            Mapping of ``'<step id>_output_<index>'`` to the descriptor's
            validation result.
        """
        results = {}
        for idx, descriptor in enumerate(self.get_output_descriptors()):
            result = descriptor.validate(base_dir, context_vars)
            results[f'{self.id}_output_{idx}'] = result
        return results

    def get_dependency_outputs(self) -> Dict[str, List[OutputDescriptor]]:
        """Map each dependency id to that dependency's output descriptors."""
        return {
            dep.id: dep.get_output_descriptors()
            for dep in self.needs
        }

    def __post_init__(self) -> None:
        if self.id is None:
            # The dataclass is not frozen, so plain assignment is sufficient.
            self.id = self.__generate_id_from_class()
        self.__validate_id()

    def __generate_id_from_class(self) -> str:
        """Derive a snake_case id from the step class name.

        A trailing ``Step`` is dropped and an underscore is inserted before
        every capital letter except the first, e.g. ``TextCleaningStep``
        becomes ``text_cleaning``.
        """
        # NOTE(review): the original text of this method is garbled in the
        # source; the conversion below is a reconstruction - confirm.
        class_name = self.step_class.__name__
        class_name_without_step = re.sub(r'Step$', '', class_name)
        snake_case = re.sub(r'(?<!^)(?=[A-Z])', '_', class_name_without_step).lower()
        return snake_case

    def __validate_id(self) -> None:
        """Reject empty ids and ids with characters other than alphanumerics, '_' and '-'.

        Raises:
            ValueError: if the id is invalid.
        """
        if not self.id or not self.id.replace("_", "").replace("-", "").isalnum():
            # The message names hyphens too, because the check accepts them.
            raise ValueError(
                f"Invalid step_id: '{self.id}'. "
                f"Use only alphanumeric characters, underscores and hyphens.",
            )

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, StepBuilder):
            # Let Python try the reflected comparison before deciding.
            return NotImplemented
        return self.id == other.id

    def __hash__(self) -> int:
        return hash(self.id)

    def __repr__(self) -> str:
        deps = f", needs={self.dependency_ids}" if self.needs else ""
        return f"StepBuilder(id='{self.id}'{deps})"
-class CharacterDetector(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=9, - loglevel=logging.DEBUG, - ) - - self.frames_dir: Path = self._args["frames_dir"] - self.characters_dir: Path = self._args.get("characters_dir", settings.character.output_dir) - self.threshold: float = settings.character.frame_detection_threshold - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.face_app: FaceAnalysis = None - self.character_vectors: Dict[str, np.ndarray] = {} - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "frames_dir" not in args: - raise ValueError("frames_dir is required") - - # pylint: disable=duplicate-code - def _get_processing_items(self) -> List[ProcessingItem]: - return self._get_episode_processing_items_from_metadata( - "**/*_frame_metadata.json", - self.frames_dir, - self.episode_manager, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - file_naming = FileNamingConventions(self.series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_output = OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.character_detections, - detections_filename, - ) - return [OutputSpec(path=detections_output, required=True)] - # pylint: enable=duplicate-code - - def _load_resources(self) -> bool: - if not self.characters_dir.exists(): - console.print(f"[red]Characters directory not found: {self.characters_dir}[/red]") - return False - - self.face_app = init_face_detection() - self.character_vectors = load_character_references(self.characters_dir, self.face_app) - - if not self.character_vectors: - console.print("[yellow]No character references loaded[/yellow]") - return 
False - - return True - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - frames_dir = metadata_file.parent - - frame_files = sorted([ - f for f in frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - fps = 25.0 - - results = process_frames_for_detection( - frame_files, - self.face_app, - self.character_vectors, - self.threshold, - fps=fps, - ) - save_character_detections(episode_info, results, fps=fps) diff --git a/preprocessor/characters/duckduckgo_search.py b/preprocessor/characters/duckduckgo_search.py deleted file mode 100644 index f70264593..000000000 --- a/preprocessor/characters/duckduckgo_search.py +++ /dev/null @@ -1,19 +0,0 @@ -from typing import ( - Dict, - List, -) - -from ddgs import DDGS - -from preprocessor.characters.base_image_search import BaseImageSearch - - -class DuckDuckGoImageSearch(BaseImageSearch): - @property - def name(self) -> str: - return "DuckDuckGo" - - def search(self, query: str) -> List[Dict[str, str]]: - with DDGS() as ddgs: - results = ddgs.images(query, max_results=self.max_results) - return list(results) diff --git a/preprocessor/characters/face_detection_utils.py b/preprocessor/characters/face_detection_utils.py deleted file mode 100644 index a1d94bd97..000000000 --- a/preprocessor/characters/face_detection_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -import warnings - -warnings.filterwarnings( - "ignore", - message=".*estimate.*is deprecated.*", - category=FutureWarning, - module="insightface", -) - -# pylint: disable=wrong-import-position -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import cv2 -from insightface.app import FaceAnalysis -import numpy as np -from numpy.linalg import norm - -from preprocessor.utils.console import console - -# pylint: enable=wrong-import-position - - -def load_character_references( - characters_dir: Path, - 
face_app: FaceAnalysis, -) -> Dict[str, np.ndarray]: - console.print("[blue]Loading character references...[/blue]") - character_vectors = {} - - for char_dir in characters_dir.iterdir(): - if not char_dir.is_dir(): - continue - - char_name = char_dir.name.replace("_", " ").title() - vector_file = char_dir / "face_vector.npy" - - if vector_file.exists(): - character_vectors[char_name] = np.load(vector_file) - console.print(f"[dim] ✓ {char_name}: loaded from face_vector.npy[/dim]") - continue - - images = list(char_dir.glob("*.jpg")) - - if not images: - continue - - embeddings = [] - for img_path in images: - emb = _get_face_embedding(str(img_path), face_app) - if emb is not None: - embeddings.append(emb) - - if embeddings: - mean_emb = np.mean(embeddings, axis=0) - centroid = mean_emb / norm(mean_emb) - character_vectors[char_name] = centroid - console.print(f"[green] ✓ {char_name}: {len(embeddings)} reference images[/green]") - - console.print(f"[green]✓ Loaded {len(character_vectors)} characters[/green]") - return character_vectors - - -def _get_face_embedding(img_path: str, face_app: FaceAnalysis) -> Optional[np.ndarray]: - img = cv2.imread(img_path) - if img is None: - return None - - faces = face_app.get(img) - if not faces: - return None - - faces.sort(key=lambda x: (x.bbox[2]-x.bbox[0]) * (x.bbox[3]-x.bbox[1]), reverse=True) - return faces[0].normed_embedding - - -def detect_characters_in_frame( - frame_path: Path, - face_app: FaceAnalysis, - character_vectors: Dict[str, np.ndarray], - threshold: float, -) -> List[Dict[str, Any]]: - img = cv2.imread(str(frame_path)) - if img is None: - return [] - - faces = face_app.get(img) - if not faces: - return [] - - detected = [] - - for face in faces: - face_embedding = face.normed_embedding - bbox = face.bbox.astype(int) - - best_match = None - best_similarity = threshold - - for char_name, char_vector in character_vectors.items(): - similarity = np.dot(face_embedding, char_vector) - - if similarity > 
best_similarity: - best_similarity = similarity - best_match = char_name - - if best_match is not None: - detected.append({ - "name": best_match, - "confidence": float(best_similarity), - "bbox": { - "x1": int(bbox[0]), - "y1": int(bbox[1]), - "x2": int(bbox[2]), - "y2": int(bbox[3]), - }, - }) - - detected.sort(key=lambda x: x["confidence"], reverse=True) - return detected diff --git a/preprocessor/characters/google_image_search.py b/preprocessor/characters/google_image_search.py deleted file mode 100644 index 878a8f60e..000000000 --- a/preprocessor/characters/google_image_search.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import ( - Dict, - List, -) - -from serpapi import GoogleSearch - -from preprocessor.characters.base_image_search import BaseImageSearch - - -class GoogleImageSearch(BaseImageSearch): - def __init__(self, api_key: str, max_results: int = 50): - super().__init__(max_results) - if not api_key: - raise ValueError("SerpAPI key is required for Google Image Search") - self.api_key = api_key - - @property - def name(self) -> str: - return "Google Images API" - - def search(self, query: str) -> List[Dict[str, str]]: - params = { - "engine": "google_images", - "q": query, - "hl": "pl", - "gl": "pl", - "api_key": self.api_key, - } - - search = GoogleSearch(params) - results = search.get_dict() - - images = [] - for img_result in results.get("images_results", [])[:self.max_results]: - images.append({ - "image": img_result.get("original"), - "thumbnail": img_result.get("thumbnail"), - }) - - return images diff --git a/preprocessor/characters/reference_downloader.py b/preprocessor/characters/reference_downloader.py deleted file mode 100644 index 5108363dd..000000000 --- a/preprocessor/characters/reference_downloader.py +++ /dev/null @@ -1,300 +0,0 @@ -from __future__ import annotations - -import json -import logging -from pathlib import Path -import random -import time -from typing import ( - Any, - Dict, - List, - Optional, -) - -import cv2 -from 
insightface.app import FaceAnalysis -import numpy as np -from patchright.sync_api import ( - BrowserContext, - Page, - sync_playwright, -) - -from preprocessor.characters.base_image_search import BaseImageSearch -from preprocessor.characters.duckduckgo_search import DuckDuckGoImageSearch -from preprocessor.characters.google_image_search import GoogleImageSearch -from preprocessor.characters.utils import init_face_detection -from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.utils.console import ( - console, - create_progress, -) - - -class CharacterReferenceDownloader(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=8, - loglevel=logging.DEBUG, - ) - - self.characters_json: Path = self._args["characters_json"] - self.series_name: str = self._args["series_name"] - self.output_dir: Path = self._args.get("output_dir", settings.character.output_dir) - self.images_per_character: int = self._args.get( - "images_per_character", - settings.character.reference_images_per_character, - ) - self.max_results: int = settings.image_scraper.max_results_to_scrape - self.min_width: int = settings.image_scraper.min_image_width - self.min_height: int = settings.image_scraper.min_image_height - self.use_gpu: bool = True - self.search_mode: str = self._args.get("search_mode", "normal") - - self.search_engine: BaseImageSearch = self.__create_search_engine() - self.face_app: FaceAnalysis = None - self.browser_context: Optional[BrowserContext] = None - - def __create_search_engine(self) -> BaseImageSearch: - if self.search_mode == "premium": - serpapi_key = settings.image_scraper.serpapi_key - return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) - return DuckDuckGoImageSearch(max_results=self.max_results) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "characters_json" not in 
args: - raise ValueError("characters_json is required") - if "series_name" not in args: - raise ValueError("series_name is required") - - def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool: - for char in characters: - char_name = char["name"] - output_folder = self.output_dir / char_name.replace(" ", "_").lower() - existing_images = list(output_folder.glob("*.jpg")) - if len(existing_images) < self.images_per_character: - return False - return True - - def _execute(self) -> None: - if not self.characters_json.exists(): - console.print(f"[red]Characters JSON not found: {self.characters_json}[/red]") - return - - with open(self.characters_json, encoding="utf-8") as f: - data = json.load(f) - - characters = data.get("characters", []) - if not characters: - console.print("[yellow]No characters found in JSON[/yellow]") - return - - if self.__all_references_exist(characters): - console.print(f"[green]✓ All reference images already exist for {len(characters)} characters (skipping)[/green]") - return - - self.face_app = init_face_detection() - - console.print(f"[blue]Downloading reference images for {len(characters)} characters...[/blue]") - - with sync_playwright() as p: - self.browser_context = p.chromium.launch_persistent_context( - user_data_dir="/tmp/patchright_profile", - headless=True, - args=[ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ], - ignore_default_args=['--enable-automation'], - ) - - with create_progress() as progress: - task = progress.add_task("Downloading references", total=len(characters)) - - for i, char in enumerate(characters): - char_name = char["name"] - downloaded = False - try: - downloaded = self.__download_character_references(char_name, progress) - except Exception as e: - self.logger.error(f"Failed to download references for {char_name}: {e}") - finally: - progress.advance(task) - - if downloaded and i < len(characters) - 1: - delay = random.uniform( - settings.image_scraper.request_delay_min, 
- settings.image_scraper.request_delay_max, - ) - time.sleep(delay) - - self.browser_context.close() - - console.print("[green]✓ Reference download completed[/green]") - - def __count_faces(self, img) -> int: - faces = self.face_app.get(img) - return len(faces) - - def _validate_and_decode_image(self, img_bytes: bytes, img_url: str) -> np.ndarray | None: - if not img_bytes: - return None - - img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - - if img is None or img.size == 0: - self.logger.debug(f"Failed to decode image from {img_url}") - return None - - if len(img.shape) != 3 or img.shape[2] != 3: - self.logger.debug(f"Image has unexpected shape {img.shape} from {img_url}") - return None - - return img - - def __download_image_with_browser(self, img_url: str, page: Page) -> np.ndarray | None: - try: # pylint: disable=too-many-try-statements - response = page.goto( - img_url, - timeout=settings.image_scraper.page_navigation_timeout, - wait_until="domcontentloaded", - ) - if not response or response.status != 200: - return None - - content_type = response.headers.get("content-type", "") - if "image" not in content_type: - return None - - img_bytes = response.body() - if not img_bytes: - return None - - img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - - if img is None or img.size == 0: - self.logger.debug(f"Failed to decode image from {img_url}") - return None - - if len(img.shape) != 3 or img.shape[2] != 3: - self.logger.debug(f"Image has unexpected shape {img.shape} from {img_url}") - return None - - return img - - except TimeoutError: - self.logger.debug(f"Timeout downloading image {img_url}") - return None - except Exception as e: - if "net::ERR_CONNECTION_CLOSED" in str(e) or "Navigation" in str(e): - self.logger.debug(f"Connection/navigation error for {img_url}: {e}") - else: - self.logger.debug(f"Failed to download image {img_url}: 
{e}") - return None - - def __download_character_references(self, char_name: str, progress) -> bool: # pylint: disable=too-many-locals,too-many-statements - search_query = f"Serial {self.series_name} {char_name} postać" - output_folder = self.output_dir / char_name.replace(" ", "_").lower() - output_folder.mkdir(parents=True, exist_ok=True) - - existing_images = list(output_folder.glob("*.jpg")) - if len(existing_images) >= self.images_per_character: - progress.console.print( - f"[green]✓ {char_name}: {len(existing_images)} images already exist (skipping)[/green]", - ) - return False - - progress.console.print(f"[cyan]Searching [{self.search_engine.name}]: {search_query}[/cyan]") - - saved_count = len(existing_images) - processed = 0 - - for attempt in range(settings.image_scraper.retry_attempts): # pylint: disable=too-many-nested-blocks - try: - results = self.search_engine.search(search_query) - - sorted_results = sorted( - results, - key=lambda x: ( - 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, - 1 if x.get('image', '').lower().endswith('.png') else 2, - ), - ) - - page = self.browser_context.new_page() - - try: - for res in sorted_results: - if saved_count >= self.images_per_character: - break - - img_url = res['image'] - processed += 1 - - try: - img = self.__download_image_with_browser(img_url, page) - - if img is None: - continue - - if not isinstance(img, np.ndarray) or img.size == 0: - self.logger.debug(f"Invalid image array from {img_url}") - continue - - h, w = img.shape[:2] - if w < self.min_width or h < self.min_height: - continue - - try: - face_count = self.__count_faces(img) - except Exception as face_err: - self.logger.debug(f"Face detection failed for {img_url}: {face_err}") - continue - - if face_count == 1: - filename = f"{saved_count:02d}.jpg" - path = output_folder / filename - cv2.imwrite(str(path), img) - saved_count += 1 - - except Exception as e: - self.logger.debug(f"Error processing image: {e}") - continue - - 
finally: - page.close() - - break - - except KeyboardInterrupt: - progress.console.print("\n[yellow]Download interrupted[/yellow]") - raise - except Exception as e: - if attempt < settings.image_scraper.retry_attempts - 1: - delay = settings.image_scraper.retry_delay * (2 ** attempt) - self.logger.warning( - f"Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {e}", - ) - time.sleep(delay) - else: - self.logger.error(f"All retry attempts failed for {char_name}: {e}") - - if saved_count >= self.images_per_character: - progress.console.print( - f"[green]✓[/green] {char_name}: {saved_count}/{self.images_per_character} images", - ) - elif saved_count > 0: - progress.console.print( - f"[yellow]⚠[/yellow] {char_name}: {saved_count}/{self.images_per_character} images (incomplete)", - ) - else: - progress.console.print(f"[red]✗[/red] {char_name}: No suitable images found") - - return True diff --git a/preprocessor/characters/reference_processor.py b/preprocessor/characters/reference_processor.py deleted file mode 100644 index c3f8ea4d7..000000000 --- a/preprocessor/characters/reference_processor.py +++ /dev/null @@ -1,758 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) -import warnings - -import cv2 -from insightface.app import FaceAnalysis -import numpy as np - -from preprocessor.characters.utils import init_face_detection -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.utils.console import console - -warnings.filterwarnings( - "ignore", - message=".*estimate.*is deprecated.*", - category=FutureWarning, - module="insightface", -) - - -@dataclass -class FaceData: - bbox: np.ndarray - face_vector: np.ndarray - source_image_path: Path - source_image_idx: int - face_img: np.ndarray - - -@dataclass 
-class CandidateFace: - faces: List[FaceData] - avg_similarity: float - - -class CharacterReferenceProcessor(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name="CharacterReferenceProcessor", - error_exit_code=20, - loglevel=logging.INFO, - ) - - self.characters_dir = args["characters_dir"] - self.output_dir = args["output_dir"] - self.similarity_threshold = args["similarity_threshold"] - self.interactive = args["interactive"] - - self.face_app: Optional[FaceAnalysis] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - required = ["characters_dir", "output_dir", "similarity_threshold", "interactive"] - for key in required: - if key not in args: - raise ValueError(f"Missing required argument: {key}") - - def _load_resources(self) -> bool: - self.face_app = init_face_detection() - return True - - def _get_processing_items(self) -> List[ProcessingItem]: - items = [] - - if not self.characters_dir.exists(): - console.print(f"[red]Characters directory not found: {self.characters_dir}[/red]") - return items - - for char_dir in sorted(self.characters_dir.iterdir()): - if not char_dir.is_dir(): - continue - - items.append( - ProcessingItem( - episode_id=char_dir.name, - input_path=char_dir, - metadata={"char_name": char_dir.name}, - ), - ) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - char_output_dir = self.output_dir / item.episode_id - - return [ - OutputSpec(path=char_output_dir / "metadata.json", required=True), - OutputSpec(path=char_output_dir / "face_vector.npy", required=True), - ] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - char_dir = item.input_path - char_name = item.metadata["char_name"] - - console.print(f"[blue]Processing character: {char_name}[/blue]") - - reference_images = sorted(char_dir.glob("*.jpg")) - - if len(reference_images) < 2: - console.print(f"[yellow]Skipping {char_name}: 
need at least 2 reference images, found {len(reference_images)}[/yellow]") - return - - all_faces = self.__detect_faces_in_references(reference_images) - - if not all_faces or not all_faces[0]: - console.print(f"[yellow]Skipping {char_name}: no faces detected in reference images[/yellow]") - return - - selected_faces = self.__find_common_face(all_faces, char_name, reference_images) - - if not selected_faces: - console.print(f"[yellow]Skipping {char_name}: could not identify common face[/yellow]") - return - - self.__save_processed_references(char_name, selected_faces, reference_images) - console.print(f"[green]✓ Processed {char_name}[/green]") - - def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[FaceData]]: - all_faces = [] - - for idx, img_path in enumerate(image_paths): - img = cv2.imread(str(img_path)) - if img is None: - console.print(f"[yellow]Warning: Could not read {img_path}[/yellow]") - all_faces.append([]) - continue - - console.print(f"[dim] {img_path.name}: detecting faces (image size: {img.shape[1]}x{img.shape[0]})...[/dim]") - faces = self.face_app.get(img) - console.print(f"[dim] Found {len(faces)} face(s)[/dim]") - - faces_data = [] - for face in faces: - bbox = face.bbox.astype(int) - x1, y1, x2, y2 = bbox - face_img = img[y1:y2, x1:x2] - - faces_data.append( - FaceData( - bbox=bbox, - face_vector=face.normed_embedding, - source_image_path=img_path, - source_image_idx=idx, - face_img=face_img, - ), - ) - - all_faces.append(faces_data) - - return all_faces - - def __find_common_face( - self, - all_faces: List[List[FaceData]], - char_name: str, - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - first_image_faces = all_faces[0] - remaining_images = all_faces[1:] - - candidates = [] - - for first_face in first_image_faces: - matched_faces = [first_face] - similarities = [] - - for other_image_faces in remaining_images: - if not other_image_faces: - break - - best_match = None - best_similarity = -1.0 - - for 
other_face in other_image_faces: - similarity = np.dot(first_face.face_vector, other_face.face_vector) - if similarity > best_similarity: - best_similarity = similarity # pylint: disable=redefined-variable-type - best_match = other_face - - if best_match: - matched_faces.append(best_match) - similarities.append(best_similarity) - if best_similarity < self.similarity_threshold: - console.print(f"[yellow]Warning: Low similarity {best_similarity:.2f} < {self.similarity_threshold:.2f}[/yellow]") - else: - break - - if len(matched_faces) == len(all_faces): - avg_similarity = np.mean(similarities) if similarities else 1.0 - candidates.append( - CandidateFace( - faces=matched_faces, - avg_similarity=avg_similarity, - ), - ) - - if len(candidates) == 0: - if self.interactive: - return self.__ask_user_to_select_initial_face( - first_image_faces, - all_faces, - char_name, - reference_images, - ) - return None - if len(candidates) == 1: - return candidates[0].faces - if self.interactive: - return self.__ask_user_to_select_candidate(candidates, char_name) - candidates.sort(key=lambda c: c.avg_similarity, reverse=True) - return candidates[0].faces - - def __ask_user_to_select_candidate( - self, - candidates: List[CandidateFace], - char_name: str, - ) -> Optional[List[FaceData]]: - console.print(f"[yellow]Character: {char_name}[/yellow]") - console.print(f"[yellow]Found {len(candidates)} possible matches across all reference images.[/yellow]") - - for idx, candidate in enumerate(candidates, 1): - console.print(f"Candidate {idx}: avg similarity = {candidate.avg_similarity:.2f}") - - grid_path = self.__create_selection_grid(candidates, "candidates", char_name) - console.print(f"[blue]Grid image saved to: {grid_path}[/blue]") - - while True: - user_input = input(f"Select the correct character (1-{len(candidates)}) or skip (s): ").strip().lower() # pylint: disable=bad-builtin - - if user_input == 's': - return None - - try: - selection = int(user_input) - if 1 <= selection <= 
len(candidates): - return candidates[selection - 1].faces - console.print(f"[red]Invalid selection. Please enter 1-{len(candidates)} or 's'[/red]") - except ValueError: - console.print("[red]Invalid input. Please enter a number or 's'[/red]") - - def __ask_user_to_select_initial_face( - self, - first_image_faces: List[FaceData], - all_faces: List[List[FaceData]], - char_name: str, - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - console.print(f"[yellow]Character: {char_name}[/yellow]") - console.print("[yellow]No common face found across all reference images.[/yellow]") - console.print("[yellow]Manual selection mode: Please select the correct face from the first image.[/yellow]") - console.print(f"[yellow]Found {len(first_image_faces)} faces in first reference image.[/yellow]") - - grid_path = self.__create_selection_grid(first_image_faces, "manual", char_name) - console.print(f"[blue]Grid image saved to: {grid_path}[/blue]") - - while True: - user_input = input(f"Select the correct face (1-{len(first_image_faces)}) or skip (s): ").strip().lower() # pylint: disable=bad-builtin - - if user_input == 's': - return None - - try: - selection = int(user_input) - if 1 <= selection <= len(first_image_faces): - selected_face = first_image_faces[selection - 1] - return self.__find_matching_faces_for_reference( - selected_face.face_vector, - all_faces[1:], - [selected_face], - reference_images, - ) - console.print(f"[red]Invalid selection. Please enter 1-{len(first_image_faces)} or 's'[/red]") - except ValueError: - console.print("[red]Invalid input. 
Please enter a number or 's'[/red]") - - def __find_matching_faces_for_reference( - self, - reference_vector: np.ndarray, - remaining_images: List[List[FaceData]], - matched_faces: List[FaceData], - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - for img_idx, other_image_faces in enumerate(remaining_images, 1): - if not other_image_faces: - console.print( - f"[red]No faces found in image {img_idx + 1}: {reference_images[img_idx]}[/red]", - ) - return None - - best_match = None - best_sim = -1.0 - - for other_face in other_image_faces: - similarity = np.dot(reference_vector, other_face.face_vector) - if similarity > best_sim: - best_sim = similarity # pylint: disable=redefined-variable-type - best_match = other_face - - if best_match: - matched_faces.append(best_match) - if best_sim < self.similarity_threshold: - console.print( - f"[yellow]Warning: Low similarity in image {img_idx + 1}: {reference_images[img_idx]} " - f"(similarity: {best_sim:.2f} < threshold: {self.similarity_threshold:.2f})[/yellow]", - ) - else: - console.print( - f"[red]No faces detected in image {img_idx + 1}: {reference_images[img_idx]}[/red]", - ) - return None - - return matched_faces - - def __create_selection_grid( # pylint: disable=too-many-locals - self, - data, - mode: str, - char_name: str, - ) -> Path: - if mode == "candidates": - candidates = data - num_refs = len(candidates[0].faces) - num_candidates = len(candidates) - - face_size = 150 - padding = 10 - label_height = 30 - - grid_width = num_refs * (face_size + padding) + padding - grid_height = num_candidates * (face_size + label_height + padding) + padding + label_height - - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - - for col_idx in range(num_refs): - label = f"Ref {col_idx + 1}" - x = padding + col_idx * (face_size + padding) - cv2.putText( - grid, - label, - (x + 10, 20), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (0, 0, 0), - 1, - ) - - for cand_idx, candidate in enumerate(candidates): - 
y_base = label_height + padding + cand_idx * (face_size + label_height + padding) - - for face_idx, face_data in enumerate(candidate.faces): - x = padding + face_idx * (face_size + padding) - y = y_base - - face_resized = cv2.resize(face_data.face_img, (face_size, face_size)) - grid[y:y + face_size, x:x + face_size] = face_resized - - label = f"Candidate {cand_idx + 1}" - cv2.putText( - grid, - label, - (5, y_base + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (0, 0, 255), - 1, - ) - - else: - faces_data = data - num_faces = len(faces_data) - cols = min(3, num_faces) - rows = (num_faces + cols - 1) // cols - - face_size = 150 - padding = 10 - - grid_width = cols * (face_size + padding) + padding - grid_height = rows * (face_size + padding) + padding - - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - - for idx, face_data in enumerate(faces_data): - row = idx // cols - col = idx % cols - - x = padding + col * (face_size + padding) - y = padding + row * (face_size + padding) - - face_resized = cv2.resize(face_data.face_img, (face_size, face_size)) - grid[y:y + face_size, x:x + face_size] = face_resized - - label = str(idx + 1) - cv2.putText( - grid, - label, - (x + 5, y + 20), - cv2.FONT_HERSHEY_SIMPLEX, - 0.7, - (0, 0, 255), - 2, - ) - - selection_grids_dir = self.output_dir.parent / "character_selection_grids" - selection_grids_dir.mkdir(parents=True, exist_ok=True) - - output_path = selection_grids_dir / f"{char_name.replace(' ', '_').lower()}_selection.jpg" - cv2.imwrite(str(output_path), grid) - - return output_path - - def __save_processed_references( # pylint: disable=too-many-locals - self, - char_name: str, - selected_faces: List[FaceData], - reference_images: List[Path], - ) -> None: - char_output_dir = self.output_dir / char_name - char_output_dir.mkdir(parents=True, exist_ok=True) - - face_vectors = [] - for idx, face_data in enumerate(selected_faces): - face_normalized = cv2.resize( - face_data.face_img, - 
settings.character.normalized_face_size, - ) - - face_output_path = char_output_dir / f"face_{idx:02d}.jpg" - cv2.imwrite(str(face_output_path), face_normalized) - - face_vectors.append(face_data.face_vector) - - mean_vector = np.mean(face_vectors, axis=0) - vector_path = char_output_dir / "face_vector.npy" - np.save(vector_path, mean_vector) - - total_faces_detected = [] - for faces_list in self.__detect_faces_in_references(reference_images): - total_faces_detected.append(len(faces_list)) - - similarities = [] - if len(selected_faces) > 1: - for i in range(len(selected_faces) - 1): - similarity = np.dot(selected_faces[i].face_vector, selected_faces[i + 1].face_vector) - similarities.append(similarity) - - metadata = { - "character_name": char_name.replace("_", " ").title(), - "source_images": [str(img) for img in reference_images], - "processed_at": datetime.now().isoformat(), - "processing_params": { - "similarity_threshold": self.similarity_threshold, - "face_model": settings.face_recognition.model_name, - "normalized_face_size": list(settings.character.normalized_face_size), - }, - "detection_stats": { - "total_faces_detected": total_faces_detected, - "candidates_found": 1, - "selection_method": "automatic" if len(selected_faces) == len(reference_images) else "manual", - }, - "selected_face_indices": [face.source_image_idx for face in selected_faces], - "average_similarity": float(np.mean(similarities)) if similarities else 1.0, - "face_vector_dim": int(mean_vector.shape[0]), - } - - metadata_path = char_output_dir / "metadata.json" - with open(metadata_path, "w", encoding="utf-8") as f: - json.dump(metadata, f, indent=2, ensure_ascii=False) - - def _get_progress_description(self) -> str: - return "Processing character references" - - def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,too-many-statements - output_path = self.output_dir / "validation_grid.png" - - if output_path.exists(): - console.print(f"[dim]⊘ Skipping validation 
grid (already exists): {output_path}[/dim]") - return - - console.print("\n[blue]Generating validation grid...[/blue]") - - if not self.output_dir.exists(): - console.print("[yellow]No processed references found, skipping validation grid[/yellow]") - return - - processed_chars = sorted([d for d in self.output_dir.iterdir() if d.is_dir()]) - - if not processed_chars: - console.print("[yellow]No processed characters found, skipping validation grid[/yellow]") - return - - face_size = 280 - padding = 15 - row_height = face_size + padding * 2 - header_height = 180 - footer_height = 80 - label_col_width = 350 - stats_col_width = 200 - face_col_width = face_size + padding - faces_per_char = 3 - - grid_width = label_col_width + stats_col_width + (faces_per_char * face_col_width) + padding * 2 - grid_height = header_height + (len(processed_chars) * row_height) + footer_height - - bg_color = (250, 252, 255) - grid = np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) - - header_bg_color = (45, 55, 72) - cv2.rectangle(grid, (0, 0), (grid_width, header_height), header_bg_color, -1) - - title_text = "FACIAL REFERENCE VALIDATION REPORT" - cv2.putText( - grid, - title_text, - (padding * 3, 50), - cv2.FONT_HERSHEY_DUPLEX, - 1.1, - (255, 255, 255), - 2, - cv2.LINE_AA, - ) - - subtitle = "InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis" - cv2.putText( - grid, - subtitle, - (padding * 3, 85), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (200, 210, 220), - 1, - cv2.LINE_AA, - ) - - metadata_all = [] - for char_dir in processed_chars: - metadata_file = char_dir / "metadata.json" - if metadata_file.exists(): - with open(metadata_file, "r", encoding="utf-8") as f: - metadata_all.append(json.load(f)) - - total_chars = len(processed_chars) - avg_similarity = np.mean([m.get("average_similarity", 0) for m in metadata_all]) if metadata_all else 0 - threshold = self.similarity_threshold - - stats_y = 115 - stats_items = [ - f"Total Subjects: {total_chars}", - 
f"Avg Similarity: {avg_similarity:.4f}", - f"Threshold: {threshold:.2f}", - ] - - for idx, stat in enumerate(stats_items): - x_pos = padding * 3 + idx * 280 - cv2.putText( - grid, - stat, - (x_pos, stats_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (180, 200, 220), - 1, - cv2.LINE_AA, - ) - - table_header_y = header_height + 1 - cv2.line(grid, (0, table_header_y), (grid_width, table_header_y), (180, 190, 200), 2) - - col_headers = [ - ("CHARACTER NAME", label_col_width // 2, 0), - ("STATISTICS", label_col_width + stats_col_width // 2, 0), - ("REFERENCE IMAGE 1", label_col_width + stats_col_width + face_col_width // 2, 0), - ("REFERENCE IMAGE 2", label_col_width + stats_col_width + face_col_width * 3 // 2, 0), - ("REFERENCE IMAGE 3", label_col_width + stats_col_width + face_col_width * 5 // 2, 0), - ] - - header_row_height = 40 - for text, x_center, _ in col_headers: - text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] - text_x = x_center - text_size[0] // 2 - cv2.putText( - grid, - text, - (text_x, table_header_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.42, - (60, 70, 85), - 1, - cv2.LINE_AA, - ) - - cv2.line( - grid, - (0, table_header_y + header_row_height), - (grid_width, table_header_y + header_row_height), - (200, 210, 220), - 1, - ) - - y_offset = header_height + header_row_height + padding - - for idx, char_dir in enumerate(processed_chars): - char_name = char_dir.name.replace("_", " ").title() - metadata_file = char_dir / "metadata.json" - - if idx % 2 == 0: - row_bg = (245, 248, 252) - else: - row_bg = bg_color - - cv2.rectangle( - grid, - (0, y_offset - padding), - (grid_width, y_offset + face_size + padding), - row_bg, - -1, - ) - - cv2.putText( - grid, - char_name, - (padding * 2, y_offset + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (30, 40, 50), - 1, - cv2.LINE_AA, - ) - - if metadata_file.exists(): - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - similarity = 
metadata.get("average_similarity", 0.0) - method = metadata.get("detection_stats", {}).get("selection_method", "unknown") - faces_detected = metadata.get("detection_stats", {}).get("total_faces_detected", []) - - stats_x = label_col_width + padding - stats_y_base = y_offset + face_size // 2 - 30 - - sim_color = (0, 150, 0) if similarity >= threshold else (180, 100, 0) - cv2.putText( - grid, - f"Similarity: {similarity:.4f}", - (stats_x, stats_y_base), - cv2.FONT_HERSHEY_SIMPLEX, - 0.45, - sim_color, - 1, - cv2.LINE_AA, - ) - - method_color = (50, 120, 200) if method == "automatic" else (180, 100, 50) - cv2.putText( - grid, - f"Method: {method}", - (stats_x, stats_y_base + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.42, - method_color, - 1, - cv2.LINE_AA, - ) - - faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f"[{len(faces_detected)} imgs]" - cv2.putText( - grid, - f"Detected: {faces_str}", - (stats_x, stats_y_base + 50), - cv2.FONT_HERSHEY_SIMPLEX, - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, - ) - - face_files = sorted(char_dir.glob("face_*.jpg")) - - for face_idx, face_file in enumerate(face_files[:faces_per_char]): - face_img = cv2.imread(str(face_file)) - if face_img is None: - continue - - face_resized = cv2.resize(face_img, (face_size, face_size)) - - x = label_col_width + stats_col_width + face_idx * face_col_width + padding - y = y_offset - - grid[y:y + face_size, x:x + face_size] = face_resized - - border_color = (180, 190, 200) - cv2.rectangle( - grid, - (x - 1, y - 1), - (x + face_size + 1, y + face_size + 1), - border_color, - 1, - ) - - y_offset += row_height - - footer_y = grid_height - footer_height + 20 - cv2.line(grid, (0, footer_y - 20), (grid_width, footer_y - 20), (200, 210, 220), 1) - - footer_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " \ - f"Model: {settings.face_recognition.model_name} | " \ - f"Normalized Size: 
{settings.character.normalized_face_size[0]}x{settings.character.normalized_face_size[1]}px" - - cv2.putText( - grid, - footer_text, - (padding * 3, footer_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (120, 130, 140), - 1, - cv2.LINE_AA, - ) - - legend_y = footer_y + 30 - legend_items = [ - ("Automatic: Face found on all references", (50, 120, 200)), - ("Manual: User-selected reference", (180, 100, 50)), - ] - - for idx, (text, color) in enumerate(legend_items): - x_pos = padding * 3 + idx * 380 - cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) - cv2.putText( - grid, - text, - (x_pos + 15, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, - ) - - cv2.imwrite(str(output_path), grid, [cv2.IMWRITE_PNG_COMPRESSION, 6]) - - console.print(f"[green]✓ Validation grid saved to: {output_path}[/green]") - console.print(f"[green] Grid size: {grid_width}x{grid_height}px[/green]") - console.print(f"[green] Characters: {len(processed_chars)}[/green]") - console.print(f"[green] Average similarity: {avg_similarity:.4f}[/green]") diff --git a/preprocessor/characters/utils.py b/preprocessor/characters/utils.py deleted file mode 100644 index d5b0e3bb1..000000000 --- a/preprocessor/characters/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import warnings - -from insightface.app import FaceAnalysis -import onnxruntime as ort - -from preprocessor.config.config import settings -from preprocessor.utils.console import console - - -def init_face_detection() -> FaceAnalysis: - model_root = os.getenv("INSIGHTFACE_HOME", os.path.expanduser("~/.insightface")) - - available_providers = ort.get_available_providers() - console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") - - if 'CUDAExecutionProvider' not in available_providers: - console.print("[red]✗ CUDAExecutionProvider not available in onnxruntime[/red]") - console.print("[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]") - raise 
RuntimeError("CUDA provider not available in onnxruntime") - - providers = [ - ( - 'CUDAExecutionProvider', { - 'device_id': 0, - 'arena_extend_strategy': 'kNextPowerOfTwo', - 'gpu_mem_limit': 8 * 1024 * 1024 * 1024, - 'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - }, - ), - ] - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime") - warnings.filterwarnings("ignore", category=FutureWarning, module="insightface") - - console.print(f"[cyan]Loading {settings.face_recognition.model_name} face detection model (GPU-only)...[/cyan]") - - try: - face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) - face_app.prepare( - ctx_id=0, - det_size=settings.face_recognition.detection_size, - det_thresh=settings.character.face_detection_threshold, - ) - except Exception as e: - console.print("[red]✗ Failed to initialize face detection on GPU[/red]") - console.print(f"[red] Error: {e}[/red]") - console.print("[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]") - raise RuntimeError("GPU required but face detection initialization failed") from e - - actual_providers = face_app.models['detection'].session.get_providers() - - if 'CUDAExecutionProvider' not in actual_providers: - console.print("[red]✗ CUDA provider not active after initialization[/red]") - console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") - raise RuntimeError("CUDA required but not available for face detection") - - console.print(f"[green]✓ Face detection initialized ({settings.face_recognition.model_name})[/green]") - console.print("[dim] Device: GPU (CUDA)[/dim]") - console.print(f"[dim] Detection size: {settings.face_recognition.detection_size}[/dim]") - console.print(f"[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]") - console.print(f"[dim] Model cache: {model_root}[/dim]") - - return face_app 
diff --git a/preprocessor/cli/__init__.py b/preprocessor/cli/__init__.py index ee99d0a5d..f17535929 100644 --- a/preprocessor/cli/__init__.py +++ b/preprocessor/cli/__init__.py @@ -1,72 +1,3 @@ -import click +from preprocessor.cli.cli_main import cli -from preprocessor.cli.commands import ( - analyze_text, - detect_scenes, - export_frames, - fix_unicode, - generate_archives, - generate_elastic_documents, - generate_embeddings, - image_hashing, - import_transcriptions, - index, - process_character_references, - run_all, - scrape_episodes, - search, - separate_sounds, - transcode, - transcribe, - transcribe_elevenlabs, - validate, -) - - -@click.group() -@click.help_option("-h", "--help") -def cli(): - """Preprocessor CLI for video processing pipeline.""" - - -# noinspection PyTypeChecker -cli.add_command(transcode) -# noinspection PyTypeChecker -cli.add_command(transcribe) -# noinspection PyTypeChecker -cli.add_command(index) -# noinspection PyTypeChecker -cli.add_command(import_transcriptions) -# noinspection PyTypeChecker -cli.add_command(transcribe_elevenlabs) -# noinspection PyTypeChecker -cli.add_command(scrape_episodes) -# noinspection PyTypeChecker -cli.add_command(detect_scenes) -# noinspection PyTypeChecker -cli.add_command(export_frames) -# noinspection PyTypeChecker -cli.add_command(image_hashing) -# noinspection PyTypeChecker -cli.add_command(generate_embeddings) -# noinspection PyTypeChecker -cli.add_command(generate_elastic_documents) -# noinspection PyTypeChecker -cli.add_command(generate_archives) -# noinspection PyTypeChecker -cli.add_command(search) -# noinspection PyTypeChecker -cli.add_command(run_all) -# noinspection PyTypeChecker -cli.add_command(validate) -# noinspection PyTypeChecker -cli.add_command(analyze_text) -# noinspection PyTypeChecker -cli.add_command(fix_unicode) -# noinspection PyTypeChecker -cli.add_command(separate_sounds) -# noinspection PyTypeChecker -cli.add_command(process_character_references) - - -__all__ = ["cli"] +__all__ 
= ['cli'] diff --git a/preprocessor/cli/__main__.py b/preprocessor/cli/__main__.py index 3386182ec..8a28a7810 100644 --- a/preprocessor/cli/__main__.py +++ b/preprocessor/cli/__main__.py @@ -1,4 +1,4 @@ from preprocessor.cli import cli -if __name__ == "__main__": +if __name__ == '__main__': cli() diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py new file mode 100644 index 000000000..efcd0dfcb --- /dev/null +++ b/preprocessor/cli/cli_main.py @@ -0,0 +1,351 @@ +import asyncio +import json +from pathlib import Path +import sys +from typing import ( + List, + Tuple, +) + +import click +from click import Command +from elasticsearch import AsyncElasticsearch + +from preprocessor.app.pipeline_builder import PipelineExecutor +from preprocessor.app.pipeline_factory import ( + build_pipeline, + visualize, +) +from preprocessor.cli.helpers import setup_pipeline_context +from preprocessor.cli.search_handler import ( + SearchCommandHandler, + SearchFilters, +) +from preprocessor.cli.search_params import ( + SearchActionParams, + SearchConfig, + SearchQueryParams, +) +from preprocessor.cli.skip_list_builder import SkipListBuilder +from preprocessor.config.series_config import SeriesConfig +from preprocessor.core.state_reconstruction import StateReconstructor +from preprocessor.services.episodes.types import EpisodeInfo +from preprocessor.services.io.path_service import PathService +from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.services.search.clients.embedding_service import EmbeddingService + + +@click.group() +@click.help_option("-h", "--help") +def cli() -> None: + pass + + +@cli.command(name="visualize") +@click.option("--series", default="ranczo", help="Series name (e.g., ranczo)") +def __visualize_command(series: str) -> None: + visualize(series) + + +@cli.command(name="run-all") +@click.option("--series", required=True, help="Series name (e.g., ranczo)") +@click.option("--force-rerun", 
is_flag=True, help="Force rerun even if cached") +@click.option( + "--skip", + multiple=True, + help="Step IDs to skip (e.g., --skip transcode --skip detect_scenes)", +) +def __run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: + series_config = SeriesConfig.load(series) + pipeline = build_pipeline(series) + setup = setup_pipeline_context(series, "__run_all", force_rerun, with_episode_manager=True) + + try: + skip_list = SkipListBuilder.build(skip, series_config, setup.logger) + plan = pipeline.get_execution_order(skip=skip_list) + + source_path = PathService.get_input_base() / series + + setup.logger.info(f"Execution plan: {' -> '.join(plan)}") + setup.logger.info(f"Source: {source_path}") + + executor = PipelineExecutor(setup.context) + executor.execute_steps( + pipeline=pipeline, + step_ids=plan, + source_path=source_path, + episode_manager=setup.episode_manager, + ) + + setup.logger.info("=" * 80) + setup.logger.info("Pipeline completed successfully!") + except KeyboardInterrupt: + setup.logger.info("\nInterrupted by user") + raise + finally: + setup.logger.finalize() + + +def __load_episodes_from_json(episodes_file: Path) -> List[EpisodeInfo]: + with open(episodes_file, 'r', encoding='utf-8') as f: + episodes_data = json.load(f) + + episodes_list = [] + for season_data in episodes_data.get('seasons', []): + season_num = season_data['season_number'] + for ep_data in season_data.get('episodes', []): + episode_info = EpisodeInfo( + season=season_num, + relative_episode=ep_data['episode_in_season'], + absolute_episode=ep_data['overall_episode_number'], + title=ep_data.get('title', ''), + premiere_date=ep_data.get('premiere_date'), + ) + episodes_list.append(episode_info) + + return episodes_list + + +@cli.command(name="sync-state") +@click.option("--series", required=True, help="Series name (e.g., ranczo)") +def __sync_state(series: str) -> None: + pipeline = build_pipeline(series) + setup = setup_pipeline_context(series, "sync_state", 
force_rerun=False, with_episode_manager=True) + + try: + episodes_file = setup.context.base_output_dir / f'{series}_episodes.json' + + if not episodes_file.exists(): + setup.logger.error(f'Episodes file not found: {episodes_file}') + setup.logger.error('Run scraping steps first to generate episodes.json') + sys.exit(1) + + setup.logger.info(f'Loading episodes from {episodes_file}') + episodes_list = __load_episodes_from_json(episodes_file) + setup.logger.info(f'Found {len(episodes_list)} episodes') + + completed_steps = StateReconstructor.scan_filesystem( + pipeline=pipeline, + episodes_list=episodes_list, + base_output_dir=setup.context.base_output_dir, + series_name=series, + ) + + setup.context.state_manager.rebuild_state(completed_steps) + setup.logger.info('State synchronization completed!') + except Exception as e: + setup.logger.error(f'Failed to sync state: {e}') + raise + finally: + setup.logger.finalize() + + +def __create_step_command(step_id: str, step_description: str) -> Command: + @click.command(name=step_id.replace("_", "-"), help=f"{step_description}") + @click.option("--series", required=True, help="Series name (e.g., ranczo)") + @click.option("--force-rerun", is_flag=True, help="Force rerun even if cached") + def __step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> None: + pipeline = build_pipeline(series) + setup = setup_pipeline_context(series, _step_id, force_rerun, with_episode_manager=True) + + try: + step = pipeline.get_step(_step_id) + deps = step.dependency_ids + + if deps: + setup.logger.info(f"Dependencies: {', '.join(deps)}") + for dep_id in deps: + if not setup.context.state_manager.is_step_completed(dep_id, "*"): + setup.logger.warning( + f"Dependency '{dep_id}' may not be completed. 
" + f"Run it first or use --force-rerun.", + ) + + source_path = PathService.get_input_base() / series + + executor = PipelineExecutor(setup.context) + executor.execute_step( + pipeline=pipeline, + step_id=_step_id, + source_path=source_path, + episode_manager=setup.episode_manager, + ) + + setup.logger.info(f"Step '{_step_id}' completed successfully") + except KeyboardInterrupt: + setup.logger.info("\nInterrupted by user") + raise + finally: + setup.logger.finalize() + + return __step_command + + +@cli.command(name="analyze-resolution") +@click.option("--series", required=True, help="Series name (e.g., ranczo, kiepscy)") +def __analyze_resolution(series: str) -> None: + pipeline = build_pipeline(series) + setup = setup_pipeline_context(series, "resolution_analysis", False, with_episode_manager=False) + + try: + step = pipeline.get_step("resolution_analysis") + step.execute(None, setup.context) + + setup.logger.info("Resolution analysis completed") + except KeyboardInterrupt: + setup.logger.info("\nInterrupted by user") + raise + finally: + setup.logger.finalize() + + +def __execute_search_command(config: SearchConfig) -> None: # pylint: disable=too-many-statements # Complex async search setup - splitting would reduce readability + series_config = SeriesConfig.load(config.series) + index_base = series_config.indexing.elasticsearch.index_name + + hash_value = None + if config.query.phash: + hash_value = SearchCommandHandler.compute_perceptual_hash(config.query.phash) + if hash_value is None: + sys.exit(1) + + async def __run_async_search() -> None: + es_client = AsyncElasticsearch(hosts=[config.host], verify_certs=False) + + try: + await es_client.ping() + except Exception: + click.echo(f"Cannot connect to Elasticsearch at {config.host}", err=True) + click.echo("Make sure Elasticsearch is running:", err=True) + click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) + sys.exit(1) + + embedding_svc = EmbeddingService() + queries = 
ElasticsearchQueries(embedding_svc, index_base) + + try: + handler = SearchCommandHandler(es_client, embedding_svc, queries, config.json_output) + + result = None + if config.actions.stats: + result = await handler.handle_stats() + elif config.actions.list_chars_flag: + result = await handler.handle_list_characters() + elif config.actions.list_objects_flag: + result = await handler.handle_list_objects() + elif config.query.text: + result = await handler.handle_text_search(config.query.text, config.filters) + elif config.query.text_semantic: + result = await handler.handle_text_semantic_search(config.query.text_semantic, config.filters) + elif config.query.text_to_video: + result = await handler.handle_text_to_video_search(config.query.text_to_video, config.filters) + elif config.query.image: + result = await handler.handle_image_search(config.query.image, config.filters) + elif config.query.emotion: + result = await handler.handle_emotion_search(config.query.emotion, config.filters) + elif config.query.character: + result = await handler.handle_character_search(config.query.character, config.filters) + elif config.query.object_query: + result = await handler.handle_object_search(config.query.object_query, config.filters) + elif hash_value: + result = await handler.handle_hash_search(hash_value, config.filters) + elif config.query.episode_name: + result = await handler.handle_episode_name_search(config.query.episode_name, config.filters) + elif config.query.episode_name_semantic: + result = await handler.handle_episode_name_semantic_search( + config.query.episode_name_semantic, config.filters, + ) + + if result: + click.echo(result) + + finally: + embedding_svc.cleanup() + await es_client.close() + + asyncio.run(__run_async_search()) + + +@cli.command(name="search") +@click.option("--series", required=True, help="Series name (e.g., ranczo, kiepscy)") +@click.option("--text", type=str, help="Full-text search by transcriptions") +@click.option("--text-semantic", 
type=str, help="Semantic search by text embeddings") +@click.option("--text-to-video", type=str, help="Cross-modal search: text query in video embeddings") +@click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search by video embeddings") +@click.option("--hash", "phash", type=str, help="Search by perceptual hash (provide hash string or image path)") +@click.option("--character", type=str, help="Search by character") +@click.option( + "--emotion", type=str, + help="Search by emotion (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)", +) +@click.option( + "--object", "object_query", type=str, + help="Search by detected objects (e.g., 'dog', 'person:5+', 'chair:2-4')", +) +@click.option("--episode-name", type=str, help="Fuzzy search by episode names") +@click.option("--episode-name-semantic", type=str, help="Semantic search by episode names") +@click.option("--list-characters", "list_chars_flag", is_flag=True, help="List all characters") +@click.option("--list-objects", "list_objects_flag", is_flag=True, help="List all object classes") +@click.option("--season", type=int, help="Filter by season") +@click.option("--episode", type=int, help="Filter by episode") +@click.option("--limit", type=int, default=20, help="Result limit") +@click.option("--stats", is_flag=True, help="Show index statistics") +@click.option("--json-output", is_flag=True, help="Output in JSON format") +@click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") +def search( # pylint: disable=too-many-arguments,too-many-locals # CLI command with many options - cannot refactor without breaking Click interface + series: str, + text: str, + text_semantic: str, + text_to_video: str, + image: Path, + phash: str, + character: str, + emotion: str, + object_query: str, + episode_name: str, + episode_name_semantic: str, + list_chars_flag: bool, + list_objects_flag: bool, + season: int, + episode: int, + limit: int, + 
stats: bool, + json_output: bool, + host: str, +) -> None: + config = SearchConfig( + series=series, + query=SearchQueryParams( + text=text, + text_semantic=text_semantic, + text_to_video=text_to_video, + image=image, + phash=phash, + character=character, + emotion=emotion, + object_query=object_query, + episode_name=episode_name, + episode_name_semantic=episode_name_semantic, + ), + filters=SearchFilters(season, episode, character, limit), + actions=SearchActionParams(list_chars_flag, list_objects_flag, stats), + json_output=json_output, + host=host, + ) + + if not config.has_any_operation(): + click.echo("Provide at least one search option. Use --help", err=True) + sys.exit(1) + + __execute_search_command(config) + + +_CLI_TEMPLATE_SERIES = "ranczo" +_cli_pipeline = build_pipeline(_CLI_TEMPLATE_SERIES) + +for _step_id, _step in _cli_pipeline.get_all_steps().items(): + command_func = __create_step_command(_step_id, _step.description) + cli.add_command(command_func) + +if __name__ == "__main__": + cli() diff --git a/preprocessor/cli/commands/__init__.py b/preprocessor/cli/commands/__init__.py deleted file mode 100644 index 842404862..000000000 --- a/preprocessor/cli/commands/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -from preprocessor.cli.commands.analyze_text import analyze_text -from preprocessor.cli.commands.detect_scenes import detect_scenes -from preprocessor.cli.commands.export_frames import export_frames -from preprocessor.cli.commands.fix_unicode import fix_unicode -from preprocessor.cli.commands.generate_archives import generate_archives -from preprocessor.cli.commands.generate_elastic_documents import generate_elastic_documents -from preprocessor.cli.commands.generate_embeddings import generate_embeddings -from preprocessor.cli.commands.image_hashing import image_hashing -from preprocessor.cli.commands.import_transcriptions import import_transcriptions -from preprocessor.cli.commands.index import index -from 
preprocessor.cli.commands.process_character_references import process_character_references -from preprocessor.cli.commands.run_all import run_all -from preprocessor.cli.commands.scrape_episodes import scrape_episodes -from preprocessor.cli.commands.search import search -from preprocessor.cli.commands.separate_sounds import separate_sounds -from preprocessor.cli.commands.transcode import transcode -from preprocessor.cli.commands.transcribe import transcribe -from preprocessor.cli.commands.transcribe_elevenlabs import transcribe_elevenlabs -from preprocessor.cli.commands.validate import validate - -__all__ = [ - "analyze_text", - "detect_scenes", - "export_frames", - "fix_unicode", - "generate_archives", - "generate_elastic_documents", - "generate_embeddings", - "image_hashing", - "import_transcriptions", - "index", - "process_character_references", - "run_all", - "scrape_episodes", - "search", - "separate_sounds", - "transcode", - "transcribe", - "transcribe_elevenlabs", - "validate", -] diff --git a/preprocessor/cli/commands/analyze_text.py b/preprocessor/cli/commands/analyze_text.py deleted file mode 100644 index 78e9b330c..000000000 --- a/preprocessor/cli/commands/analyze_text.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.text_analysis.text_analyzer import TextAnalyzer - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--season", - type=str, - help="Season to analyze (e.g., S10). If not provided, analyzes all seasons", -) -@click.option( - "--episode", - type=str, - help="Episode to analyze (e.g., E01). Requires --season. 
If not provided, analyzes all episodes in season", -) -@click.option( - "--language", - type=str, - default="pl", - help="Language code for analysis (pl or en)", -) -@click.option( - "--series-name", - type=str, - default="ranczo", - help="Series name for file naming", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata (optional)", -) -def analyze_text( - season: str, - episode: str, - language: str, - series_name: str, - episodes_info_json: Path, -): - """Analyze transcription texts and generate linguistic statistics.""" - if episode and not season: - click.echo("Error: --episode requires --season to be specified") - sys.exit(1) - - analyzer = TextAnalyzer( - { - "series_name": series_name, - "episodes_info_json": episodes_info_json, - "language": language, - "state_manager": None, - }, - ) - - exit_code = analyzer.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/detect_scenes.py b/preprocessor/cli/commands/detect_scenes.py deleted file mode 100644 index 9a35ddbd2..000000000 --- a/preprocessor/cli/commands/detect_scenes.py +++ /dev/null @@ -1,45 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli_utils.resource_scope import ResourceScope -from preprocessor.config.config import settings -from preprocessor.video.scene_detector import SceneDetector - - -@click.command(name="detect-scenes", context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, path_type=Path)) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=str(settings.scene_detection.output_dir), - help="Output directory for scene JSON files", -) -@click.option( - "--threshold", - type=float, - default=settings.scene_detection.threshold, - help="Scene detection threshold 0.0-1.0", -) -@click.option( - "--min-scene-len", - type=int, - default=settings.scene_detection.min_scene_len, - help="Minimum scene length 
in frames", -) -def detect_scenes(videos: Path, output_dir: Path, threshold: float, min_scene_len: int): - """Detect scene changes in videos using TransNetV2.""" - with ResourceScope(): - detector = SceneDetector( - { - "videos": videos, - "output_dir": output_dir, - "threshold": threshold, - "min_scene_len": min_scene_len, - }, - ) - exit_code = detector.work() - detector.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/export_frames.py b/preprocessor/cli/commands/export_frames.py deleted file mode 100644 index d5c38edbe..000000000 --- a/preprocessor/cli/commands/export_frames.py +++ /dev/null @@ -1,67 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings -from preprocessor.utils.resolution import Resolution -from preprocessor.video.frame_exporter import FrameExporter - - -@click.command(context_settings={"show_default": True}) -@click.argument("transcoded_videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--scene-timestamps-dir", - type=click.Path(exists=True, path_type=Path), - default=str(settings.scene_detection.output_dir), - help="Directory with scene timestamps", -) -@click.option( - "--output-frames", - type=click.Path(path_type=Path), - default=str(settings.frame_export.output_dir), - help="Output directory for exported frames", -) -@click.option( - "--resolution", - type=click.Choice(Resolution.get_all_choices()), - default="1080p", - help="Target resolution for exported frames", -) -@click.option("--name", required=True, help="Series name") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def export_frames( - transcoded_videos: Path, - episodes_info_json: Path, - 
scene_timestamps_dir: Path, - output_frames: Path, - resolution: str, - name: str, - no_state: bool, -): - """Export keyframes at target resolution based on configured keyframe strategy.""" - state_manager = create_state_manager(name, no_state) - - res = Resolution.from_str(resolution) - - exporter = FrameExporter( - { - "transcoded_videos": transcoded_videos, - "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": output_frames, - "resolution": res, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - - exit_code = exporter.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/fix_unicode.py b/preprocessor/cli/commands/fix_unicode.py deleted file mode 100644 index f7ad56227..000000000 --- a/preprocessor/cli/commands/fix_unicode.py +++ /dev/null @@ -1,45 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli_utils.resource_scope import ResourceScope -from preprocessor.config.config import settings -from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--transcription-jsons", - type=click.Path(exists=True, path_type=Path), - default=str(settings.transcription.output_dir), - help="Directory with transcription JSON files", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--name", - required=True, - help="Series name", -) -def fix_unicode( - transcription_jsons: Path, - episodes_info_json: Path, - name: str, -): - """Fix unicode escape sequences in transcription files.""" - args = { - "transcription_jsons": transcription_jsons, - "episodes_info_json": episodes_info_json, - "name": name, - } - - with ResourceScope(): - fixer = TranscriptionUnicodeFixer(args) - exit_code = fixer.work() - - sys.exit(exit_code) diff --git 
a/preprocessor/cli/commands/generate_archives.py b/preprocessor/cli/commands/generate_archives.py deleted file mode 100644 index 9d59381db..000000000 --- a/preprocessor/cli/commands/generate_archives.py +++ /dev/null @@ -1,79 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.options.common import ( - episodes_info_option, - name_option, -) -from preprocessor.config.config import ( - BASE_OUTPUT_DIR, - settings, -) -from preprocessor.indexing.archive_generator import ArchiveGenerator - - -@click.command(name="generate-archives", context_settings={"show_default": True}) -@click.option( - "--elastic-documents-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - default=BASE_OUTPUT_DIR / settings.output_subdirs.elastic_documents, - help="Directory with Elasticsearch documents", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=BASE_OUTPUT_DIR / settings.output_subdirs.archives, - help="Output directory for ZIP archives", -) -@click.option( - "--season", - type=int, - help="Process only specific season", -) -@click.option( - "--episode", - type=int, - help="Process only specific episode (requires --season)", -) -@click.option( - "--force-regenerate", - is_flag=True, - help="Force regenerate existing archives", -) -@click.option( - "--allow-partial", - is_flag=True, - help="Create archives even if not all 5 files are present (default: skip incomplete episodes)", -) -@name_option() -@episodes_info_option(required=False) -def generate_archives( - elastic_documents_dir: Path, - output_dir: Path, - season: int, - episode: int, - force_regenerate: bool, - allow_partial: bool, - name: str, - episodes_info_json: Path, -) -> None: - args = { - "elastic_documents_dir": elastic_documents_dir, - "output_dir": output_dir, - "series_name": name, - "episodes_info_json": episodes_info_json, - "force_regenerate": force_regenerate, - "allow_partial": allow_partial, - } - - if season: - 
args["season_filter"] = season - if episode: - args["episode_filter"] = episode - - generator = ArchiveGenerator(args) - exit_code = generator.work() - if exit_code != 0: - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/generate_elastic_documents.py b/preprocessor/cli/commands/generate_elastic_documents.py deleted file mode 100644 index 3a7113e68..000000000 --- a/preprocessor/cli/commands/generate_elastic_documents.py +++ /dev/null @@ -1,72 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.options.common import ( - episodes_info_option, - name_option, -) -from preprocessor.indexing.elastic_document_generator import ElasticDocumentGenerator - - -@click.command(name="generate-elastic-documents", context_settings={"show_default": True}) -@click.option( - "--transcription-jsons", - type=click.Path(exists=True, file_okay=False, path_type=Path), - required=True, - help="Directory with transcription JSON files", -) -@click.option( - "--embeddings-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with embedding files", -) -@click.option( - "--scene-timestamps-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with scene timestamp files", -) -@click.option( - "--character-detections-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with character detection files", -) -@click.option( - "--object-detections-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with object detection files", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default="/app/output_data/elastic_documents", - help="Output directory", -) -@name_option() -@episodes_info_option(required=False) -def generate_elastic_documents( - transcription_jsons: Path, - embeddings_dir: Path, - scene_timestamps_dir: Path, - character_detections_dir: Path, - object_detections_dir: Path, - 
output_dir: Path, - name: str, - episodes_info_json: Path, -) -> None: - args = { - "transcription_jsons": transcription_jsons, - "embeddings_dir": embeddings_dir, - "scene_timestamps_dir": scene_timestamps_dir, - "character_detections_dir": character_detections_dir, - "object_detections_dir": object_detections_dir, - "output_dir": output_dir, - "series_name": name, - "episodes_info_json": episodes_info_json, - } - - generator = ElasticDocumentGenerator(args) - exit_code = generator.work() - if exit_code != 0: - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/generate_embeddings.py b/preprocessor/cli/commands/generate_embeddings.py deleted file mode 100644 index 9833377ba..000000000 --- a/preprocessor/cli/commands/generate_embeddings.py +++ /dev/null @@ -1,137 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli_utils.resource_scope import ResourceScope -from preprocessor.config.config import settings -from preprocessor.embeddings.embedding_generator import EmbeddingGenerator - - -@click.command(name="generate-embeddings", context_settings={"show_default": True}) -@click.option( - "--transcription-jsons", - type=click.Path(exists=True, file_okay=False, path_type=Path), - required=True, - help="Directory with transcription JSON files", -) -@click.option( - "--frames-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - default=str(settings.frame_export.output_dir), - help="Directory with exported frames", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=str(settings.embedding.default_output_dir), - help="Output directory", -) -@click.option( - "--image-hashes-dir", - type=click.Path(path_type=Path), - default=str(settings.image_hash.output_dir), - help="Directory with image hashes", -) -@click.option( - "--model", - default=settings.embedding_model.model_name, - help="Model name", -) -@click.option( - "--segments-per-embedding", - type=int, - 
default=settings.text_chunking.segments_per_embedding, - help="Segments to group for text embeddings", -) -@click.option( - "--generate-text/--no-text", - default=True, - help="Generate text embeddings", -) -@click.option( - "--generate-video/--no-video", - default=True, - help="Generate video embeddings", -) -@click.option( - "--generate-episode-names/--no-episode-names", - default=True, - help="Generate episode name embeddings", -) -@click.option( - "--generate-full-episode/--no-full-episode", - default=True, - help="Generate full episode embeddings", -) -@click.option( - "--generate-sound-events/--no-sound-events", - default=True, - help="Generate sound event embeddings", -) -@click.option( - "--device", - type=click.Choice(["cuda"]), - default="cuda", - help="Device: cuda (GPU only)", -) -@click.option( - "--batch-size", - type=int, - default=settings.embedding.batch_size, - help="Batch size for GPU inference. Reduce if OOM errors occur", -) -@click.option( - "--sentences-per-chunk", - type=int, - default=settings.text_chunking.text_sentences_per_chunk, - help="Number of sentences per chunk (only for --sentence-chunking)", -) -@click.option( - "--chunk-overlap", - type=int, - default=settings.text_chunking.text_chunk_overlap, - help="Number of overlapping sentences between chunks (only for --sentence-chunking)", -) -def generate_embeddings( # pylint: disable=too-many-arguments - transcription_jsons: Path, - frames_dir: Path, - output_dir: Path, - image_hashes_dir: Path, - model: str, - segments_per_embedding: int, - generate_text: bool, - generate_video: bool, - generate_episode_names: bool, - generate_full_episode: bool, - generate_sound_events: bool, - device: str, - batch_size: int, - sentences_per_chunk: int, - chunk_overlap: int, -): - """Generate text and video embeddings from transcriptions and exported frames.""" - with ResourceScope(): - generator = EmbeddingGenerator( - { - "transcription_jsons": transcription_jsons, - "frames_dir": frames_dir, - 
"output_dir": output_dir, - "image_hashes_dir": image_hashes_dir, - "model": model, - "segments_per_embedding": segments_per_embedding, - "generate_text": generate_text, - "generate_video": generate_video, - "generate_episode_names": generate_episode_names, - "generate_full_episode": generate_full_episode, - "generate_sound_events": generate_sound_events, - "device": device, - "batch_size": batch_size, - "text_sentences_per_chunk": sentences_per_chunk, - "text_chunk_overlap": chunk_overlap, - }, - ) - exit_code = generator.work() - generator.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/image_hashing.py b/preprocessor/cli/commands/image_hashing.py deleted file mode 100644 index 1b80a8251..000000000 --- a/preprocessor/cli/commands/image_hashing.py +++ /dev/null @@ -1,63 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings -from preprocessor.hashing.image_hash_processor import ImageHashProcessor - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--frames-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - default=str(settings.frame_export.output_dir), - help="Directory with exported frames", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=str(settings.image_hash.output_dir), - help="Output directory for image hashes", -) -@click.option( - "--batch-size", - type=int, - default=settings.embedding.batch_size, - help="Batch size for processing", -) -@click.option("--name", required=True, help="Series name") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def image_hashing( - frames_dir: Path, - episodes_info_json: Path, - output_dir: Path, - 
batch_size: int, - name: str, - no_state: bool, -): - """Generate perceptual hashes for exported frames.""" - state_manager = create_state_manager(name, no_state) - - hasher = ImageHashProcessor( - { - "frames_dir": frames_dir, - "output_dir": output_dir, - "batch_size": batch_size, - "device": "cuda", - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - - exit_code = hasher.work() - hasher.cleanup() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/import_transcriptions.py b/preprocessor/cli/commands/import_transcriptions.py deleted file mode 100644 index 99af5534d..000000000 --- a/preprocessor/cli/commands/import_transcriptions.py +++ /dev/null @@ -1,66 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings -from preprocessor.transcription.importer import TranscriptionImporter -from preprocessor.utils.console import console - - -@click.command(name="import-transcriptions", context_settings={"show_default": True}) -@click.option( - "--source-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - required=True, - help="Directory with source transcriptions (11labs format)", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), - help="Output directory for converted transcriptions", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata", -) -@click.option("--name", required=True, help="Series name") -@click.option( - "--format-type", - type=click.Choice(["11labs_segmented", "11labs"]), - default="11labs_segmented", - help="Source format: 11labs_segmented or 11labs", -) -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def import_transcriptions( - source_dir: Path, - 
output_dir: Path, - episodes_info_json: Path, - name: str, - format_type: str, - no_state: bool, -): - """Import and convert transcriptions from external sources.""" - state_manager = create_state_manager(name, no_state) - - importer = TranscriptionImporter( - { - "source_dir": source_dir, - "output_dir": output_dir, - "episodes_info_json": episodes_info_json, - "series_name": name, - "format_type": format_type, - "state_manager": state_manager, - }, - ) - - exit_code = importer.work() - - if state_manager and exit_code == 0: - console.print("[green]Import completed successfully![/green]") - state_manager.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/index.py b/preprocessor/cli/commands/index.py deleted file mode 100644 index 29aea19d1..000000000 --- a/preprocessor/cli/commands/index.py +++ /dev/null @@ -1,29 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.config.config import settings -from preprocessor.indexing.elasticsearch import ElasticSearchIndexer - - -@click.command() -@click.option("--name", required=True, help="Elasticsearch index name") -@click.option( - "--elastic-documents-dir", - type=click.Path(exists=True, path_type=Path), - default=str(settings.elastic_documents.output_dir) if hasattr(settings, 'elastic_documents') else "/app/output_data/elastic_documents", - help="Directory with generated elastic documents", -) -@click.option("--dry-run", is_flag=True, help="Validate without sending to Elasticsearch") -@click.option("--append", is_flag=True, help="Append to existing indices instead of recreating") -def index(name: str, elastic_documents_dir: Path, dry_run: bool, append: bool): - """Index documents into Elasticsearch (creates 3 indices: segments, text_embeddings, video_frames).""" - indexer = ElasticSearchIndexer({ - "name": name, - "elastic_documents_dir": elastic_documents_dir, - "dry_run": dry_run, - "append": append, - }) - exit_code = indexer.work() - sys.exit(exit_code) diff --git 
a/preprocessor/cli/commands/process_character_references.py b/preprocessor/cli/commands/process_character_references.py deleted file mode 100644 index 4b9d1a432..000000000 --- a/preprocessor/cli/commands/process_character_references.py +++ /dev/null @@ -1,60 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.characters.reference_processor import CharacterReferenceProcessor -from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--characters-dir", - type=click.Path(exists=True, path_type=Path), - default=str(settings.character.output_dir), - help="Directory with character reference images", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=str(settings.character.processed_references_dir), - help="Output directory for processed references", -) -@click.option( - "--similarity-threshold", - type=float, - default=settings.character.reference_matching_threshold, - help="Threshold for face similarity when matching between reference images", -) -@click.option( - "--interactive/--no-interactive", - default=True, - help="Enable interactive mode for ambiguous cases", -) -@click.option("--name", required=True, help="Series name") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def process_character_references( - characters_dir: Path, - output_dir: Path, - similarity_threshold: float, - interactive: bool, - name: str, - no_state: bool, -): - """Process character reference images to identify and extract common faces.""" - state_manager = create_state_manager(name, no_state) - - processor = CharacterReferenceProcessor( - { - "characters_dir": characters_dir, - "output_dir": output_dir, - "similarity_threshold": similarity_threshold, - "interactive": interactive, - "series_name": name, - "state_manager": state_manager, - }, - ) - - 
exit_code = processor.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/run_all.py b/preprocessor/cli/commands/run_all.py deleted file mode 100644 index 76cda6071..000000000 --- a/preprocessor/cli/commands/run_all.py +++ /dev/null @@ -1,297 +0,0 @@ -from pathlib import Path -import sys -from typing import Tuple - -import click - -from preprocessor.cli.pipeline.orchestrator import PipelineOrchestrator -from preprocessor.cli.pipeline.steps import ( - run_archive_generation_step, - run_character_reference_download_step, - run_character_reference_processing_step, - run_character_scrape_step, - run_elastic_documents_step, - run_embedding_step, - run_frame_export_step, - run_frame_processing_step, - run_index_step, - run_scene_step, - run_scrape_step, - run_sound_separation_step, - run_text_analysis_step, - run_transcode_step, - run_transcribe_step, - run_validation_step, -) -from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings -from preprocessor.utils.console import console -from preprocessor.utils.resolution import Resolution - - -@click.command(context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--episodes-info-json", - type=click.Path(path_type=Path), - help="JSON file with episode metadata (required if not using --scrape-urls)", -) -@click.option( - "--transcoded-videos", - type=click.Path(path_type=Path), - help="Output directory for transcoded videos", -) -@click.option( - "--transcription-jsons", - type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), - help="Output directory for transcription JSONs", -) -@click.option( - "--scene-timestamps-dir", - type=click.Path(path_type=Path), - default=str(settings.scene_detection.output_dir), - help="Output directory for scene timestamps", -) -@click.option("--series-name", required=True, help="Series name") -@click.option( - 
"--resolution", - type=click.Choice(Resolution.get_all_choices()), - default="720p", - help="Target resolution for transcoding", -) -@click.option( - "--codec", - help="Video codec", -) -@click.option( - "--model", - default=settings.transcription.model, - help="Whisper model", -) -@click.option( - "--language", - default=settings.transcription.language, - help="Language for transcription", -) -@click.option("--dry-run", is_flag=True, help="Dry run for Elasticsearch indexing") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -@click.option( - "--ramdisk-path", - type=click.Path(path_type=Path), - help="Path to ramdisk for temporary files (e.g., /mnt/ramdisk)", -) -@click.option( - "--scrape-urls", - multiple=True, - help="URLs to scrape episode metadata from (Step 0a: optional)", -) -@click.option( - "--character-urls", - multiple=True, - help="URLs to scrape character metadata from (Step 0b: optional)", -) -@click.option( - "--search-mode", - type=click.Choice(["normal", "premium"]), - default="normal", - help="Image search mode: normal (DuckDuckGo) or premium (Google Images API)", -) -@click.option( - "--transcription-mode", - type=click.Choice(["normal", "premium"]), - default="normal", - help="Transcription mode: normal (Whisper) or premium (ElevenLabs API)", -) -@click.option( - "--parser-mode", - type=click.Choice(["normal", "premium"]), - default="normal", - help="Parser mode: normal (Qwen local model) or premium (Gemini 2.5 Flash)", -) -@click.option( - "--skip-character-reference-processing", - is_flag=True, - help="Skip Step 0d: Character reference processing (use existing processed references)", -) -@click.option( - "--interactive-character-processing", - is_flag=True, - help="Enable interactive mode for character reference processing (allows manual face selection)", -) -@click.option("--skip-transcode", is_flag=True, help="Skip Step 1: Transcoding (use existing transcoded videos)") 
-@click.option("--skip-transcribe", is_flag=True, help="Skip Step 2: Transcription (use existing transcriptions)") -@click.option("--skip-text-analysis", is_flag=True, help="Skip Step 3: Text analysis (use existing text statistics)") -@click.option("--skip-scenes", is_flag=True, help="Skip Step 4: Scene detection (use existing scene timestamps)") -@click.option("--skip-frame-export", is_flag=True, help="Skip Step 5: Frame export (use existing frames)") -@click.option("--skip-embeddings", is_flag=True, help="Skip Step 6: Text embedding generation (use existing text embeddings)") -@click.option("--skip-full-episode", is_flag=True, help="Skip full episode embedding generation (only text, video, sound events)") -@click.option("--skip-image-hashing", is_flag=True, help="Skip Step 7a: Image hashing sub-step (use existing hashes)") -@click.option("--skip-video-embeddings", is_flag=True, help="Skip Step 7b: Video embeddings sub-step (use existing)") -@click.option("--skip-character-detection", is_flag=True, help="Skip Step 7c: Character detection sub-step (use existing)") -@click.option("--skip-emotion-detection", is_flag=True, help="Skip Step 7d: Emotion detection sub-step (use existing)") -@click.option("--skip-face-clustering", is_flag=True, help="Skip Step 7e: Face clustering sub-step (use existing)") -@click.option("--skip-object-detection", is_flag=True, help="Skip Step 7f: Object detection sub-step (use existing)") -@click.option("--debug-visualizations", is_flag=True, help="Enable debug visualizations for character and object detections (disabled by default)") -@click.option("--skip-elastic-documents", is_flag=True, help="Skip Step 8: Generate Elasticsearch documents (use existing documents)") -@click.option("--skip-archives", is_flag=True, help="Skip Step 9: Archive generation (use existing archives)") -@click.option("--skip-index", is_flag=True, help="Skip Step 10: Elasticsearch indexing") -@click.option("--skip-validation", is_flag=True, help="Skip Step 11: 
Output validation") -def run_all( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements - videos: Path, - episodes_info_json: Path, - transcoded_videos: Path, - transcription_jsons: Path, - scene_timestamps_dir: Path, - series_name: str, - resolution: str, - codec: str, - model: str, - language: str, - dry_run: bool, - no_state: bool, - ramdisk_path: Path, - scrape_urls: Tuple[str, ...], - character_urls: Tuple[str, ...], - search_mode: str, - transcription_mode: str, - parser_mode: str, - skip_character_reference_processing: bool, - interactive_character_processing: bool, - skip_transcode: bool, - skip_transcribe: bool, - skip_text_analysis: bool, - skip_scenes: bool, - skip_frame_export: bool, - skip_embeddings: bool, - skip_full_episode: bool, - skip_image_hashing: bool, - skip_video_embeddings: bool, - skip_character_detection: bool, - skip_emotion_detection: bool, - skip_face_clustering: bool, - skip_object_detection: bool, - debug_visualizations: bool, - skip_elastic_documents: bool, - skip_archives: bool, - skip_index: bool, - skip_validation: bool, -): - """Run complete video processing pipeline.""" - if transcoded_videos is None: # pylint: disable=duplicate-code - transcoded_videos = settings.transcode.output_dir - if codec is None: - codec = settings.transcode.codec - - if not episodes_info_json: - default_episodes_json = Path("/app/output_data") / f"{series_name}_episodes.json" - if default_episodes_json.exists(): - episodes_info_json = default_episodes_json - console.print(f"[cyan]Using existing episodes JSON: {episodes_info_json}[/cyan]") - elif scrape_urls: - episodes_info_json = default_episodes_json - console.print(f"[cyan]Will scrape episodes to: {episodes_info_json}[/cyan]") - else: - console.print("[red]Error: Either --episodes-info-json, --scrape-urls must be provided, or existing episodes JSON must exist[/red]") - console.print(f"[yellow]Expected location: {default_episodes_json}[/yellow]") - sys.exit(1) - - characters_json = 
None - default_characters_json = Path("/app/output_data") / f"{series_name}_characters.json" - - if default_characters_json.exists(): - characters_json = default_characters_json - console.print(f"[cyan]Using existing characters JSON: {characters_json}[/cyan]") - elif character_urls: - characters_json = default_characters_json - console.print(f"[cyan]Will scrape characters to: {characters_json}[/cyan]") - else: - characters_json = settings.character.characters_list_file - if characters_json and Path(characters_json).exists(): - console.print(f"[cyan]Using default characters JSON: {characters_json}[/cyan]") - else: - console.print("[yellow]No characters JSON found. Character processing may be skipped.[/yellow]") - - state_manager = create_state_manager(series_name, no_state) - - if ramdisk_path: - console.print(f"[cyan]Using ramdisk: {ramdisk_path}[/cyan]") - - params = { - "videos": videos, - "episodes_info_json": episodes_info_json, - "transcoded_videos": transcoded_videos, - "transcription_jsons": transcription_jsons, - "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": settings.frame_export.output_dir, - "name": series_name, - "resolution": resolution, - "codec": codec, - "model": model, - "language": language, - "device": "cuda", - "dry_run": dry_run, - "ramdisk_path": ramdisk_path, - "scrape_urls": scrape_urls, - "character_urls": character_urls, - "characters_json": characters_json, - "search_mode": search_mode, - "transcription_mode": transcription_mode, - "parser_mode": parser_mode, - "state_manager": state_manager, - "interactive_character_processing": interactive_character_processing, - "debug_visualizations": debug_visualizations, - "skip_image_hashing": skip_image_hashing, - "skip_video_embeddings": skip_video_embeddings, - "skip_character_detection": skip_character_detection, - "skip_character_visualization": not debug_visualizations, - "skip_emotion_detection": skip_emotion_detection, - "skip_face_clustering": skip_face_clustering, - 
"skip_object_detection": skip_object_detection, - "skip_object_visualization": not debug_visualizations, - "skip_full_episode": skip_full_episode, - } - - metadata_output_dir = Path("/app/output_data/processing_metadata") - - orchestrator = PipelineOrchestrator( - state_manager=state_manager, - series_name=series_name, - metadata_output_dir=metadata_output_dir, - ) - skip_character_visualization = not debug_visualizations - skip_object_visualization = not debug_visualizations - skip_frame_processing = ( - skip_image_hashing and skip_video_embeddings and skip_character_detection - and skip_character_visualization and skip_emotion_detection and skip_face_clustering - and skip_object_detection and skip_object_visualization - ) - - orchestrator.add_step("Scraping episode metadata", "0a/14", run_scrape_step, skip=False) - orchestrator.add_step("Scraping character metadata", "0b/14", run_character_scrape_step, skip=False) - orchestrator.add_step("Downloading character references", "0c/14", run_character_reference_download_step, skip=False) - orchestrator.add_step("Processing character references", "0d/14", run_character_reference_processing_step, skip=skip_character_reference_processing) - orchestrator.add_step("Transcoding videos", "1/14", run_transcode_step, skip=skip_transcode) - orchestrator.add_step("Generating transcriptions", "2/14", run_transcribe_step, skip=skip_transcribe) - orchestrator.add_step("Separating sounds and dialogues", "3/14", run_sound_separation_step, skip=skip_transcribe) - orchestrator.add_step("Analyzing transcription texts", "4/14", run_text_analysis_step, skip=skip_text_analysis) - orchestrator.add_step("Detecting scenes", "5/14", run_scene_step, skip=skip_scenes) - orchestrator.add_step("Exporting frames", "6/14", run_frame_export_step, skip=skip_frame_export) - orchestrator.add_step("Generating text embeddings", "7/14", run_embedding_step, skip=skip_embeddings) - orchestrator.add_step( - "Processing frames (hashing + embeddings + characters 
+ emotions + clustering + objects)", - "8/14", - run_frame_processing_step, - skip=skip_frame_processing, - ) - orchestrator.add_step("Generating Elasticsearch documents", "9/14", run_elastic_documents_step, skip=skip_elastic_documents) - orchestrator.add_step("Archiving Elasticsearch documents", "10/14", run_archive_generation_step, skip=skip_archives) - orchestrator.add_step("Indexing in Elasticsearch", "11/14", run_index_step, skip=skip_index) - orchestrator.add_step("Validating output data", "12/14", run_validation_step, skip=skip_validation) - - exit_code = orchestrator.execute(**params) - - if exit_code == 0: - console.print("\n[green]All steps completed successfully![/green]") - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/scrape_episodes.py b/preprocessor/cli/commands/scrape_episodes.py deleted file mode 100644 index 9813d9115..000000000 --- a/preprocessor/cli/commands/scrape_episodes.py +++ /dev/null @@ -1,65 +0,0 @@ -from pathlib import Path -import sys -from typing import Tuple - -import click - -from preprocessor.scraping.episode_scraper import EpisodeScraper - - -@click.command(name="scrape-episodes", context_settings={"show_default": True}) -@click.option( - "--urls", - multiple=True, - required=True, - help="URL to scrape (specify multiple times for multiple sources)", -) -@click.option( - "--output-file", - type=click.Path(path_type=Path), - required=True, - help="Output JSON file path", -) -@click.option( - "--headless/--no-headless", - default=True, - help="Run browser in headless mode", -) -@click.option( - "--merge-sources/--no-merge", - default=True, - help="Merge data from multiple sources", -) -@click.option( - "--videos-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory containing video files for coverage validation", -) -@click.option( - "--parser-mode", - type=click.Choice(["normal", "premium"], case_sensitive=False), - default="normal", - help="Parser mode: normal (Qwen local model) or 
premium (Gemini 2.5 Flash)", -) -def scrape_episodes( - urls: Tuple[str, ...], - output_file: Path, - headless: bool, - merge_sources: bool, - videos_dir: Path, - parser_mode: str, -): - """Scrape episode metadata from websites.""" - scraper = EpisodeScraper( - { - "urls": list(urls), - "output_file": output_file, - "headless": headless, - "merge_sources": merge_sources, - "videos_dir": videos_dir, - "parser_mode": parser_mode, - }, - ) - - exit_code = scraper.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/search.py b/preprocessor/cli/commands/search.py deleted file mode 100644 index d6c31dcb1..000000000 --- a/preprocessor/cli/commands/search.py +++ /dev/null @@ -1,753 +0,0 @@ -# pylint: disable=duplicate-code,too-many-arguments,too-many-statements -import asyncio -import json -from pathlib import Path -import sys - -from PIL import Image -import click -from elasticsearch import AsyncElasticsearch -from qwen_vl_utils import process_vision_info -import torch -from transformers import ( - AutoModelForVision2Seq, - AutoProcessor, -) - -from preprocessor.config.config import settings -from preprocessor.hashing.image_hasher import PerceptualHasher -from preprocessor.utils.constants import ( - ElasticsearchAggregationKeys, - ElasticsearchKeys, - EpisodeMetadataKeys, -) - -_model = None -_processor = None -_device = None -_hasher = None - - -def load_model(): - global _model, _processor, _device # pylint: disable=global-statement - if _model is not None: - return _model, _processor, _device - - click.echo("Loading embedding model...", err=True) - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is required but not available. 
This pipeline requires GPU.") - - model_name = settings.embedding_model.model_name - _device = "cuda" - - _model = AutoModelForVision2Seq.from_pretrained( - model_name, - dtype=torch.bfloat16, - device_map="auto", - ) - _processor = AutoProcessor.from_pretrained(model_name) - - click.echo(f"Model loaded on {_device}", err=True) - return _model, _processor, _device - - -def get_text_embedding(text): - model, processor, device = load_model() - - messages = [{ - "role": "user", - "content": [{"type": "text", "text": text}], - }] - - text_inputs = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - ).to(device) - - with torch.no_grad(): - output = model(input_ids=text_inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - - return embedding.float().cpu().numpy().tolist() - - -def _get_image_embedding(image_path): - model, processor, device = load_model() - - messages = [{ - "role": "user", - "content": [ - {"type": "image", "image": image_path}, - {"type": "text", "text": "Describe this image."}, - ], - }] - - text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = process_vision_info(messages) - - inputs = processor( - text=[text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ) - inputs = inputs.to(device) - - with torch.no_grad(): - output = model(**inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - - return embedding.float().cpu().numpy().tolist() - - -def _load_hasher(): - global _hasher # pylint: disable=global-statement - if _hasher is not None: - return _hasher - - click.echo("Loading perceptual hasher...", err=True) - if not torch.cuda.is_available(): - raise 
RuntimeError("CUDA is required but not available. This pipeline requires GPU.") - - _hasher = PerceptualHasher(device="cuda", hash_size=8) - click.echo("Hasher loaded on cuda", err=True) - return _hasher - - -def _get_perceptual_hash(image_path): - hasher = _load_hasher() - image = Image.open(image_path).convert("RGB") - hashes = hasher.compute_phash_batch([image]) - return hashes[0] if hashes else None - - -async def search_text_query(es_client, query, season=None, episode=None, limit=20): - must_clauses = [{ - "multi_match": { - "query": query, - "fields": ["text^2", "episode_metadata.title"], - "fuzziness": "AUTO", - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - must_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - query_body = {"bool": {"must": must_clauses}} - - return await es_client.search( - index="ranczo_segments", - query=query_body, - size=limit, - _source=["episode_id", "segment_id", "text", "start_time", "end_time", "speaker", "video_path", "episode_metadata", "scene_info"], - ) - - -async def search_text_semantic(es_client, text, season=None, episode=None, limit=10): - embedding = get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - knn_query = { - "field": "text_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_text_embeddings", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "embedding_id", "text", "segment_range", - "video_path", "episode_metadata", "scene_info", - ], - ) - - -async def search_video_semantic(es_client, image_path, season=None, episode=None, character=None, 
limit=10): - embedding = _get_image_embedding(image_path) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - if character: - filter_clauses.append({ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }) - - knn_query = { - "field": "video_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_video_frames", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", - "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", - ], - ) - - -async def search_text_to_video(es_client, text, season=None, episode=None, character=None, limit=10): - embedding = get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - if character: - filter_clauses.append({ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }) - - knn_query = { - "field": "video_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_video_frames", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", - "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", - ], - ) - - -async def search_by_character(es_client, character, 
season=None, episode=None, limit=20): - must_clauses = [{ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - must_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - return await es_client.search( - index="ranczo_video_frames", - query={"bool": {"must": must_clauses}}, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "video_path", "episode_metadata", "character_appearances", "scene_info"], - ) - - -async def search_by_emotion(es_client, emotion, season=None, episode=None, character=None, limit=20): - nested_must = [{"term": {"character_appearances.emotion.label": emotion}}] - if character: - nested_must.append({"term": {"character_appearances.name": character}}) - - must_clauses = [{ - "nested": { - "path": "character_appearances", - "query": {"bool": {"must": nested_must}}, - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - must_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - nested_filter = {"term": {"character_appearances.emotion.label": emotion}} - if character: - nested_filter = { - "bool": { - "must": [ - {"term": {"character_appearances.emotion.label": emotion}}, - {"term": {"character_appearances.name": character}}, - ], - }, - } - - return await es_client.search( - index="ranczo_video_frames", - query={"bool": {"must": must_clauses}}, - sort=[ - { - "character_appearances.emotion.confidence": { - "order": "desc", - "nested": { - "path": "character_appearances", - "filter": nested_filter, - }, - }, - }, - ], - track_scores=True, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "video_path", "episode_metadata", "character_appearances", "scene_info"], - ) - - -async def search_by_object(es_client, 
object_query, season=None, episode=None, limit=20): - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - must_clauses = [] - - if ":" in object_query: - object_class, count_filter = object_query.split(":", 1) - object_class = object_class.strip() - - if count_filter.endswith("+"): - min_count = int(count_filter[:-1]) - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"range": {"detected_objects.count": {"gte": min_count}}}, - ], - }, - }, - }, - }) - elif "-" in count_filter: - min_c, max_c = count_filter.split("-") - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"range": {"detected_objects.count": {"gte": int(min_c), "lte": int(max_c)}}}, - ], - }, - }, - }, - }) - else: - exact_count = int(count_filter) - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"term": {"detected_objects.count": exact_count}}, - ], - }, - }, - }, - }) - else: - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "term": {"detected_objects.class": object_query.strip()}, - }, - }, - }) - - query_body = { - "bool": { - "must": must_clauses, - "filter": filter_clauses, - }, - } - - object_class = object_query.split(":")[0].strip() if ":" in object_query else object_query.strip() - - return await es_client.search( - index="ranczo_video_frames", - query=query_body, - sort=[ - { - "detected_objects.count": { - "order": "desc", - "nested": { - "path": "detected_objects", - "filter": {"term": {"detected_objects.class": object_class}}, - }, - }, - }, - ], - 
track_scores=True, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "detected_objects", "character_appearances", "video_path", "episode_metadata", "scene_info"], - ) - - -async def search_perceptual_hash(es_client, phash, limit=10): - return await es_client.search( - index="ranczo_video_frames", - query={"term": {"perceptual_hash": phash}}, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "video_path", "episode_metadata", "perceptual_hash", "scene_info"], - ) - - -async def list_characters(es_client): - result = await es_client.search( - index="ranczo_video_frames", - size=0, - aggs={ - "characters_nested": { - "nested": {"path": "character_appearances"}, - "aggs": { - "character_names": { - "terms": {"field": "character_appearances.name", "size": 1000}, - }, - }, - }, - }, - ) - buckets = result["aggregations"]["characters_nested"]["character_names"]["buckets"] - return [(b["key"], b["doc_count"]) for b in buckets] - - -async def list_objects(es_client): - result = await es_client.search( - index="ranczo_video_frames", - size=0, - aggs={ - "objects_nested": { - "nested": {"path": "detected_objects"}, - "aggs": { - "object_classes": { - "terms": {"field": "detected_objects.class", "size": 1000}, - }, - }, - }, - }, - ) - buckets = result["aggregations"]["objects_nested"]["object_classes"]["buckets"] - return [(b["key"], b["doc_count"]) for b in buckets] - - -async def search_episode_name(es_client, query, season=None, limit=20): - must_clauses = [{ - "multi_match": { - "query": query, - "fields": ["title^2", "episode_metadata.title"], - "fuzziness": "AUTO", - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - - query_body = {"bool": {"must": must_clauses}} - - return await es_client.search( - index="ranczo_episode_names", - query=query_body, - size=limit, - _source=["episode_id", "title", "video_path", "episode_metadata"], - ) - - -async def 
search_episode_name_semantic(es_client, text, season=None, limit=10): - embedding = get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - - knn_query = { - "field": "title_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_episode_names", - knn=knn_query, - size=limit, - _source=["episode_id", "title", "video_path", "episode_metadata"], - ) - - -async def get_stats(es_client): - return { - "segments": (await es_client.count(index="ranczo_segments"))["count"], - "text_embeddings": (await es_client.count(index="ranczo_text_embeddings"))["count"], - "video_embeddings": (await es_client.count(index="ranczo_video_frames"))["count"], - "episode_names": (await es_client.count(index="ranczo_episode_names"))["count"], - } - - -def format_timestamp(seconds): - minutes = int(seconds // 60) - secs = seconds % 60 - return f"{minutes}m {secs:.1f}s" - - -def _format_scene_context(scene_info): - if not scene_info: - return "" - start = format_timestamp(scene_info.get('scene_start_time', 0)) - end = format_timestamp(scene_info.get('scene_end_time', 0)) - return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" - - -def _print_results(result, result_type="text"): # pylint: disable=too-many-locals - total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] - hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] - - click.echo(f"\nZnaleziono: {total} wynikow") - click.echo("=" * 80) - - for i, hit in enumerate(hits, 1): - source = hit[ElasticsearchKeys.SOURCE] - score = hit[ElasticsearchKeys.SCORE] - meta = source[EpisodeMetadataKeys.EPISODE_METADATA] - scene_ctx = _format_scene_context(source.get("scene_info")) - - click.echo(f"\n[{i}] Score: {score:.2f}") - season_code = "S00" 
if meta['season'] == 0 else f"S{meta['season']:02d}" - click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") - - if result_type == "text": - click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") - start_time = format_timestamp(source['start_time']) - end_time = format_timestamp(source['end_time']) - click.echo(f"Time: {start_time} - {end_time}{scene_ctx}") - click.echo(f"Speaker: {source.get('speaker', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == "text_semantic": - click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") - click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == "episode_name": - click.echo(f"Episode Title: {source.get('title', 'N/A')}") - else: - timestamp = format_timestamp(source['timestamp']) - click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}") - if "frame_type" in source: - click.echo(f"Type: {source['frame_type']}") - if "scene_number" in source: - click.echo(f"Scene number: {source['scene_number']}") - if "perceptual_hash" in source: - click.echo(f"Hash: {source['perceptual_hash']}") - if source.get("character_appearances"): - chars_strs = [] - for char in source['character_appearances']: - char_str = char.get('name', 'Unknown') - if char.get('emotion'): - emotion_label = char['emotion'].get('label', '?') - emotion_conf = char['emotion'].get('confidence', 0) - char_str += f" ({emotion_label} {emotion_conf:.2f})" - chars_strs.append(char_str) - click.echo(f"Characters: {', '.join(chars_strs)}") - if source.get("detected_objects"): - objects_str = ", ".join([f"{obj['class']}:{obj['count']}" for obj in source['detected_objects']]) - click.echo(f"Objects: {objects_str}") - - click.echo(f"Path: {source['video_path']}") - - -@click.command(context_settings={"show_default": True}) -@click.option("--text", type=str, help="Full-text search po 
transkrypcjach") -@click.option("--text-semantic", type=str, help="Semantic search po text embeddings") -@click.option("--text-to-video", type=str, help="Cross-modal search: text query w video embeddings") -@click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search po video embeddings") -@click.option("--hash", "phash", type=str, help="Szukaj po perceptual hash (podaj hash string lub sciezke do obrazka)") -@click.option("--character", type=str, help="Szukaj po postaci") -@click.option("--emotion", type=str, help="Szukaj po emocji (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)") -@click.option("--object", "object_query", type=str, help="Szukaj po wykrytych obiektach (np. 'dog', 'person:5+', 'chair:2-4')") -@click.option("--episode-name", type=str, help="Fuzzy search po nazwach odcinkow") -@click.option("--episode-name-semantic", type=str, help="Semantic search po nazwach odcinkow") -@click.option("--list-characters", "list_chars_flag", is_flag=True, help="Lista wszystkich postaci") -@click.option("--list-objects", "list_objects_flag", is_flag=True, help="Lista wszystkich klas obiektow") -@click.option("--season", type=int, help="Filtruj po sezonie") -@click.option("--episode", type=int, help="Filtruj po odcinku") -@click.option("--limit", type=int, default=20, help="Limit wynikow") -@click.option("--stats", is_flag=True, help="Pokaz statystyki indeksow") -@click.option("--json-output", is_flag=True, help="Output w formacie JSON") -@click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") -def search( # pylint: disable=too-many-locals - text, text_semantic, text_to_video, image, phash, character, emotion, object_query, episode_name, - episode_name_semantic, list_chars_flag, list_objects_flag, season, episode, limit, - stats, json_output, host, -): - """Search tool - comprehensive Elasticsearch search""" - - if not any([ - text, text_semantic, text_to_video, image, phash, 
character, emotion, - object_query, episode_name, episode_name_semantic, list_chars_flag, list_objects_flag, stats, - ]): - click.echo("Podaj przynajmniej jedna opcje wyszukiwania. Uzyj --help", err=True) - sys.exit(1) - - hash_value = None - if phash: - phash_path = Path(phash) - if phash_path.exists() and phash_path.is_file(): - click.echo(f"Computing perceptual hash from image: {phash}", err=True) - hash_value = _get_perceptual_hash(str(phash_path)) - if hash_value: - click.echo(f"Computed hash: {hash_value}", err=True) - else: - click.echo("Failed to compute hash from image", err=True) - sys.exit(1) - else: - hash_value = phash - - async def run(): # pylint: disable=too-many-branches - es_client = AsyncElasticsearch(hosts=[host], verify_certs=False) - - try: - await es_client.ping() - except Exception: - click.echo(f"✗ Cannot connect to Elasticsearch at {host}", err=True) - click.echo("Make sure Elasticsearch is running:", err=True) - click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) - sys.exit(1) - - try: - if stats: - result = await get_stats(es_client) - if json_output: - click.echo(json.dumps(result, indent=2)) - else: - click.echo("\nStatystyki:") - click.echo(f" Segments: {result['segments']:,}") - click.echo(f" Text Embeddings: {result['text_embeddings']:,}") - click.echo(f" Video Embeddings: {result['video_embeddings']:,}") - click.echo(f" Episode Names: {result['episode_names']:,}") - - elif list_chars_flag: - chars = await list_characters(es_client) - if json_output: - click.echo(json.dumps(chars, indent=2)) - else: - click.echo(f"\nZnaleziono {len(chars)} postaci:") - for char, count in sorted(chars, key=lambda x: -x[1]): - click.echo(f" {char}: {count:,} wystapien") - - elif list_objects_flag: - objects = await list_objects(es_client) - if json_output: - click.echo(json.dumps(objects, indent=2)) - else: - click.echo(f"\nZnaleziono {len(objects)} klas obiektow:") - for obj, count in sorted(objects, key=lambda x: -x[1]): - 
click.echo(f" {obj}: {count:,} wystapien") - - elif text: - result = await search_text_query(es_client, text, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "text") - - elif text_semantic: - result = await search_text_semantic(es_client, text_semantic, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "text_semantic") - - elif text_to_video: - result = await search_text_to_video(es_client, text_to_video, season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "video") - - elif image: - result = await search_video_semantic(es_client, str(image), season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "video") - - elif emotion: - result = await search_by_emotion(es_client, emotion, season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "video") - - elif character: - result = await search_by_character(es_client, character, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "video") - - elif object_query: - result = await search_by_object(es_client, object_query, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "video") - - elif hash_value: - result = await search_perceptual_hash(es_client, hash_value, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "video") - - elif episode_name: - result = await search_episode_name(es_client, episode_name, season, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, 
"episode_name") - - elif episode_name_semantic: - result = await search_episode_name_semantic(es_client, episode_name_semantic, season, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - _print_results(result, "episode_name") - - finally: - await es_client.close() - - asyncio.run(run()) diff --git a/preprocessor/cli/commands/separate_sounds.py b/preprocessor/cli/commands/separate_sounds.py deleted file mode 100644 index cc7f7bc88..000000000 --- a/preprocessor/cli/commands/separate_sounds.py +++ /dev/null @@ -1,45 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli_utils.resource_scope import ResourceScope -from preprocessor.config.config import settings -from preprocessor.transcription.processors.sound_separator import SoundEventSeparator - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--transcription-dir", - type=click.Path(exists=True, path_type=Path), - default=str(settings.transcription.output_dir), - help="Directory with transcription JSON files", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--series-name", - required=True, - help="Series name", -) -def separate_sounds( - transcription_dir: Path, - episodes_info_json: Path, - series_name: str, -): - """Separate sound events from dialogues in transcription files.""" - args = { - "transcription_dir": transcription_dir, - "episodes_info_json": episodes_info_json, - "series_name": series_name, - } - - with ResourceScope(): - separator = SoundEventSeparator(args) - exit_code = separator.work() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/transcode.py b/preprocessor/cli/commands/transcode.py deleted file mode 100644 index 38dc1e760..000000000 --- a/preprocessor/cli/commands/transcode.py +++ /dev/null @@ -1,95 +0,0 @@ -from pathlib import Path -import sys - -import click - 
-from preprocessor.cli.utils import create_state_manager -from preprocessor.cli_utils.resource_scope import ResourceScope -from preprocessor.config.config import ( - TranscodeConfig, - settings, -) -from preprocessor.utils.resolution import Resolution -from preprocessor.video.transcoder import VideoTranscoder - - -@click.command(context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--transcoded-videos", - type=click.Path(path_type=Path), - default=str(settings.transcode.output_dir), - help="Output directory for transcoded videos", -) -@click.option( - "--resolution", - type=click.Choice(Resolution.get_all_choices()), - default="720p", - help="Target resolution for videos", -) -@click.option( - "--codec", - help="Video codec: h264_nvenc (GPU), libx264 (CPU)", -) -@click.option( - "--gop-size", - type=float, - help="Keyframe interval in seconds", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata", -) -@click.option("--name", help="Series name for state management and resume support") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def transcode( - videos: Path, - transcoded_videos: Path, - resolution: str, - codec: str, - gop_size: float, - episodes_info_json: Path, - name: str, - no_state: bool, -): - """Transcode videos to target resolution with FFmpeg.""" - if transcoded_videos is None: # pylint: disable=duplicate-code - transcoded_videos = settings.transcode.output_dir - if codec is None: - codec = settings.transcode.codec - if gop_size is None: - gop_size = settings.transcode.gop_size - - state_manager = create_state_manager(name, no_state) - - video_bitrate_mbps = settings.transcode.calculate_video_bitrate_mbps() - minrate_mbps = settings.transcode.calculate_minrate_mbps() - maxrate_mbps = settings.transcode.calculate_maxrate_mbps() 
- bufsize_mbps = settings.transcode.calculate_bufsize_mbps() - - config = TranscodeConfig( - videos=videos, - transcoded_videos=transcoded_videos, - resolution=Resolution.from_str(resolution), - codec=codec, - gop_size=gop_size, - episodes_info_json=episodes_info_json, - video_bitrate_mbps=video_bitrate_mbps, - minrate_mbps=minrate_mbps, - maxrate_mbps=maxrate_mbps, - bufsize_mbps=bufsize_mbps, - audio_bitrate_kbps=settings.transcode.audio_bitrate_kbps, - ) - config_dict = config.to_dict() - config_dict["state_manager"] = state_manager - config_dict["series_name"] = name or "unknown" - - with ResourceScope(): - transcoder = VideoTranscoder(config_dict) - exit_code = transcoder.work() - - if state_manager and exit_code == 0: - state_manager.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/transcribe.py b/preprocessor/cli/commands/transcribe.py deleted file mode 100644 index dec170f73..000000000 --- a/preprocessor/cli/commands/transcribe.py +++ /dev/null @@ -1,79 +0,0 @@ -from pathlib import Path -import sys -from typing import Tuple - -import click - -from preprocessor.cli_utils.resource_scope import ResourceScope -from preprocessor.config.config import ( - TranscriptionConfig, - settings, -) -from preprocessor.transcription.generator import TranscriptionGenerator - -# pylint: disable=duplicate-code - - - -@click.command(context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--transcription-jsons", - type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), - help="Output directory for transcription JSONs", -) -@click.option( - "--model", - default=settings.transcription.model, - help="Whisper model: tiny, base, small, medium, large, large-v3-turbo", -) -@click.option( - 
"--language", - default=settings.transcription.language, - help="Language for transcription", -) -@click.option( - "--extra-json-keys", - multiple=True, - help="Additional JSON keys to remove from output (can specify multiple times)", -) -@click.option( - "--name", - required=True, - help="Series name for output files", -) -def transcribe( - videos: Path, - episodes_info_json: Path, - transcription_jsons: Path, - model: str, - language: str, - extra_json_keys: Tuple[str, ...], - name: str, -): - """Generate transcriptions using Whisper.""" - config = TranscriptionConfig( - videos=videos, - episodes_info_json=episodes_info_json, - transcription_jsons=transcription_jsons, - model=model, - language=language, - device="cuda", - extra_json_keys_to_remove=list(extra_json_keys), - name=name, - ) - - config_dict = config.to_dict() - - with ResourceScope(): - generator = TranscriptionGenerator(config_dict) - exit_code = generator.work() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/transcribe_elevenlabs.py b/preprocessor/cli/commands/transcribe_elevenlabs.py deleted file mode 100644 index 4814b4273..000000000 --- a/preprocessor/cli/commands/transcribe_elevenlabs.py +++ /dev/null @@ -1,81 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings -from preprocessor.transcription.elevenlabs import ElevenLabsTranscriber -from preprocessor.utils.console import console - - -@click.command(name="transcribe-elevenlabs", context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), - help="Output directory for transcriptions", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata", -) 
-@click.option("--name", required=True, help="Series name") -@click.option( - "--api-key", - envvar="ELEVEN_API_KEY", - help="ElevenLabs API key (or set ELEVEN_API_KEY env var)", -) -@click.option( - "--model-id", - default="scribe_v1", - help="ElevenLabs model ID", -) -@click.option( - "--language-code", - default="pol", - help="Language code: pol, eng, etc", -) -@click.option( - "--diarize/--no-diarize", - default=True, - help="Enable speaker diarization", -) -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def transcribe_elevenlabs( - videos: Path, - output_dir: Path, - episodes_info_json: Path, - name: str, - api_key: str, - model_id: str, - language_code: str, - diarize: bool, - no_state: bool, -): - """Transcribe videos using ElevenLabs API.""" - state_manager = create_state_manager(name, no_state) - - transcriber = ElevenLabsTranscriber( - { - "videos": videos, - "output_dir": output_dir, - "episodes_info_json": episodes_info_json, - "series_name": name, - "api_key": api_key, - "model_id": model_id, - "language_code": language_code, - "diarize": diarize, - "state_manager": state_manager, - }, - ) - - exit_code = transcriber.work() - - if state_manager and exit_code == 0: - console.print("[green]Transcription completed successfully![/green]") - state_manager.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/validate.py b/preprocessor/cli/commands/validate.py deleted file mode 100644 index e90007d37..000000000 --- a/preprocessor/cli/commands/validate.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.validation.validator import Validator - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--season", - type=str, - required=True, - help="Season to validate (e.g., S10)", -) -@click.option( - "--anomaly-threshold", - type=float, - default=20.0, - help="Threshold for anomaly detection (%)", -) 
@dataclass(frozen=True)
class PipelineSetup:
    """Immutable bundle of the objects produced by ``PipelineContextFactory.build``."""

    context: ExecutionContext
    logger: ErrorHandlingLogger
    state_manager: StateManager
    # Stays None when ``build`` is called with ``with_episode_manager=False``.
    episode_manager: Optional[EpisodeManager] = None


class PipelineContextFactory:
    """Builds the logger, state manager, execution context and episode manager for a series."""

    @staticmethod
    def build(
        series: str,
        logger_name: str,
        force_rerun: bool = False,
        with_episode_manager: bool = True,
    ) -> PipelineSetup:
        """Assemble a ``PipelineSetup`` for ``series``.

        Creates the logger, makes sure ``<output base>/<series>`` exists, loads
        or creates the pipeline state in that directory and wraps everything in
        an ``ExecutionContext``.

        Args:
            series: Series name; also the name of the per-series directories.
            logger_name: Passed to the logger as its ``class_name``.
            force_rerun: Forwarded unchanged to ``ExecutionContext``.
            with_episode_manager: When False no ``EpisodeManager`` is created
                and the returned setup carries ``episode_manager=None``.

        Returns:
            The populated ``PipelineSetup``.
        """
        log = PipelineContextFactory.__create_logger(logger_name)
        output_root = PathService.get_output_base()
        series_dir = PipelineContextFactory.__ensure_output_dir(output_root, series)
        state = PipelineContextFactory.__create_state_manager(series, series_dir)

        execution_context = ExecutionContext(
            series_name=series,
            base_output_dir=output_root,
            logger=log,
            state_manager=state,
            force_rerun=force_rerun,
        )

        if with_episode_manager:
            episodes = PipelineContextFactory.__create_episode_manager(
                series, PathService.get_input_base(), log,
            )
        else:
            episodes = None

        return PipelineSetup(
            context=execution_context,
            logger=log,
            state_manager=state,
            episode_manager=episodes,
        )

    @staticmethod
    def __create_episode_manager(
        series: str, input_base: Path, logger: ErrorHandlingLogger,
    ) -> Optional[EpisodeManager]:
        """Create the episode manager from the first episodes file that exists.

        ``<input_base>/<series>/episodes.json`` is preferred, then
        ``<output base>/<series>/<series>_episodes.json``. When neither exists
        the manager receives ``None`` as its episodes file.
        """
        candidates = (
            input_base / series / 'episodes.json',
            PathService.get_output_base() / series / f'{series}_episodes.json',
        )
        # ``next`` stops at the first hit, so the output copy is only probed
        # when the input copy is missing.
        episodes_json: Optional[Path] = next(
            (candidate for candidate in candidates if candidate.exists()),
            None,
        )
        return EpisodeManager(episodes_json, series, logger)

    @staticmethod
    def __create_logger(
        command_name: str,
        loglevel: int = ErrorHandlingLogger.INFO,
    ) -> ErrorHandlingLogger:
        """Return a logger named after the command, configured with ``error_exit_code=1``."""
        return ErrorHandlingLogger(
            class_name=command_name,
            loglevel=loglevel,
            error_exit_code=1,
        )

    @staticmethod
    def __create_state_manager(series_name: str, working_dir: Path) -> StateManager:
        """Create the state manager and immediately load or create its state."""
        manager = StateManager(series_name=series_name, working_dir=working_dir)
        manager.load_or_create_state()
        return manager

    @staticmethod
    def __ensure_output_dir(base_dir: Path, series: str) -> Path:
        """Create ``base_dir / series`` (parents included) if needed and return it."""
        target = base_dir / series
        target.mkdir(parents=True, exist_ok=True)
        return target


def setup_pipeline_context(
    series: str,
    logger_name: str,
    force_rerun: bool = False,
    with_episode_manager: bool = True,
) -> PipelineSetup:
    """Function-style shortcut for ``PipelineContextFactory.build`` with the same arguments."""
    return PipelineContextFactory.build(
        series=series,
        logger_name=logger_name,
        force_rerun=force_rerun,
        with_episode_manager=with_episode_manager,
    )


def create_cli_logger(command_name: str, loglevel: int = ErrorHandlingLogger.INFO) -> ErrorHandlingLogger:
    """Return a standalone CLI logger for ``command_name`` configured with ``error_exit_code=1``."""
    return ErrorHandlingLogger(
        class_name=command_name,
        loglevel=loglevel,
        error_exit_code=1,
    )
metadata_output_dir: Optional[Path] = None): - self.state_manager = state_manager - self.steps: List[PipelineStep] = [] - self.series_name = series_name - self.metadata_output_dir = metadata_output_dir - self.metadata: Optional[ProcessingMetadata] = None - - def add_step(self, name: str, step_num: str, func: Callable, skip: bool = False): - self.steps.append(PipelineStep(name, step_num, func, skip)) - - def execute(self, **params) -> int: - if self.series_name: - self.metadata = ProcessingMetadata(series_name=self.series_name, params=params) - - try: - exit_code = self.__run_all_steps(params) - if self.state_manager: - self.state_manager.cleanup() - self.__finalize_metadata(exit_code) - return exit_code - except KeyboardInterrupt: - console.print("\n[yellow]Pipeline interrupted by user[/yellow]") - self.__finalize_metadata(130) - return 130 - - def __run_all_steps(self, params: Dict[str, Any]) -> int: - for step in self.steps: - step_metadata = None - if self.metadata: - step_metadata = self.metadata.add_step(name=step.name, step_num=step.step_num) - - if step.skip: - console.print(f"[yellow]Step {step.step_num}: {step.name} - SKIPPED[/yellow]") - if step_metadata: - step_metadata.skip() - continue - - console.print(f"[bold blue]Step {step.step_num}: {step.name}[/bold blue]") - - if step_metadata: - step_metadata.start() - - try: - with ResourceScope(): - exit_code = step.execute_func(**params) - except KeyboardInterrupt: - console.print(f"\n[yellow]Step {step.step_num} interrupted[/yellow]") - if step_metadata: - step_metadata.finish(130) - return 130 - - if step_metadata: - step_metadata.finish(exit_code) - - if exit_code != 0: - console.print(f"[red]Step {step.step_num} failed with exit code {exit_code}[/red]") - return exit_code - - return 0 - - def __finalize_metadata(self, exit_code: int): - if self.metadata: - additional_stats = self.__collect_additional_statistics() - self.metadata.finish_processing(exit_code, additional_stats) - - if 
self.metadata_output_dir: - metadata_file = self.metadata_output_dir / f"{self.series_name}_processing_metadata.json" - self.metadata.save_to_file(metadata_file) - console.print(f"[green]Processing metadata saved to: {metadata_file}[/green]") - - def __collect_additional_statistics(self) -> Dict[str, Any]: # pylint: disable=too-many-locals - stats: Dict[str, Any] = {} - # noinspection PyBroadException - try: # pylint: disable=too-many-try-statements - transcription_jsons_dir = Path(self.metadata.params.get("transcription_jsons", "")) - if transcription_jsons_dir.exists(): - transcription_files = list(transcription_jsons_dir.rglob("*_segmented.json")) - stats["transcription_files_count"] = len(transcription_files) - stats["transcription_files"] = [f.name for f in transcription_files[:20]] - - transcoded_videos_dir = Path(self.metadata.params.get("transcoded_videos", "")) - if transcoded_videos_dir.exists(): - video_files = list(transcoded_videos_dir.rglob("*.mp4")) - stats["transcoded_videos_count"] = len(video_files) - total_size = sum(f.stat().st_size for f in video_files if f.is_file()) - stats["transcoded_videos_total_size_mb"] = round(total_size / (1024 * 1024), 2) - - output_frames_dir = Path(settings.frame_export.output_dir) - if output_frames_dir.exists(): - frame_metadata_files = list(output_frames_dir.rglob("*_frame_metadata.json")) - stats["processed_episodes_count"] = len(frame_metadata_files) - total_frames = 0 - for metadata_file in frame_metadata_files: - try: - with open(metadata_file, "r", encoding="utf-8") as f: - data = json.load(f) - total_frames += data.get("statistics", {}).get("total_frames", 0) - except Exception: - pass - stats["total_frames_extracted"] = total_frames - - embeddings_dir = Path(settings.embedding.default_output_dir) - if embeddings_dir.exists(): - text_embedding_files = list(embeddings_dir.rglob("*_embeddings_text.json")) - video_embedding_files = list(embeddings_dir.rglob("*_embeddings_video.json")) - 
stats["text_embedding_files_count"] = len(text_embedding_files) - stats["video_embedding_files_count"] = len(video_embedding_files) - - image_hashes_dir = Path(settings.image_hash.output_dir) - if image_hashes_dir.exists(): - hash_files = list(image_hashes_dir.rglob("*_image_hashes.json")) - stats["image_hash_files_count"] = len(hash_files) - - elastic_docs_dir = get_output_path("elastic_documents") - if elastic_docs_dir.exists(): - segment_files = list((elastic_docs_dir / ELASTIC_SUBDIRS.text_segments).rglob("*.jsonl")) - text_emb_files = list((elastic_docs_dir / ELASTIC_SUBDIRS.text_embeddings).rglob("*.jsonl")) - video_frame_files = list((elastic_docs_dir / ELASTIC_SUBDIRS.video_frames).rglob("*.jsonl")) - stats["elastic_documents"] = { - ELASTIC_SUBDIRS.text_segments: len(segment_files), - ELASTIC_SUBDIRS.text_embeddings: len(text_emb_files), - ELASTIC_SUBDIRS.video_frames: len(video_frame_files), - } - - except Exception: - pass - - return stats diff --git a/preprocessor/cli/pipeline/steps.py b/preprocessor/cli/pipeline/steps.py deleted file mode 100644 index f304ba09b..000000000 --- a/preprocessor/cli/pipeline/steps.py +++ /dev/null @@ -1,608 +0,0 @@ -from pathlib import Path - -from preprocessor.config.config import settings -from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS -from preprocessor.utils.console import console -from preprocessor.video.emotion_detection_subprocessor import EmotionDetectionSubProcessor -from preprocessor.video.face_clustering_subprocessor import FaceClusteringSubProcessor -from preprocessor.video.frame_processor import FrameProcessor -from preprocessor.video.frame_subprocessors import ( - CharacterDetectionSubProcessor, - CharacterDetectionVisualizationSubProcessor, - ImageHashSubProcessor, - ObjectDetectionSubProcessor, - ObjectDetectionVisualizationSubProcessor, - VideoEmbeddingSubProcessor, -) - -# pylint: disable=duplicate-code - - -def run_scrape_step(scrape_urls, episodes_info_json, videos=None, 
parser_mode="normal", **_kwargs): - from preprocessor.scraping.episode_scraper import EpisodeScraper # pylint: disable=import-outside-toplevel - - if not scrape_urls: - return 0 - - if episodes_info_json.exists(): - console.print( - f"\n[yellow]Scraping episode metadata... SKIPPED (file exists: {episodes_info_json})[/yellow]", - ) - return 0 - - scraper = EpisodeScraper( - { - "urls": list(scrape_urls), - "output_file": episodes_info_json, - "headless": True, - "merge_sources": True, - "videos_dir": videos, - "parser_mode": parser_mode, - }, - ) - scrape_exit_code = scraper.work() - - if scrape_exit_code != 0: - console.print("[red]Scraping failed, aborting pipeline[/red]") - return scrape_exit_code - - console.print(f"[green]Episode metadata saved to: {episodes_info_json}[/green]") - return 0 - - -def run_character_scrape_step(character_urls, characters_json, name, parser_mode="normal", **_kwargs): - from preprocessor.scraping.character_scraper import CharacterScraper # pylint: disable=import-outside-toplevel - - if not character_urls: - return 0 - - if characters_json.exists(): - console.print( - f"\n[yellow]Scraping character metadata... 
SKIPPED (file exists: {characters_json})[/yellow]", - ) - return 0 - - scraper = CharacterScraper( - { - "urls": list(character_urls), - "output_file": characters_json, - "series_name": name, - "headless": True, - "parser_mode": parser_mode, - }, - ) - scrape_exit_code = scraper.work() - - if scrape_exit_code != 0: - console.print("[red]Character scraping failed[/red]") - return scrape_exit_code - - console.print(f"[green]Character metadata saved to: {characters_json}[/green]") - return 0 - - -def run_character_reference_download_step(name, characters_json, search_mode="normal", **_kwargs): - from preprocessor.characters.reference_downloader import CharacterReferenceDownloader # pylint: disable=import-outside-toplevel - - if not characters_json.exists(): - console.print("[yellow]No characters.json found, skipping reference download[/yellow]") - return 0 - - downloader = CharacterReferenceDownloader( - { - "characters_json": characters_json, - "series_name": name, - "output_dir": settings.character.output_dir, - "images_per_character": settings.character.reference_images_per_character, - "search_mode": search_mode, - }, - ) - return downloader.work() - - -def run_character_reference_processing_step(name, state_manager, interactive_character_processing=False, debug_visualizations=False, **_kwargs): - from preprocessor.characters.reference_processor import CharacterReferenceProcessor # pylint: disable=import-outside-toplevel - - characters_dir = settings.character.output_dir - if not characters_dir.exists() or not list(characters_dir.iterdir()): - console.print("[yellow]No character references found, skipping processing[/yellow]") - return 0 - - processor = CharacterReferenceProcessor( - { - "characters_dir": characters_dir, - "output_dir": settings.character.processed_references_dir, - "similarity_threshold": settings.character.reference_matching_threshold, - "interactive": interactive_character_processing, - "series_name": name, - "state_manager": state_manager, - 
}, - ) - exit_code = processor.work() - - if exit_code == 0 and debug_visualizations: - processor.generate_validation_grid() - - return exit_code - - -def run_character_detection_step(**kwargs): - from preprocessor.characters.detector import CharacterDetector # pylint: disable=import-outside-toplevel - - frames_dir = kwargs.get("output_frames", settings.frame_export.output_dir) - characters_dir = settings.character.output_dir - output_dir = settings.character.detections_dir - episodes_info_json = kwargs.get("episodes_info_json") - name = kwargs.get("name") - state_manager = kwargs.get("state_manager") - - detector = CharacterDetector( - { - "frames_dir": frames_dir, - "characters_dir": characters_dir, - "output_dir": output_dir, - "episodes_info_json": episodes_info_json, - "series_name": name, - "state_manager": state_manager, - }, - ) - return detector.work() - - -def run_transcode_step(videos, episodes_info_json, name, resolution, codec, state_manager, **kwargs): - from preprocessor.config.config import TranscodeConfig # pylint: disable=import-outside-toplevel - from preprocessor.utils.resolution import Resolution # pylint: disable=import-outside-toplevel - from preprocessor.video.transcoder import VideoTranscoder # pylint: disable=import-outside-toplevel - - transcoded_videos = kwargs.get("transcoded_videos") - - video_bitrate_mbps = settings.transcode.calculate_video_bitrate_mbps() - minrate_mbps = settings.transcode.calculate_minrate_mbps() - maxrate_mbps = settings.transcode.calculate_maxrate_mbps() - bufsize_mbps = settings.transcode.calculate_bufsize_mbps() - - transcode_config = TranscodeConfig( - videos=videos, - transcoded_videos=transcoded_videos, - resolution=Resolution.from_str(resolution), - codec=codec, - gop_size=settings.transcode.gop_size, - episodes_info_json=episodes_info_json, - video_bitrate_mbps=video_bitrate_mbps, - minrate_mbps=minrate_mbps, - maxrate_mbps=maxrate_mbps, - bufsize_mbps=bufsize_mbps, - 
audio_bitrate_kbps=settings.transcode.audio_bitrate_kbps, - ) - transcode_dict = transcode_config.to_dict() - transcode_dict["state_manager"] = state_manager - transcode_dict["series_name"] = name - - transcoder = VideoTranscoder(transcode_dict) - return transcoder.work() - - -def run_transcribe_step(videos, episodes_info_json, name, model, language, device, ramdisk_path, state_manager, transcription_mode="normal", **kwargs): - transcription_jsons = kwargs.get("transcription_jsons") - - if transcription_mode == "premium": - from preprocessor.transcription.elevenlabs import ElevenLabsTranscriber # pylint: disable=import-outside-toplevel - - console.print("[cyan]Using premium transcription mode (ElevenLabs API)[/cyan]") - - transcriber = ElevenLabsTranscriber( - { - "videos": videos, - "output_dir": transcription_jsons, - "episodes_info_json": episodes_info_json, - "series_name": name, - "api_key": settings.elevenlabs.api_key, - "model_id": settings.elevenlabs.model_id, - "language_code": settings.elevenlabs.language_code, - "diarize": settings.elevenlabs.diarize, - "state_manager": state_manager, - }, - ) - return transcriber.work() - - from preprocessor.config.config import TranscriptionConfig # pylint: disable=import-outside-toplevel - from preprocessor.transcription.generator import TranscriptionGenerator # pylint: disable=import-outside-toplevel - - console.print("[cyan]Using normal transcription mode (Whisper)[/cyan]") - - transcription_config = TranscriptionConfig( - videos=videos, - episodes_info_json=episodes_info_json, - transcription_jsons=transcription_jsons, - model=model, - language=language, - device=device, - extra_json_keys_to_remove=[], - name=name, - ) - transcription_dict = transcription_config.to_dict() - transcription_dict["state_manager"] = state_manager - transcription_dict["series_name"] = name - transcription_dict["ramdisk_path"] = ramdisk_path - - generator = TranscriptionGenerator(transcription_dict) - return generator.work() - - -def 
run_sound_separation_step(name, episodes_info_json, transcription_jsons, state_manager, **_kwargs): - from preprocessor.transcription.processors.sound_separator import SoundEventSeparator # pylint: disable=import-outside-toplevel - - separator = SoundEventSeparator( - { - "transcription_dir": transcription_jsons, - "episodes_info_json": episodes_info_json, - "series_name": name, - "state_manager": state_manager, - }, - ) - return separator.work() - - -def run_scene_step(device, **kwargs): - from preprocessor.video.scene_detector import SceneDetector # pylint: disable=import-outside-toplevel - - videos = kwargs.get("videos") - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - - detector = SceneDetector( - { - "videos": videos, - "output_dir": scene_timestamps_dir, - "threshold": settings.scene_detection.threshold, - "min_scene_len": settings.scene_detection.min_scene_len, - "device": device, - "series_name": name, - "episodes_info_json": episodes_info_json, - }, - ) - exit_code = detector.work() - detector.cleanup() - return exit_code - - -def run_frame_export_step(state_manager, **kwargs): - from preprocessor.video.frame_exporter import FrameExporter # pylint: disable=import-outside-toplevel - - videos = kwargs.get("videos") - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - output_frames = kwargs.get("output_frames", settings.frame_export.output_dir) - - exporter = FrameExporter( - { - "videos": videos, - "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": output_frames, - "resolution": settings.frame_export.resolution, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - return exporter.work() - - -def run_image_hashing_step(device, state_manager, **kwargs): - from preprocessor.hashing.image_hash_processor 
import ImageHashProcessor # pylint: disable=import-outside-toplevel - - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - frames_dir = kwargs.get("output_frames", settings.frame_export.output_dir) - - hasher = ImageHashProcessor( - { - "frames_dir": frames_dir, - "output_dir": settings.image_hash.output_dir, - "batch_size": settings.embedding.batch_size, - "device": device, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - exit_code = hasher.work() - hasher.cleanup() - return exit_code - - -def run_embedding_step(device, state_manager, **kwargs): - from preprocessor.embeddings.embedding_generator import EmbeddingGenerator # pylint: disable=import-outside-toplevel - - transcription_jsons = kwargs.get("transcription_jsons") - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - frames_dir = kwargs.get("output_frames", settings.frame_export.output_dir) - skip_full_episode = kwargs.get("skip_full_episode", False) - - embedding_generator = EmbeddingGenerator( - { - "transcription_jsons": transcription_jsons, - "frames_dir": frames_dir, - "output_dir": settings.embedding.default_output_dir, - "image_hashes_dir": settings.image_hash.output_dir, - "model": settings.embedding_model.model_name, - "segments_per_embedding": settings.text_chunking.segments_per_embedding, - "generate_text": True, - "generate_video": False, - "generate_full_episode": not skip_full_episode and settings.embedding.generate_full_episode_embedding, - "device": device, - "batch_size": settings.embedding.batch_size, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - exit_code = embedding_generator.work() - embedding_generator.cleanup() - return exit_code - - -def run_elastic_documents_step(**kwargs): - from preprocessor.config.config import ( # pylint: disable=import-outside-toplevel - BASE_OUTPUT_DIR, - 
get_output_path, - ) - from preprocessor.indexing.elastic_document_generator import ElasticDocumentGenerator # pylint: disable=import-outside-toplevel - - transcription_jsons = BASE_OUTPUT_DIR / settings.output_subdirs.transcriptions - embeddings_dir = BASE_OUTPUT_DIR / settings.output_subdirs.embeddings - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") or (BASE_OUTPUT_DIR / settings.output_subdirs.scenes) - character_detections_dir = BASE_OUTPUT_DIR / settings.output_subdirs.character_detections - object_detections_dir = BASE_OUTPUT_DIR / settings.output_subdirs.object_detections - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - - generator = ElasticDocumentGenerator( - { - "transcription_jsons": transcription_jsons, - "embeddings_dir": embeddings_dir, - "scene_timestamps_dir": scene_timestamps_dir, - "character_detections_dir": character_detections_dir, - "object_detections_dir": object_detections_dir, - "output_dir": get_output_path("elastic_documents"), - "series_name": name, - "episodes_info_json": episodes_info_json, - }, - ) - return generator.work() - - -def run_index_step(name, dry_run, state_manager, **kwargs): - from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel - from preprocessor.indexing.elasticsearch import ElasticSearchIndexer # pylint: disable=import-outside-toplevel - - episodes_info_json = kwargs.get("episodes_info_json") - elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents) - - indexer = ElasticSearchIndexer({ - "name": name, - "elastic_documents_dir": elastic_documents_dir, - "dry_run": dry_run, - "append": False, - "state_manager": state_manager, - "series_name": name, - "episodes_info_json": episodes_info_json, - }) - return indexer.work() - - -def run_frame_processing_step( # pylint: disable=too-many-locals,too-many-arguments - device, - state_manager, - ramdisk_path, - skip_image_hashing, - skip_video_embeddings, - 
skip_character_detection, - skip_emotion_detection, - skip_character_visualization, - skip_face_clustering, - skip_object_detection, - skip_object_visualization, - debug_visualizations=False, - **kwargs, -): - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - output_frames = kwargs.get("output_frames", settings.frame_export.output_dir) - - processor = FrameProcessor( - { - "frames_dir": output_frames, - "ramdisk_path": ramdisk_path or Path("/dev/shm"), - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - - sub_processors = [] - - if not skip_image_hashing: - hash_sub = ImageHashSubProcessor( - device=device, - batch_size=settings.embedding.batch_size, - ) - processor.add_sub_processor(hash_sub) - sub_processors.append(hash_sub) - - if not skip_video_embeddings: - embedding_sub = VideoEmbeddingSubProcessor( - device=device, - batch_size=settings.embedding.batch_size, - model_name=settings.embedding_model.model_name, - model_revision=settings.embedding_model.model_revision, - ) - processor.add_sub_processor(embedding_sub) - sub_processors.append(embedding_sub) - - if not skip_character_detection: - char_detection_sub = CharacterDetectionSubProcessor( - characters_dir=Path(settings.character.output_dir), - use_gpu=True, - threshold=settings.character.frame_detection_threshold, - ) - processor.add_sub_processor(char_detection_sub) - sub_processors.append(char_detection_sub) - - if not skip_emotion_detection: - emotion_detection_sub = EmotionDetectionSubProcessor() - processor.add_sub_processor(emotion_detection_sub) - sub_processors.append(emotion_detection_sub) - - if not skip_character_visualization: - char_viz_sub = CharacterDetectionVisualizationSubProcessor() - processor.add_sub_processor(char_viz_sub) - sub_processors.append(char_viz_sub) - - if not skip_face_clustering: - face_clustering_sub = FaceClusteringSubProcessor( - 
min_cluster_size=settings.face_clustering.min_cluster_size, - min_samples=settings.face_clustering.min_samples, - save_noise=settings.face_clustering.save_noise, - save_full_frames=debug_visualizations, - ) - processor.add_sub_processor(face_clustering_sub) - sub_processors.append(face_clustering_sub) - - if not skip_object_detection: - object_detection_sub = ObjectDetectionSubProcessor( - model_name=settings.object_detection.model_name, - conf_threshold=settings.object_detection.conf_threshold, - ) - processor.add_sub_processor(object_detection_sub) - sub_processors.append(object_detection_sub) - - if not skip_object_visualization: - object_viz_sub = ObjectDetectionVisualizationSubProcessor() - processor.add_sub_processor(object_viz_sub) - sub_processors.append(object_viz_sub) - - try: - return processor.work() - finally: - for sub in sub_processors: - sub.cleanup() - processor.cleanup() - - -def run_validation_step(name, episodes_info_json, **kwargs): # pylint: disable=too-many-locals - from preprocessor.config.config import BASE_OUTPUT_DIR # pylint: disable=import-outside-toplevel - from preprocessor.validation.global_validator import GlobalValidator # pylint: disable=import-outside-toplevel - from preprocessor.validation.validator import Validator # pylint: disable=import-outside-toplevel - - console.print("[bold cyan]Running global validation...[/bold cyan]") - global_validator = GlobalValidator(series_name=name, base_output_dir=BASE_OUTPUT_DIR) - global_result = global_validator.validate() - - validation_reports_dir = BASE_OUTPUT_DIR / settings.output_subdirs.validation_reports - validation_reports_dir.mkdir(parents=True, exist_ok=True) - - from preprocessor.utils.file_utils import atomic_write_json # pylint: disable=import-outside-toplevel - global_report_path = validation_reports_dir / f"{name}_global.json" - atomic_write_json(global_report_path, global_result.to_dict()) - - if global_result.errors: - console.print(f"[red]Global validation errors: 
{len(global_result.errors)}[/red]") - for error in global_result.errors[:5]: - console.print(f" - {error}") - if global_result.warnings: - console.print(f"[yellow]Global validation warnings: {len(global_result.warnings)}[/yellow]") - - input_videos_path = kwargs.get("videos") - if not input_videos_path or not input_videos_path.exists(): - console.print("[yellow]No input videos directory found, skipping episode validation[/yellow]") - return 0 - - seasons = sorted([d for d in input_videos_path.iterdir() if d.is_dir() and d.name.startswith("S")]) - if not seasons: - console.print("[yellow]No seasons found in input videos directory, skipping episode validation[/yellow]") - return 0 - - seasons_with_videos = [] - for season_dir in seasons: - video_files = [] - for ext in SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(list(season_dir.glob(f"**/*{ext}"))) - if video_files: - seasons_with_videos.append(season_dir) - console.print(f"[cyan]Found {len(video_files)} video file(s) in {season_dir.name}[/cyan]") - else: - console.print(f"[yellow]Skipping {season_dir.name}: no video files found[/yellow]") - - if not seasons_with_videos: - console.print("[yellow]No seasons with video files found, skipping episode validation[/yellow]") - return 0 - - for season_dir in seasons_with_videos: - import re # pylint: disable=import-outside-toplevel - - season_name = season_dir.name - match = re.search(r'(\d+)', season_name) - if match: - season_number = int(match.group(1)) - season = f"S{season_number:02d}" - else: - season = season_name - - validator = Validator( - season=season, - series_name=name, - anomaly_threshold=20.0, - base_output_dir=BASE_OUTPUT_DIR, - episodes_info_json=episodes_info_json, - ) - - console.print(f"[cyan]Validating season {season} (from folder: {season_name})...[/cyan]") - exit_code = validator.validate() - - if exit_code != 0: - console.print(f"[red]Validation failed for season {season}[/red]") - return exit_code - - console.print("[green]All validations 
# Imported next to the class because this module does not import
# ``dataclass`` at the top of the file.
from dataclasses import dataclass


@dataclass(frozen=True)
class SearchFilters:
    """Read-only filter values handed to the search command handlers.

    A frozen dataclass replaces the previous hand-written read-only
    properties: the constructor signature (positional order and defaults) and
    the attribute names are unchanged, assignment still raises
    ``AttributeError`` (``FrozenInstanceError`` is a subclass), and instances
    additionally gain a readable ``repr`` and value-based equality.
    """

    # Optional season number; None by default.
    season: Optional[int] = None
    # Optional episode number; None by default.
    episode: Optional[int] = None
    # Optional character name; None by default.
    character: Optional[str] = None
    # Result limit; 20 by default.
    limit: int = 20
# --- preprocessor/cli/search_handler.py -------------------------------------
# Methods of the search handler class. The class header and __init__ precede
# this chunk; the methods read self.__queries, self.__es and self.__json_output.

async def handle_text_semantic_search(self, query: str, filters: SearchFilters) -> str:
    """Run `search_text_semantic` for *query* and return the rendered result."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_text_semantic(
            self.__es, query, filters.season, filters.episode, filters.limit,
        ),
        result_type="text_semantic",
    )


async def handle_text_to_video_search(self, query: str, filters: SearchFilters) -> str:
    """Run `search_text_to_video` for *query* and return the rendered result."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_text_to_video(
            self.__es, query, filters.season, filters.episode, filters.character, filters.limit,
        ),
        result_type="video",
    )


async def handle_image_search(self, image_path: Path, filters: SearchFilters) -> str:
    """Run `search_video_semantic` with the image path (passed as a string)."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_video_semantic(
            self.__es, str(image_path), filters.season, filters.episode, filters.character, filters.limit,
        ),
        result_type="video",
    )


async def handle_emotion_search(self, emotion: str, filters: SearchFilters) -> str:
    """Run `search_by_emotion` for *emotion* and return the rendered result."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_by_emotion(
            self.__es, emotion, filters.season, filters.episode, filters.character, filters.limit,
        ),
        result_type="video",
    )


async def handle_character_search(self, character: str, filters: SearchFilters) -> str:
    """Run `search_by_character` for *character* and return the rendered result."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_by_character(
            self.__es, character, filters.season, filters.episode, filters.limit,
        ),
        result_type="video",
    )


async def handle_object_search(self, object_query: str, filters: SearchFilters) -> str:
    """Run `search_by_object` for *object_query* and return the rendered result."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_by_object(
            self.__es, object_query, filters.season, filters.episode, filters.limit,
        ),
        result_type="video",
    )


async def handle_hash_search(self, hash_value: str, filters: SearchFilters) -> str:
    """Run `search_perceptual_hash`; only `filters.limit` is forwarded."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_perceptual_hash(self.__es, hash_value, filters.limit),
        result_type="video",
    )


async def handle_episode_name_search(self, episode_name: str, filters: SearchFilters) -> str:
    """Run `search_episode_name`; season and limit are the only filters used."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_episode_name(
            self.__es, episode_name, filters.season, filters.limit,
        ),
        result_type="episode_name",
    )


async def handle_episode_name_semantic_search(self, episode_name: str, filters: SearchFilters) -> str:
    """Run `search_episode_name_semantic`; season and limit are the only filters used."""
    return await self.__execute_search(
        search_func=lambda: self.__queries.search_episode_name_semantic(
            self.__es, episode_name, filters.season, filters.limit,
        ),
        result_type="episode_name",
    )


@staticmethod
def compute_perceptual_hash(phash_input: str) -> Optional[str]:
    """Resolve *phash_input* to a perceptual hash string.

    If *phash_input* names an existing regular file, the hash is computed from
    that image via ``HashService``; otherwise the input is returned unchanged
    (it is taken to already be a hash).

    Returns:
        The hash string, or ``None`` when hashing the image yields nothing.
    """
    phash_path = Path(phash_input)
    if not (phash_path.exists() and phash_path.is_file()):
        return phash_input

    click.echo(f"Computing perceptual hash from image: {phash_input}", err=True)
    hash_svc = HashService()
    try:
        hash_value = hash_svc.get_perceptual_hash(str(phash_path))
    finally:
        # Always release the service: the failure path (and any exception)
        # previously skipped cleanup.
        hash_svc.cleanup()

    if not hash_value:
        click.echo("Failed to compute hash from image", err=True)
        return None

    click.echo(f"Computed hash: {hash_value}", err=True)
    return hash_value


async def __execute_search(
    self,
    search_func: Callable[..., Awaitable[Dict[str, Any]]],
    result_type: str,
    result_key: str = "hits",
) -> str:
    """Await *search_func* and render its result.

    With JSON output enabled, dumps ``result[result_key]`` (or the whole result
    when that key is absent); otherwise formats the result for the console.
    """
    result = await search_func()

    if self.__json_output:
        return json.dumps(result.get(result_key, result), indent=2)

    return self.__format_console_output(result, result_type)


@staticmethod
def __format_console_output(result: Dict[str, Any], result_type: str) -> str:
    """Capture what ``ResultFormatter.print_results`` echoes and return it as text.

    NOTE(review): this temporarily replaces the module attribute ``click.echo``,
    which is process-global; concurrent callers would see the patched function.
    """
    captured: List[str] = []

    def capture_echo(message: Optional[str] = None, **_kwargs: Any) -> None:
        # Every non-None message gets a trailing newline; echo kwargs are ignored.
        if message is not None:
            captured.append(str(message) + '\n')

    original_echo = click.echo
    click.echo = capture_echo
    try:
        ResultFormatter.print_results(result, result_type)
    finally:
        click.echo = original_echo

    return ''.join(captured).rstrip()


# --- preprocessor/cli/search_params.py --------------------------------------

@dataclass(frozen=True)
class SearchQueryParams:
    """Immutable bundle of the optional search inputs, one field per search kind."""

    text: Optional[str] = None
    text_semantic: Optional[str] = None
    text_to_video: Optional[str] = None
    image: Optional[Path] = None
    phash: Optional[str] = None
    character: Optional[str] = None
    emotion: Optional[str] = None
    object_query: Optional[str] = None
    episode_name: Optional[str] = None
    episode_name_semantic: Optional[str] = None

    def has_search_criteria(self) -> bool:
        """Return True when at least one search field is truthy."""
        return any([
            self.text,
            self.text_semantic,
            self.text_to_video,
            self.image,
            self.phash,
            self.character,
            self.emotion,
            self.object_query,
            self.episode_name,
            self.episode_name_semantic,
        ])


@dataclass(frozen=True)
class SearchActionParams:
    """Immutable flags for the non-search actions (listings and stats)."""

    list_chars_flag: bool = False
    list_objects_flag: bool = False
    stats: bool = False

    def has_action(self) -> bool:
        """Return True when any action flag is set."""
        return any([
            self.list_chars_flag,
            self.list_objects_flag,
            self.stats,
        ])


@dataclass(frozen=True)
class SearchConfig:
    """Everything one search invocation needs: series, query, filters, actions, output."""

    series: str
    query: SearchQueryParams
    filters: SearchFilters
    actions: SearchActionParams
    json_output: bool = False
    host: str = "http://localhost:9200"

    def has_any_operation(self) -> bool:
        """Return True when there is a search criterion or an action to perform."""
        return self.query.has_search_criteria() or self.actions.has_action()


# --- preprocessor/cli/skip_list_builder.py ----------------------------------

class SkipListBuilder:
    """Builds the list of pipeline steps to skip from CLI flags and series config."""

    @staticmethod
    def build(
        cli_skip: Tuple[str, ...],
        series_config: SeriesConfig,
        logger: ErrorHandlingLogger,
    ) -> List[str]:
        """Merge CLI skips with the series' configured skips.

        Config skips are appended only when ``pipeline_mode`` is ``"selective"``
        and ``skip_steps`` is non-empty. Duplicates are removed while keeping
        first-occurrence order, so the result is deterministic (a plain
        ``set`` round-trip made the order arbitrary).
        """
        skip_list = list(cli_skip)

        if series_config.pipeline_mode == "selective" and series_config.skip_steps:
            logger.info(
                f"Selective mode: auto-skipping {', '.join(series_config.skip_steps)}",
            )
            skip_list.extend(series_config.skip_steps)

        return list(dict.fromkeys(skip_list))
@dataclass(frozen=True)
class ElasticDocumentSubdirs:
    """Immutable default subdirectory names, one string field per document kind."""

    episode_names: str = 'episode_names'
    full_episode_embeddings: str = 'full_episode_embeddings'
    sound_event_embeddings: str = 'sound_event_embeddings'
    sound_events: str = 'sound_events'
    text_embeddings: str = 'text_embeddings'
    text_segments: str = 'text_segments'
    text_statistics: str = 'text_statistics'
    video_frames: str = 'video_frames'


@dataclass(frozen=True)
class TranscriptionSubdirs:
    """Immutable default subdirectory names for transcription outputs."""

    clean: str = 'clean'
    raw: str = 'raw'
    sound_events: str = 'sound_events'


@dataclass(frozen=True)
class OutputSubdirs:  # pylint: disable=too-many-instance-attributes # Configuration dataclass - all subdirs needed
    """Immutable relative subdirectory names for each kind of pipeline output.

    NOTE(review): several values here differ from the ``OUTPUT_SUBDIR``
    constants on the settings classes in this module (e.g. ``scenes`` is
    'scene_detections' vs 'scene_timestamps', ``frames`` is 'frames' vs
    'exported_frames', ``image_hashes`` is 'hashes' vs 'image_hashes') --
    confirm which one callers are expected to use.
    """

    archives: str = 'archives'
    character_detections: str = 'detections/characters'
    character_visualizations: str = 'detections/characters/visualizations'
    # Nested groups use default_factory so each instance gets its own object.
    elastic_document_subdirs: ElasticDocumentSubdirs = field(default_factory=ElasticDocumentSubdirs)
    elastic_documents: str = 'elastic_documents'
    embeddings: str = 'embeddings'
    face_clusters: str = 'clusters/faces'
    frames: str = 'frames'
    image_hashes: str = 'hashes'
    object_detections: str = 'detections/objects'
    object_visualizations: str = 'detections/objects/visualizations'
    scenes: str = 'scene_detections'
    transcription_subdirs: TranscriptionSubdirs = field(default_factory=TranscriptionSubdirs)
    transcriptions: str = 'transcriptions'
    validation_reports: str = 'validation_reports'
    video: str = 'transcoded_videos'


@dataclass(frozen=True)
class BaseAPISettings:
    """Base for settings that carry an optional API key wrapped in ``SecretStr``."""

    # repr=False keeps the key object out of the generated dataclass repr.
    _api_key: Optional[SecretStr] = field(default=None, repr=False)

    @property
    def api_key(self) -> Optional[str]:
        """Return the plain key string, or None when no key is set."""
        return self._api_key.get_secret_value() if self._api_key else None


@dataclass(frozen=True)
class TranscodeSettings(OutputDirMixin):
    """Immutable transcoding defaults; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'transcoded_videos'

    audio_bitrate_kbps: int = 128
    codec: str = 'h264_nvenc'
    gop_size: float = 0.5
    target_duration_seconds: float = 100.0
    target_file_size_mb: float = 50.0
@dataclass(frozen=True)
class SceneDetectionSettings(OutputDirMixin):
    """Immutable scene-detection defaults; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'scene_timestamps'

    min_scene_len: int = 10
    threshold: float = 0.5


@dataclass(frozen=True)
class SceneChangesSettings:
    """Options for the scene-changes keyframe strategy."""

    frames_per_scene: int = 1


@dataclass(frozen=True)
class KeyframeExtractionSettings:
    """Keyframe extraction strategy name plus its strategy-specific options."""

    scene_changes: SceneChangesSettings = field(default_factory=SceneChangesSettings)
    strategy: str = 'scene_changes'


@dataclass(frozen=True)
class FrameExportSettings(OutputDirMixin):
    """Immutable frame-export defaults; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'exported_frames'

    resolution: Resolution = Resolution.R1080P


@dataclass(frozen=True)
class TranscriptionSettings(OutputDirMixin):
    """Immutable transcription defaults; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'transcriptions'

    device: str = 'cuda'
    language: str = 'Polish'
    model: str = 'large-v3-turbo'


@dataclass(frozen=True)
class WhisperSettings:
    """Whisper model selection."""

    model: str = 'large-v3-turbo'

    @classmethod
    def from_env(cls) -> 'WhisperSettings':
        """Build settings from ``WHISPER_MODEL``, falling back to 'large-v3-turbo'."""
        return cls(model=os.getenv('WHISPER_MODEL', 'large-v3-turbo'))


@dataclass(frozen=True)
class TextChunkingSettings:
    """Immutable text chunking parameters."""

    segments_per_embedding: int = 5
    text_chunk_overlap: int = 3
    text_sentences_per_chunk: int = 8


@dataclass(frozen=True)
class ElevenLabsSettings(BaseAPISettings):
    """ElevenLabs request defaults plus the API key inherited from ``BaseAPISettings``."""

    diarize: bool = True
    language_code: str = 'pol'
    max_attempts: int = 60
    model_id: str = 'scribe_v2'
    polling_interval: int = 20

    @classmethod
    def from_env(cls) -> 'ElevenLabsSettings':
        """Build settings with the key from ``ELEVEN_API_KEY``.

        An unset or empty variable yields no key. The variable is read once
        so the emptiness check and the stored value cannot disagree.
        """
        raw_key = os.getenv('ELEVEN_API_KEY')
        return cls(_api_key=SecretStr(raw_key) if raw_key else None)


@dataclass(frozen=True)
class EmbeddingModelSettings:
    """Immutable embedding-model parameters (model identity and runtime limits)."""

    embedding_dim: int = 4096
    enable_chunked_prefill: bool = True
    enforce_eager: bool = False
    gpu_memory_utilization: float = 0.85
    image_placeholder: str = '<|vision_start|><|image_pad|><|vision_end|>'
    max_model_len: int = 8192
    max_num_batched_tokens: int = 8192
    model_name: str = 'Qwen/Qwen3-VL-Embedding-8B'
    model_revision: str = 'main'
    tensor_parallel_size: int = 1


@dataclass(frozen=True)
class EmbeddingSettings(OutputDirMixin):
    """Immutable embedding batch settings; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'embeddings'

    batch_size: int = 32
    generate_full_episode_embedding: bool = True
    prefetch_chunks: int = 2
    progress_sub_batch_size: int = 100
    text_batch_size: int = 64
@dataclass(frozen=True)
class FaceRecognitionSettings:
    """Immutable face-recognition defaults."""

    detection_size: Tuple[int, int] = (1280, 1280)
    model_name: str = 'buffalo_l'


@dataclass(frozen=True)
class FaceClusteringSettings(OutputDirMixin):
    """Immutable face-clustering thresholds; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'face_clusters'

    min_cluster_size: int = 5
    min_face_px: int = 40
    min_samples: int = 3
    min_det_score: float = 0.4
    save_noise: bool = True


@dataclass(frozen=True)
class EmotionDetectionSettings:
    """Emotion model selection."""

    model_name: str = 'enet_b2_8'

    @classmethod
    def from_env(cls) -> 'EmotionDetectionSettings':
        """Build settings from ``EMOTION_MODEL_NAME``, falling back to 'enet_b2_8'."""
        model_name = os.getenv('EMOTION_MODEL_NAME', 'enet_b2_8')
        return cls(model_name=model_name)


@dataclass(frozen=True)
class CharacterSettings(OutputDirMixin):
    """Immutable character-matching thresholds; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'characters'

    face_detection_threshold: float = 0.2
    frame_detection_threshold: float = 0.55
    normalized_face_size: Tuple[int, int] = (112, 112)
    reference_images_per_character: int = 3
    reference_matching_threshold: float = 0.5
@dataclass(frozen=True)
class ObjectDetectionSettings(OutputDirMixin):
    """Immutable object-detection defaults; output location comes from ``OutputDirMixin``."""

    OUTPUT_SUBDIR: ClassVar[str] = 'object_detections'

    conf_threshold: float = 0.3
    model_name: str = 'ustc-community/dfine-xlarge-obj2coco'


@dataclass(frozen=True)
class ImageHashSettings(OutputDirMixin):
    """Carries only the output subdirectory for image hashes."""

    OUTPUT_SUBDIR: ClassVar[str] = 'image_hashes'


@dataclass(frozen=True)
class ImageScraperSettings(BaseAPISettings):
    """Image scraper limits and delays plus the API key inherited from ``BaseAPISettings``."""

    image_download_timeout: int = 8000
    max_results_to_scrape: int = 100
    min_image_height: int = 60
    min_image_width: int = 60
    page_navigation_timeout: int = 30000
    request_delay_max: float = 5.0
    request_delay_min: float = 2.0
    retry_attempts: int = 3
    retry_delay: float = 5.0

    @property
    def serpapi_key(self) -> Optional[str]:
        """Alias for ``api_key``."""
        return self.api_key

    @classmethod
    def from_env(cls) -> 'ImageScraperSettings':
        """Build settings with the key from ``SERPAPI_API_KEY``.

        An unset or empty variable yields no key; the variable is read once.
        """
        raw_key = os.getenv('SERPAPI_API_KEY')
        return cls(_api_key=SecretStr(raw_key) if raw_key else None)


@dataclass(frozen=True)
class ScraperSettings(OutputDirMixin):
    """Carries only the output subdirectory for scraped pages."""

    OUTPUT_SUBDIR: ClassVar[str] = 'scraped_pages'


@dataclass(frozen=True)
class ElasticsearchSettings:
    """Elasticsearch connection values; the password is excluded from the repr."""

    host: str = ''
    password: str = field(default='', repr=False)
    user: str = ''

    @classmethod
    def from_env(cls) -> 'ElasticsearchSettings':
        """Build settings from ``ES_HOST``, ``ES_USER`` and ``ES_PASS`` (empty when unset)."""
        return cls(
            host=os.getenv('ES_HOST', ''),
            user=os.getenv('ES_USER', ''),
            password=os.getenv('ES_PASS', ''),
        )


@dataclass(frozen=True)
class GeminiSettings(BaseAPISettings):
    """Gemini settings; currently only the inherited API key."""

    @classmethod
    def from_env(cls) -> 'GeminiSettings':
        """Build settings with the key from ``GEMINI_API_KEY``.

        An unset or empty variable yields no key; the variable is read once.
        """
        raw_key = os.getenv('GEMINI_API_KEY')
        return cls(_api_key=SecretStr(raw_key) if raw_key else None)
emotion_detection: EmotionDetectionSettings + face_clustering: FaceClusteringSettings + face_recognition: FaceRecognitionSettings frame_export: FrameExportSettings + gemini: GeminiSettings image_hash: ImageHashSettings - scraper: ScraperSettings - character: CharacterSettings - object_detection: ObjectDetectionSettings - face_recognition: FaceRecognitionSettings - face_clustering: FaceClusteringSettings - emotion_detection: EmotionDetectionSettings image_scraper: ImageScraperSettings - elevenlabs: ElevenLabsSettings - elasticsearch: ElasticsearchSettings - gemini: GeminiSettings + keyframe_extraction: KeyframeExtractionSettings + object_detection: ObjectDetectionSettings + output_subdirs: OutputSubdirs + scene_detection: SceneDetectionSettings + scraper: ScraperSettings + text_chunking: TextChunkingSettings transcode: TranscodeSettings transcription: TranscriptionSettings + whisper: WhisperSettings @classmethod - def _from_env(cls) -> "Settings": + def from_env(cls) -> 'Settings': return cls( output_subdirs=OutputSubdirs(), - whisper=WhisperSettings._from_env(), + whisper=WhisperSettings.from_env(), text_chunking=TextChunkingSettings(), embedding_model=EmbeddingModelSettings(), embedding=EmbeddingSettings(), @@ -374,92 +312,11 @@ def _from_env(cls) -> "Settings": object_detection=ObjectDetectionSettings(), face_recognition=FaceRecognitionSettings(), face_clustering=FaceClusteringSettings(), - emotion_detection=EmotionDetectionSettings._from_env(), - image_scraper=ImageScraperSettings._from_env(), - elevenlabs=ElevenLabsSettings._from_env(), - elasticsearch=ElasticsearchSettings._from_env(), - gemini=GeminiSettings._from_env(), + emotion_detection=EmotionDetectionSettings.from_env(), + image_scraper=ImageScraperSettings.from_env(), + elevenlabs=ElevenLabsSettings.from_env(), + elasticsearch=ElasticsearchSettings.from_env(), + gemini=GeminiSettings.from_env(), transcode=TranscodeSettings(), transcription=TranscriptionSettings(), ) - - -# 
============================================================================ -# PIPELINE CONFIGS -# ============================================================================ - -@dataclass -class TranscodeConfig: - videos: Path - transcoded_videos: Path - resolution: Resolution - codec: str - gop_size: float - episodes_info_json: Optional[Path] = None - video_bitrate_mbps: Optional[float] = None - minrate_mbps: Optional[float] = None - maxrate_mbps: Optional[float] = None - bufsize_mbps: Optional[float] = None - audio_bitrate_kbps: int = 128 - - def to_dict(self) -> Dict[str, Any]: - return { - "videos": self.videos, - "transcoded_videos": self.transcoded_videos, - "resolution": self.resolution, - "codec": self.codec, - "video_bitrate_mbps": self.video_bitrate_mbps, - "minrate_mbps": self.minrate_mbps, - "maxrate_mbps": self.maxrate_mbps, - "bufsize_mbps": self.bufsize_mbps, - "audio_bitrate_kbps": self.audio_bitrate_kbps, - "gop_size": self.gop_size, - "episodes_info_json": self.episodes_info_json, - } - - -@dataclass -class TranscriptionConfig: - videos: Path - episodes_info_json: Path - transcription_jsons: Path - model: str - language: str - device: str - name: str - extra_json_keys_to_remove: List[str] = field(default_factory=list) - - def to_dict(self) -> Dict[str, Any]: - return { - "videos": self.videos, - "episodes_info_json": self.episodes_info_json, - "transcription_jsons": self.transcription_jsons, - "model": self.model, - "language": self.language, - "device": self.device, - "extra_json_keys_to_remove": self.extra_json_keys_to_remove, - "name": self.name, - } - - -@dataclass -class IndexConfig: - name: str - transcription_jsons: Path - dry_run: bool = False - append: bool = False - - def to_dict(self) -> Dict[str, Any]: - return { - "name": self.name, - "transcription_jsons": str(self.transcription_jsons), - "dry_run": self.dry_run, - "append": self.append, - } - - -# ============================================================================ -# 
# --- preprocessor/config/constants.py ---------------------------------------

SUPPORTED_VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mkv', '.mov', '.flv', '.wmv', '.webm')
DEFAULT_VIDEO_EXTENSION = '.mp4'

# Logical name -> file-name fragment.
# NOTE(review): most values start with '_' but a few do not
# ('embeddings_full', 'embeddings_sound', 'episode_name', 'detections') --
# confirm that is intentional.
FILE_SUFFIXES = {
    'segmented': '_segmented',
    'text_segments': '_text_segments',
    'simple': '_simple',
    'clean': '_clean_transcription',
    'clean_alt': '_clean',
    'scenes': '_scenes',
    'sound_events': '_sound_events',
    'text_stats': '_text_stats',
    'embeddings_text': '_embeddings_text',
    'embeddings_video': '_embeddings_video',
    'embeddings_full': 'embeddings_full_episode',
    'embeddings_sound': 'embeddings_sound_events',
    'episode_name': 'episode_name_embedding',
    'image_hashes': '_image_hashes',
    'detections': 'detections',
    'character_detections': '_character_detections',
}

# Logical name -> extension including the leading dot.
FILE_EXTENSIONS = {
    'json': '.json',
    'jsonl': '.jsonl',
    'txt': '.txt',
    'srt': '.srt',
    'mp4': '.mp4',
    'jpg': '.jpg',
}

# Logical name -> complete output file name.
OUTPUT_FILE_NAMES = {
    'detections': 'detections.json',
    'episode_embedding': 'episode_name_embedding.json',
    'embeddings_text': 'embeddings_text.json',
}

OUTPUT_FILE_PATTERNS = {
    'frame': '*_frame_*.jpg',
    'scenes_suffix': '_scenes.json',
}


class EpisodesDataKeys:
    """String keys used in the episodes data structure."""

    EPISODES = 'episodes'
    SEASONS = 'seasons'
    SEASON_NUMBER = 'season_number'


class EpisodeMetadataKeys:
    """String keys used in a single episode's metadata."""

    EPISODE_NUMBER = 'episode_in_season'
    PREMIERE_DATE = 'premiere_date'
    TITLE = 'title'
    VIEWERSHIP = 'viewership'


class FfprobeKeys:
    """String keys named after ffprobe output sections."""

    FORMAT = 'format'
    STREAMS = 'streams'


# Pairs of strings; the second element is singular for some entries
# (e.g. 'episode_name'). NOTE(review): presumably (subdirectory, document
# type) -- confirm against the consumer.
ELASTIC_DOC_TYPES = [
    ("text_segments", "text_segments"),
    ("sound_events", "sound_events"),
    ("text_embeddings", "text_embeddings"),
    ("video_frames", "video_frames"),
    ("episode_names", "episode_name"),
    ("text_statistics", "text_statistics"),
    ("full_episode_embeddings", "full_episode_embedding"),
    ("sound_event_embeddings", "sound_event_embeddings"),
]


# --- preprocessor/config/enums.py -------------------------------------------

class KeyframeStrategy(str, Enum):
    """Keyframe extraction strategies (string-valued)."""

    SCENE_CHANGES = 'scene_changes'


class FrameType(str, Enum):
    """Fixed frame type labels; mid-scene labels are built by ``scene_mid``."""

    SCENE_END = 'scene_end'
    SCENE_SINGLE = 'scene_single'
    SCENE_START = 'scene_start'

    @staticmethod
    def scene_mid(index: int) -> str:
        """Return the plain-string label ``scene_mid_<index>`` (not an enum member)."""
        return f'scene_mid_{index}'


class ScraperMethod(str, Enum):
    """Scraper method identifiers (string-valued)."""

    CLIPBOARD = 'clipboard'
    CRAWL4AI = 'crawl4ai'


class ParserMode(str, Enum):
    """Parser mode identifiers (string-valued)."""

    NORMAL = 'normal'
    PREMIUM = 'premium'


class TranscriptionFormat(str, Enum):
    """Transcription format identifiers (string-valued)."""

    ELEVENLABS = '11labs'
    ELEVENLABS_SEGMENTED = '11labs_segmented'


class Device(str, Enum):
    """Compute device identifiers (string-valued)."""

    CPU = 'cpu'
    CUDA = 'cuda'


# --- preprocessor/config/mixins.py ------------------------------------------

class OutputDirMixin:
    """Adds ``get_output_dir`` to classes that declare an ``OUTPUT_SUBDIR`` string."""

    OUTPUT_SUBDIR: ClassVar[str]

    @classmethod
    def get_output_dir(cls, series_name: str) -> Path:
        """Return ``get_base_output_dir(series_name) / OUTPUT_SUBDIR``.

        Raises:
            NotImplementedError: if the class does not define ``OUTPUT_SUBDIR``.
        """
        # The bare annotation above creates no attribute, so hasattr is False
        # until a subclass assigns a value.
        if not hasattr(cls, 'OUTPUT_SUBDIR'):
            raise NotImplementedError(
                f"{cls.__name__} must define OUTPUT_SUBDIR class variable",
            )
        return get_base_output_dir(series_name) / cls.OUTPUT_SUBDIR


# --- preprocessor/config/output_paths.py ------------------------------------

def get_base_output_dir(series_name: Optional[str] = None) -> Path:
    """Return the output root, optionally scoped to a lower-cased series name.

    The root is '/app/output_data' when ``Environment.is_docker()`` is true and
    the relative 'preprocessor/output_data' otherwise.
    """
    if Environment.is_docker():
        base = Path('/app/output_data')
    else:
        base = Path('preprocessor/output_data')

    if series_name:
        return base / series_name.lower()
    return base


def get_output_path(relative_path: str, series_name: Optional[str] = None) -> Path:
    """Join *relative_path* onto ``get_base_output_dir(series_name)``."""
    return get_base_output_dir(series_name) / relative_path


# --- preprocessor/config/prompts/common_schemas.py --------------------------

def episode_metadata_schema() -> str:
    """Return the episode metadata schema as a brace-delimited text snippet."""
    return (
        '{\n'
        ' "title": str,\n'
        ' "description": str,\n'
        ' "summary": str,\n'
        ' "season": int or null,\n'
        ' "episode_number": int or null\n'
        '}'
    )
# --- preprocessor/config/prompts/extract_all_seasons_system.py --------------

def get() -> str:
    """Return the system prompt for extracting all seasons/episodes from wiki pages.

    The prompt lists extraction rules, explains the two episode numbers
    (per-season and overall) and ends with a worked markdown -> JSON example.
    """
    return (
        'You are extracting episode data from TV series wiki pages.\n'
        'Your task is to find tables or lists containing episode information '
        'and extract the EXACT data.\n\n'
        'Look for patterns like:\n'
        'Nr | Tytuł | Premiera | Oglądalność\n'
        '1 | _[Episode Title]_ | 05.03.2006 | 4 396 564\n\n'
        'CRITICAL RULES:\n'
        '1. Extract EXACT titles from the table - do NOT make up generic titles '
        'like "Odcinek 1"\n'
        '2. Extract EXACT premiere dates as shown - do NOT invent dates\n'
        # The example must actually contain two "/"-separated dates; it
        # previously showed a single date, which illustrated nothing.
        '3. If premiere date contains multiple dates separated by "/" (e.g., '
        '"31.12.2008 / 01.01.2009"), extract ONLY the FIRST date: "31.12.2008"\n'
        '4. Extract EXACT viewership numbers - remove spaces: "4 396 564" -> '
        '4396564\n'
        '5. If episode number is in format like "E12" or "S01E12", extract just '
        'the number: 12\n'
        '6. Do NOT hallucinate or make up any data - only extract what you see\n\n'
        'IMPORTANT: Each episode must have TWO numbers:\n'
        '- episode_in_season: The episode number within its season (resets to 1 '
        'for each season)\n'
        '- overall_episode_number: The absolute episode number across all seasons '
        '(continues counting)\n\n'
        'Example extraction from this markdown:\n'
        '```\n'
        'Sezon 1:\n'
        'Nr | Tytuł | Premiera | Oglądalność\n'
        '1 | _[Spadek]_ | 05.03.2006 | 4 396 564\n'
        '2 | _[Goście z zaświatów]_ | 12.03.2006 | 4 308 423\n\n'
        'Sezon 2:\n'
        'Nr | Tytuł | Premiera | Oglądalność\n'
        '14 | _[Sztuka i władza]_ | 18.03.2007 | 6 993 951\n'
        '15 | _[Gmina to ja]_ | 25.03.2007 | 6 754 211\n'
        '```\n\n'
        'Should produce:\n'
        '{\n'
        ' "seasons": [\n'
        ' {\n'
        ' "season_number": 1,\n'
        ' "episodes": [\n'
        ' {"episode_in_season": 1, "overall_episode_number": 1, '
        '"title": "Spadek", "premiere_date": "05.03.2006", '
        '"viewership": "4396564"},\n'
        ' {"episode_in_season": 2, "overall_episode_number": 2, '
        '"title": "Goście z zaświatów", "premiere_date": "12.03.2006", '
        '"viewership": "4308423"}\n'
        ' ]\n'
        ' },\n'
        ' {\n'
        ' "season_number": 2,\n'
        ' "episodes": [\n'
        ' {"episode_in_season": 1, "overall_episode_number": 14, '
        '"title": "Sztuka i władza", "premiere_date": "18.03.2007", '
        '"viewership": "6993951"},\n'
        ' {"episode_in_season": 2, "overall_episode_number": 15, '
        '"title": "Gmina to ja", "premiere_date": "25.03.2007", '
        '"viewership": "6754211"}\n'
        ' ]\n'
        ' }\n'
        ' ]\n'
        '}\n\n'
        'Return ONLY valid JSON. Extract ONLY what you see, do NOT invent data.'
    )
**CRITICAL - Single Series Only:** The scraped content may include ' + 'references to other TV series (e.g., in footers, sidebars, "See also" ' + 'sections, or related links). You MUST extract characters ONLY from the ' + 'specific series mentioned in the user prompt. IGNORE all characters from ' + 'any other series.\n' + '4. **Multi-Source Deduplication:** When processing multiple sources:\n' + ' - Merge character lists from all sources\n' + ' - Remove duplicates (same character mentioned in multiple sources)\n' + ' - If a character has different name variants across sources, use the ' + 'most complete/formal version\n' + ' - Combine information to get the most accurate character list\n' + '5. **Naming:** Use the Polish name if the series is Polish. If a ' + 'character has multiple aliases, use the most formal/common one.\n\n' + '6. **Text Cleaning (CRITICAL):**\n' + ' - Remove ALL special characters that are not letters (e.g., quotes ' + '`"`, brackets `()`, hyphens `-` inside titles, etc.).\n' + ' - Remove actor names typically found in brackets.\n' + ' - The final output string must contain **ONLY letters (including ' + 'Polish diacritics: ą, ć, ę, ł, ń, ó, ś, ź, ż) and spaces**.\n' + ' - Do not leave trailing periods after expanding titles.\n\n' + '7. **ABBREVIATION EXPANSION (Mandatory):**\n' + ' You MUST expand ALL abbreviations to their full Polish forms.\n' + ' **IMPORTANT:** Process compound abbreviations (2+ words) BEFORE ' + 'single word abbreviations.\n\n' + ' **Ecclesiastical (Religious):**\n' + ' - ks. prob. / ks.prob. -> Ksiądz Proboszcz\n' + ' - ks. wik. / ks.wik. -> Ksiądz Wikariusz\n' + ' - ks. kan. -> Ksiądz Kanonik\n' + ' - ks. bp -> Ksiądz Biskup\n' + ' - ks. kard. -> Ksiądz Kardynał\n' + ' - ks. -> Ksiądz\n' + ' - o. -> Ojciec (e.g., Ojciec Mateusz)\n' + ' - s. -> Siostra\n' + ' - br. -> Brat\n' + ' - bp -> Biskup\n' + ' - abp -> Arcybiskup\n' + ' - kard. -> Kardynał\n' + ' - pap. -> Papież\n' + ' - wik. -> Wikariusz\n' + ' - prob. 
-> Proboszcz\n\n' + ' **Academic & Medical:**\n' + ' - dr hab. -> Doktor habilitowany\n' + ' - prof. nadzw. -> Profesor nadzwyczajny\n' + ' - prof. zw. -> Profesor zwyczajny\n' + ' - prof. -> Profesor\n' + ' - dr -> Doktor\n' + ' - mgr -> Magister\n' + ' - inż. -> Inżynier\n' + ' - lek. med. / lek. -> Lekarz\n' + ' - doc. -> Docent\n' + ' - piel. -> Pielęgniarka / Pielęgniarz\n\n' + ' **Military, Police & Services:**\n' + ' - nadkom. -> Nadkomisarz\n' + ' - podkom. -> Podkomisarz\n' + ' - kom. -> Komisarz\n' + ' - asp. sztab. -> Aspirant sztabowy\n' + ' - asp. -> Aspirant\n' + ' - st. post. -> Starszy posterunkowy\n' + ' - post. -> Posterunkowy\n' + ' - sierż. -> Sierżant\n' + ' - gen. -> Generał\n' + ' - płk -> Pułkownik\n' + ' - ppłk -> Podpułkownik\n' + ' - mjr -> Major\n' + ' - kpt. -> Kapitan\n' + ' - por. -> Porucznik\n' + ' - ppor. -> Podporucznik\n\n' + ' **Legal, Political & Administrative:**\n' + ' - mec. -> Mecenas\n' + ' - prok. -> Prokurator\n' + ' - sędz. -> Sędzia\n' + ' - dyr. -> Dyrektor\n' + ' - prez. -> Prezydent\n' + ' - min. -> Minister\n' + ' - sen. -> Senator\n' + ' - pos. -> Poseł\n' + ' - przew. -> Przewodniczący\n' + ' - z-ca -> Zastępca\n\n' + ' **Other:**\n' + ' - red. -> Redaktor\n\n' + ' *If you encounter an abbreviation not listed here, expand it to its ' + 'correct full Polish form based on context.*\n\n' + '### EXAMPLE EXTRACTION:\n\n' + 'Source 1:\n' + '```\n' + 'Główni bohaterowie:\n' + '- ks. prob. Krzysztof Robert (Artur Żmijewski)\n' + '- Lucy Wilska (Ilona Ostrowska)\n' + '```\n\n' + 'Source 2:\n' + '```\n' + 'Postacie:\n' + '- Ksiądz Proboszcz Krzysztof Robert\n' + '- dr Cezary Pazura\n' + '- kom. Paweł Kozioł\n' + '```\n\n' + 'Should produce (deduplicated and cleaned):\n' + '{\n' + ' "characters": [\n' + ' {"name": "Ksiądz Proboszcz Krzysztof Robert"},\n' + ' {"name": "Lucy Wilska"},\n' + ' {"name": "Doktor Cezary Pazura"},\n' + ' {"name": "Komisarz Paweł Kozioł"}\n' + ' ]\n' + '}\n\n' + 'Return ONLY valid JSON.' 
def get() -> str:
    """Return the user-prompt template for character extraction.

    The template carries three placeholders: ``{series_name}`` (used three
    times), ``{num_sources}`` and ``{combined_content}``. Filling them is left
    to the caller (presumably via ``str.format`` -- confirm against the caller).
    """
    template_lines = [
        'Extract ALL characters from the TV series "{series_name}" from ALL {num_sources} source(s) below.',
        '',
        '**CRITICAL:** Multiple sources may have overlapping or complementary character lists.',
        '- Merge and deduplicate characters across all sources',
        '- Extract ONLY characters from "{series_name}" (ignore other series mentioned in footers/sidebars)',
        '- Return a single unified list',
        '',
        'Here is the content from all sources combined:',
        '',
        '{combined_content}',
        '',
        '---',
        'Extract ALL characters from "{series_name}" found in the content above.',
    ]
    # Empty entries produce the blank lines between paragraphs.
    return '\n'.join(template_lines)
def get() -> str:
    """Return the system prompt for extracting the episodes of a single page.

    The prompt lists the per-episode fields to extract and ends with the
    expected JSON schema.

    Two self-consistency fixes relative to the previous text:
    - the multi-date example now really contains two dates separated by "/",
      so the "keep only the first date" instruction is illustrated;
    - the schema declares ``viewership`` as ``str or null`` because the field
      description explicitly asks for null when the value is unavailable.

    The text contains literal braces, so it must not be passed through
    ``str.format``.
    """
    return (
        'You are extracting episode data from a TV series page.\n'
        'Extract ALL episodes you can find on the page. Look for tables, lists, '
        'or any structured data.\n\n'
        'For each episode extract:\n'
        '- episode_in_season: The episode number within its season (1, 2, 3... '
        'resets each season)\n'
        '- overall_episode_number: The absolute episode number across all seasons '
        '(continues counting)\n'
        '- title: string (clean title without markdown formatting)\n'
        # The example must actually contain the "/" separator it talks about.
        '- premiere_date: string (date format as found on page; if multiple dates '
        'separated by "/" like "31.12.2008 / 01.01.2009", extract ONLY the FIRST date: '
        '"31.12.2008")\n'
        '- viewership: string (remove spaces from numbers like "4 396 564" -> '
        '"4396564", use null if not available)\n\n'
        'The season number should be determined from the page content or URL.\n\n'
        'Return ONLY valid JSON matching this schema:\n'
        '{\n'
        ' "season_number": int,\n'
        ' "episodes": [\n'
        ' {\n'
        ' "episode_in_season": int,\n'
        ' "overall_episode_number": int,\n'
        ' "title": str,\n'
        ' "premiere_date": str,\n'
        # Nullable, matching "use null if not available" above.
        ' "viewership": str or null\n'
        ' }\n'
        ' ]\n'
        '}'
    )
def get() -> str:
    """Return the user-prompt template for merging one episode's metadata.

    The template carries two placeholders, ``{num_sources}`` and
    ``{combined_text}``; filling them is left to the caller (presumably via
    ``str.format`` -- confirm against the caller).
    """
    paragraphs = [
        'Merge the following episode metadata from {num_sources} sources:',
        '{combined_text}',
        'Create a single, unified metadata entry.',
    ]
    # Paragraphs are separated by exactly one blank line.
    return '\n\n'.join(paragraphs)
diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py new file mode 100644 index 000000000..dfa224881 --- /dev/null +++ b/preprocessor/config/series_config.py @@ -0,0 +1,229 @@ +from dataclasses import ( + dataclass, + field, +) +import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + + +def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + result: Dict[str, Any] = base.copy() + for key, value in override.items(): + if key.startswith('_'): + continue + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = _deep_merge(result[key], value) + else: + result[key] = value + return result + + +@dataclass +class EpisodeScrapingConfig: + parser_mode: str + urls: List[str] + + +@dataclass +class CharacterScrapingConfig: + parser_mode: str + urls: List[str] + + +@dataclass +class CharacterReferencesConfig: + images_per_character: int + search_engine: str + search_query_template: str + source: str = 'clusters' + + +@dataclass +class ScrapingConfig: + character_references: CharacterReferencesConfig + characters: CharacterScrapingConfig + episodes: EpisodeScrapingConfig + + +@dataclass +class TranscriptionProcessingConfig: + device: str + language: str + mode: str + model: str + + +@dataclass +class TranscriptionImportProcessingConfig: + format_type: str + season_remap: Dict[str, int] + source_dir: str + + +@dataclass +class TranscodeProcessingConfig: + bitrate_boost_ratio: float + force_deinterlace: bool + keyframe_interval_seconds: float + max_bitrate_duration_seconds: float + max_bitrate_file_size_mb: float + min_bitrate_mbps: float + resolution: str + + +@dataclass +class SceneDetectionProcessingConfig: + min_scene_len: int + threshold: float + + +@dataclass +class FrameExportProcessingConfig: + frames_per_scene: int + + +@dataclass +class ProcessingConfig: + frame_export: FrameExportProcessingConfig + scene_detection: 
@dataclass
class SeriesConfig:
    """Resolved configuration of one series, built from JSON files.

    ``load`` reads ``preprocessor/series_configs/<series_name>.json``, layers it
    over ``defaults.json`` from the same directory (when that file exists) and
    converts the merged dictionary into the nested config dataclasses. Both
    paths are relative, so they resolve against the current working directory.
    """

    display_name: str
    indexing: IndexingConfig
    pipeline_mode: str
    processing: ProcessingConfig
    scraping: ScrapingConfig
    series_name: str
    skip_steps: List[str]

    @staticmethod
    def load(series_name: str) -> 'SeriesConfig':
        """Load the configuration stored for ``series_name``.

        Raises:
            FileNotFoundError: If ``<series_name>.json`` does not exist.
            KeyError: If a required key is missing after merging defaults.
        """
        config_dir: Path = Path('preprocessor/series_configs')
        config_path: Path = config_dir / f'{series_name}.json'

        return SeriesConfig.__load_from_file(config_path)

    @staticmethod
    def __load_defaults() -> Dict[str, Any]:
        """Return the shared defaults, or an empty dict when there is no file.

        Top-level keys starting with ``'_'`` are dropped.
        """
        defaults_path: Path = Path('preprocessor/series_configs/defaults.json')
        if not defaults_path.exists():
            return {}
        with open(defaults_path, 'r', encoding='utf-8') as f:
            data: Dict[str, Any] = json.load(f)
        return {k: v for k, v in data.items() if not k.startswith('_')}

    @staticmethod
    def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig':
        """Build a ``SeriesConfig`` from an already merged dictionary.

        ``pipeline_mode`` and ``skip_steps`` are optional (``'full'`` and ``[]``);
        every other key read below is required and raises ``KeyError`` if absent.
        """
        series_name = data['series_name']
        display_name = data['display_name']
        pipeline_mode = data.get('pipeline_mode', 'full')
        skip_steps = data.get('skip_steps', [])

        scraping = data['scraping']
        episodes = scraping['episodes']
        episodes_cfg = EpisodeScrapingConfig(
            urls=episodes['urls'],
            parser_mode=episodes['parser_mode'],
        )
        characters = scraping['characters']
        characters_cfg = CharacterScrapingConfig(
            urls=characters['urls'],
            parser_mode=characters['parser_mode'],
        )

        references = scraping['character_references']
        reference_kwargs: Dict[str, Any] = {
            'search_engine': references['search_engine'],
            'images_per_character': references['images_per_character'],
            'search_query_template': references['search_query_template'],
        }
        # Pass ``source`` only when configured so the default declared on
        # CharacterReferencesConfig applies. The loader previously hard-coded
        # a different fallback ('web') than the dataclass default.
        if 'source' in references:
            reference_kwargs['source'] = references['source']
        references_cfg = CharacterReferencesConfig(**reference_kwargs)

        processing_cfg = SeriesConfig.__build_processing_config(data)

        elastic = data['indexing']['elasticsearch']
        indexing_cfg = IndexingConfig(
            elasticsearch=ElasticsearchIndexingConfig(
                index_name=elastic['index_name'],
                host=elastic['host'],
                dry_run=elastic['dry_run'],
                append=elastic['append'],
            ),
        )

        return SeriesConfig(
            series_name=series_name,
            display_name=display_name,
            pipeline_mode=pipeline_mode,
            skip_steps=skip_steps,
            scraping=ScrapingConfig(
                episodes=episodes_cfg,
                characters=characters_cfg,
                character_references=references_cfg,
            ),
            processing=processing_cfg,
            indexing=indexing_cfg,
        )

    @staticmethod
    def __build_processing_config(data: Dict[str, Any]) -> 'ProcessingConfig':
        """Build the ``processing`` section.

        ``transcription_import`` is optional: it is created only when the
        section exists and has a truthy ``source_dir``; otherwise it is ``None``.
        """
        import_cfg = data.get('processing', {}).get('transcription_import')
        transcription_import = None
        if import_cfg and import_cfg.get('source_dir'):
            transcription_import = TranscriptionImportProcessingConfig(
                source_dir=import_cfg['source_dir'],
                format_type=import_cfg.get('format_type', '11labs_segmented'),
                season_remap=import_cfg.get('season_remap', {}),
            )

        processing = data['processing']
        transcription = processing['transcription']
        transcription_cfg = TranscriptionProcessingConfig(
            mode=transcription['mode'],
            model=transcription['model'],
            language=transcription['language'],
            device=transcription['device'],
        )
        transcode = processing['transcode']
        transcode_cfg = TranscodeProcessingConfig(
            max_bitrate_file_size_mb=transcode['max_bitrate_file_size_mb'],
            max_bitrate_duration_seconds=transcode['max_bitrate_duration_seconds'],
            min_bitrate_mbps=transcode['min_bitrate_mbps'],
            bitrate_boost_ratio=transcode['bitrate_boost_ratio'],
            force_deinterlace=transcode['force_deinterlace'],
            keyframe_interval_seconds=transcode['keyframe_interval_seconds'],
            resolution=transcode['resolution'],
        )
        scene_detection = processing['scene_detection']
        scene_detection_cfg = SceneDetectionProcessingConfig(
            threshold=scene_detection['threshold'],
            min_scene_len=scene_detection['min_scene_len'],
        )
        frame_export_cfg = FrameExportProcessingConfig(
            frames_per_scene=processing['frame_export']['frames_per_scene'],
        )
        return ProcessingConfig(
            transcription=transcription_cfg,
            transcode=transcode_cfg,
            scene_detection=scene_detection_cfg,
            frame_export=frame_export_cfg,
            transcription_import=transcription_import,
        )

    @staticmethod
    def __load_from_file(config_path: Path) -> 'SeriesConfig':
        """Read ``config_path``, merge it over the defaults and build the config.

        Raises:
            FileNotFoundError: If ``config_path`` does not exist.
        """
        if not config_path.exists():
            raise FileNotFoundError(
                f"Series config not found: {config_path}\n"
                f"Create it using template: preprocessor/series_configs/template.json",
            )

        defaults: Dict[str, Any] = SeriesConfig.__load_defaults()

        with open(config_path, 'r', encoding='utf-8') as f:
            series_overrides: Dict[str, Any] = json.load(f)

        # Top-level underscore-prefixed keys are dropped before merging.
        series_filtered: Dict[str, Any] = {
            k: v for k, v in series_overrides.items()
            if not k.startswith('_')
        }

        merged_config: Dict[str, Any] = _deep_merge(defaults, series_filtered)

        return SeriesConfig.__load_from_dict(merged_config)
class TranscodeConfig(BaseModel):
    """Validated settings for the transcode step.

    The target video bitrate is derived rather than configured directly: it is
    the bitrate that fits ``max_bitrate_file_size_mb`` into
    ``max_bitrate_duration_seconds``, minus the fixed audio bitrate.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    bitrate_boost_ratio: float = Field(default=1.1, ge=1.0, le=2.0)
    force_deinterlace: bool = False
    keyframe_interval_seconds: float = Field(gt=0)
    max_bitrate_duration_seconds: float = Field(gt=0)
    max_bitrate_file_size_mb: float = Field(gt=0)
    max_parallel_episodes: int = Field(default=3, ge=1, le=10)
    min_bitrate_mbps: float = Field(default=2.0, gt=0)
    resolution: Resolution = Field(default=Resolution.R720P)

    @property
    def audio_bitrate_kbps(self) -> int:
        """Fixed audio bitrate in kbit/s."""
        return 128

    @property
    def codec(self) -> str:
        """Fixed video encoder name."""
        return 'h264_nvenc'

    @property
    def preset(self) -> str:
        """Fixed encoder preset."""
        return 'p7'

    @property
    def video_bitrate_mbps(self) -> float:
        """Video bitrate in Mbit/s, rounded to two decimals."""
        # megabytes * 8 / seconds -> megabits per second for the whole stream
        total_mbps = self.max_bitrate_file_size_mb * 8 / self.max_bitrate_duration_seconds
        audio_mbps = self.audio_bitrate_kbps / 1000.0
        return round(total_mbps - audio_mbps, 2)

    def calculate_minrate_mbps(self, percent: float = 0.6) -> float:
        """Return ``video_bitrate_mbps`` scaled by ``percent`` (2 decimals)."""
        scaled = self.video_bitrate_mbps * percent
        return round(scaled, 2)

    def calculate_maxrate_mbps(self, percent: float = 1.4) -> float:
        """Return ``video_bitrate_mbps`` scaled by ``percent`` (2 decimals)."""
        scaled = self.video_bitrate_mbps * percent
        return round(scaled, 2)

    def calculate_bufsize_mbps(self, multiplier: float = 2.0) -> float:
        """Return ``video_bitrate_mbps`` scaled by ``multiplier`` (2 decimals)."""
        scaled = self.video_bitrate_mbps * multiplier
        return round(scaled, 2)
= Field(default=Resolution.R720P) + + +class SceneDetectionConfig(BaseModel): + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + min_scene_len: int = Field(default=10, ge=1) + threshold: float = Field(default=0.5, ge=0, le=1) + + +class FrameExportConfig(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + frames_per_scene: int = Field(default=1, ge=1) + keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES + max_parallel_episodes: int = Field(default=4, ge=1, le=16) + max_parallel_frames: int = Field(default=12, ge=1, le=32) + resolution: Resolution = Field(default=Resolution.R720P) + scene_change_offset_seconds: float = Field(default=0.5, ge=0) + + +class TranscriptionConfig(BaseModel): + beam_size: int = Field(default=10, ge=1) + device: str = 'cuda' + language: str = 'pl' + max_chunk_duration_seconds: int = Field(default=1800, ge=60) + max_parallel_episodes: int = Field(default=2, ge=1, le=4) + mode: str = 'whisper' + model: str = 'large-v3-turbo' + temperature: float = Field(default=0.0, ge=0.0, le=1.0) + + +class TextAnalysisConfig(BaseModel): + language: str = 'pl' + max_parallel_episodes: int = Field(default=8, ge=1, le=16) + + +class SegmentFilterConfig(BaseModel): + max_parallel_episodes: int = Field(default=8, ge=1, le=16) + + +class TextCleaningConfig(SegmentFilterConfig): + pass + + +class SoundEventsConfig(SegmentFilterConfig): + pass + + +class TextEmbeddingConfig(BaseModel): + batch_size: int = Field(default=8, ge=1) + device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + text_chunk_overlap: int = Field(default=3, ge=0) + text_sentences_per_chunk: int = Field(default=8, ge=1) + + +class VideoEmbeddingConfig(BaseModel): + batch_size: int = Field(default=8, ge=1) + device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + + +class 
SoundEventEmbeddingConfig(BaseModel): + batch_size: int = Field(default=64, ge=1) + device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + segments_per_embedding: int = Field(default=5, ge=1) + + +class FullEpisodeEmbeddingConfig(BaseModel): + device: str = 'cuda' + max_chars_per_chunk: int = Field(default=6000, ge=100) + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + min_chunk_length: int = Field(default=100, ge=1) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + overlap_chars: int = Field(default=4500, ge=0) + + +class EpisodeNameEmbeddingConfig(BaseModel): + device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + + +class SoundSeparationConfig(BaseModel): + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + + +class DocumentGenerationConfig(BaseModel): + max_parallel_episodes: int = Field(default=8, ge=1, le=16) + + +class ImageHashConfig(BaseModel): + batch_size: int = Field(default=32, ge=1) + device: str = 'cuda' + max_parallel_episodes: int = Field(default=2, ge=1, le=4) + + +class TranscriptionImportConfig(BaseModel): + format_type: str = '11labs_segmented' + season_remap: Dict[str, int] = {} + source_dir: Path + + +class ElasticsearchConfig(BaseModel): + append: bool = False + dry_run: bool = False + host: str = 'localhost:9200' + index_name: str + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + + +class AudioExtractionConfig(BaseModel): + channels: int = Field(default=1, ge=1, le=2) + format: str = 'wav' + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + sample_rate: int = Field(default=48000, ge=8000, le=96000) + + +class CharacterDetectionConfig(BaseModel): + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + threshold: float = Field(default=0.55, ge=0.0, le=1.0) + + +class EmotionDetectionConfig(BaseModel): + max_parallel_episodes: int = 
Field(default=2, ge=1, le=4) + + +class SeriesFaceClusteringConfig(BaseModel): + prefetch_workers: int = Field(default=8, ge=1, le=32) + + +class ObjectDetectionConfig(BaseModel): + batch_size: int = Field(default=8, ge=1) + conf_threshold: float = Field(default=0.3, ge=0.0, le=1.0) + max_parallel_episodes: int = Field(default=2, ge=1, le=4) + model_name: str = 'ustc-community/dfine-xlarge-obj2coco' + + +class ArchiveConfig(BaseModel): + allow_partial: bool = False + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + + +class ValidationConfig(BaseModel): + anomaly_threshold: float = 20.0 + episodes_info_json: Optional[Path] = None + max_parallel_episodes: int = Field(default=1, ge=1, le=16) + + +class EpisodeScraperConfig(BaseModel): + headless: bool = True + merge_sources: bool = True + parser_mode: str = "normal" + scraper_method: str = "crawl4ai" + urls: List[str] + + +class CharacterScraperConfig(BaseModel): + headless: bool = True + parser_mode: str = "normal" + scraper_method: str = "crawl4ai" + urls: List[str] + + +class CharacterReferenceConfig(BaseModel): + images_per_character: int = Field(default=5, ge=0, le=20) + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + search_engine: str = "normal" + search_query_template: str = "Serial {series_name} {char_name} postać" + + +class CharacterReferenceProcessorConfig(BaseModel): + reference_source: Literal["web", "clusters"] = "clusters" + similarity_threshold: float = Field(default=0.45, ge=0.0, le=1.0) diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py new file mode 100644 index 000000000..56a65f20c --- /dev/null +++ b/preprocessor/config/step_defaults.py @@ -0,0 +1,71 @@ +from typing import Dict + +from preprocessor.config.step_configs import ( + ArchiveConfig, + CharacterDetectionConfig, + DocumentGenerationConfig, + ElasticsearchConfig, + EmotionDetectionConfig, + FrameExportConfig, + ImageHashConfig, + ObjectDetectionConfig, + SceneDetectionConfig, + 
class DefaultConfigFactory:
    """Factory for the default step configuration objects of the pipeline."""

    @staticmethod
    def get_configs(series_name: str) -> Dict[str, object]:
        """Return the default config object for each step, keyed by step id.

        Only the ``'index'`` entry depends on ``series_name``: its index name
        is ``'<series_name>_clips'``.
        """
        # Text and video embeddings use the same model.
        embedding_model = 'Qwen/Qwen3-VL-Embedding-8B'

        configs: Dict[str, object] = {}
        configs['transcode'] = TranscodeConfig(
            max_bitrate_file_size_mb=50.0,
            max_bitrate_duration_seconds=100.0,
            keyframe_interval_seconds=0.5,
            min_bitrate_mbps=2.0,
            bitrate_boost_ratio=1.1,
        )
        configs['transcribe'] = TranscriptionConfig(
            mode='whisper',
            model='large-v3-turbo',
            language='pl',
            device='cuda',
            beam_size=5,
            temperature=0.0,
            max_parallel_episodes=1,
        )
        configs['separate_sounds'] = SoundSeparationConfig()
        configs['analyze_text'] = TextAnalysisConfig(language='pl')
        configs['detect_scenes'] = SceneDetectionConfig(threshold=0.5, min_scene_len=10)
        configs['export_frames'] = FrameExportConfig(frames_per_scene=1, resolution=Resolution.R1080P)
        configs['text_embeddings'] = TextEmbeddingConfig(
            model_name=embedding_model,
            batch_size=8,
            device='cuda',
            text_sentences_per_chunk=5,
            text_chunk_overlap=1,
        )
        configs['image_hashing'] = ImageHashConfig(batch_size=32)
        configs['video_embeddings'] = VideoEmbeddingConfig(
            model_name=embedding_model,
            batch_size=8,
            device='cuda',
        )
        configs['character_detection'] = CharacterDetectionConfig(threshold=0.7)
        configs['emotion_detection'] = EmotionDetectionConfig()
        configs['object_detection'] = ObjectDetectionConfig()
        configs['generate_elastic_documents'] = DocumentGenerationConfig()
        configs['generate_archives'] = ArchiveConfig()
        configs['index'] = ElasticsearchConfig(
            index_name=f'{series_name}_clips',
            host='localhost:9200',
            dry_run=False,
            append=False,
        )
        return configs
CharacterDetectionInFrame, + Detection, + ObjectDetectionInFrame, +) +from .episode import ( + EpisodeInfo, + EpisodeMetadata, + SeasonInfo, + SeasonInfoDict, +) +from .frame import FrameRequest +from .keys import ( + ElasticsearchAggregationKeys, + ElasticsearchKeys, + EpisodeMetadataKeys, + WordKeys, + WordTypeValues, +) +from .scene import ( + SceneDict, + SceneTimestamp, + SceneTimestampPoint, + SceneTimestampsData, +) +from .search import ( + ElasticsearchAggregations, + ElasticsearchHit, + ElasticsearchHits, + ElasticsearchResponse, + EpisodeBucket, + SearchSegment, + SeasonBucket, +) +from .transcription import ( + BaseSegment, + ElasticsearchSegment, + SegmentWithScore, + SegmentWithTimes, + TranscriptionContext, +) +from .video import ( + HashResult, + VideoMetadata, +) + +__all__ = [ + 'EpisodeInfo', + 'EpisodeMetadata', + 'SeasonInfo', + 'SeasonInfoDict', + 'FrameRequest', + 'SceneDict', + 'SceneTimestamp', + 'SceneTimestampPoint', + 'SceneTimestampsData', + 'ClipSegment', + 'CharacterDetectionInFrame', + 'Detection', + 'ObjectDetectionInFrame', + 'HashResult', + 'VideoMetadata', + 'BaseSegment', + 'ElasticsearchSegment', + 'SegmentWithScore', + 'SegmentWithTimes', + 'TranscriptionContext', + 'ElasticsearchAggregations', + 'ElasticsearchHit', + 'ElasticsearchHits', + 'ElasticsearchResponse', + 'EpisodeBucket', + 'SearchSegment', + 'SeasonBucket', + 'WordKeys', + 'WordTypeValues', + 'ElasticsearchKeys', + 'ElasticsearchAggregationKeys', + 'EpisodeMetadataKeys', +] diff --git a/preprocessor/config/types/clip.py b/preprocessor/config/types/clip.py new file mode 100644 index 000000000..f0fde8b36 --- /dev/null +++ b/preprocessor/config/types/clip.py @@ -0,0 +1,11 @@ +from typing import ( + Any, + TypedDict, + Union, +) + + +class ClipSegment(TypedDict): + end_time: float + start_time: float + video_path: Union[str, Any] diff --git a/preprocessor/config/types/detection.py b/preprocessor/config/types/detection.py new file mode 100644 index 000000000..a45d19291 
--- /dev/null +++ b/preprocessor/config/types/detection.py @@ -0,0 +1,27 @@ +from typing import ( + List, + NotRequired, + TypedDict, +) + + +class CharacterDetectionInFrame(TypedDict): + bbox: List[int] + confidence: float + embedding: NotRequired[List[float]] + name: str + + +class ObjectDetectionInFrame(TypedDict): + bbox: List[int] + class_id: int + class_name: str + confidence: float + + +class Detection(TypedDict): + bbox: List[int] + class_id: NotRequired[int] + class_name: NotRequired[str] + confidence: float + name: NotRequired[str] diff --git a/preprocessor/config/types/episode.py b/preprocessor/config/types/episode.py new file mode 100644 index 000000000..2d3d087ee --- /dev/null +++ b/preprocessor/config/types/episode.py @@ -0,0 +1,28 @@ +from typing import ( + Dict, + TypedDict, + Union, +) + + +class EpisodeInfo(TypedDict): + episode_number: int + premiere_date: str + title: str + viewership: Union[str, int, float] + + +class EpisodeMetadata(TypedDict): + episode_number: int + premiere_date: str + season: int + series_name: str + title: str + viewership: Union[str, int, float] + + +class SeasonInfo(TypedDict): + pass + + +SeasonInfoDict = Dict[str, int] diff --git a/preprocessor/config/types/frame.py b/preprocessor/config/types/frame.py new file mode 100644 index 000000000..dc2e637f7 --- /dev/null +++ b/preprocessor/config/types/frame.py @@ -0,0 +1,13 @@ +from typing import ( + NotRequired, + TypedDict, +) + + +class FrameRequest(TypedDict): + frame_number: int + timestamp: float + type: str + scene_number: NotRequired[int] + original_timestamp: NotRequired[float] + snapped_to_keyframe: NotRequired[bool] diff --git a/preprocessor/config/types/keys.py b/preprocessor/config/types/keys.py new file mode 100644 index 000000000..5e2b69b3f --- /dev/null +++ b/preprocessor/config/types/keys.py @@ -0,0 +1,200 @@ +class SegmentKeys: + END = 'end' + END_TIME = 'end_time' + ID = 'id' + SEGMENT_ID = 'segment_id' + START = 'start' + START_TIME = 'start_time' + TEXT 
= 'text' + VIDEO_PATH = 'video_path' + + +class EpisodeMetadataKeys: + EPISODE_INFO = 'episode_info' + EPISODE_METADATA = 'episode_metadata' + EPISODE_NUMBER = 'episode_number' + PREMIERE_DATE = 'premiere_date' + SEASON = 'season' + SERIES_NAME = 'series_name' + TITLE = 'title' + VIEWERSHIP = 'viewership' + + +class ElasticsearchKeys: + AGGREGATIONS = 'aggregations' + BUCKETS = 'buckets' + HITS = 'hits' + KEY = 'key' + SCORE = '_score' + SOURCE = '_source' + TOTAL = 'total' + + +class ElasticsearchAggregationKeys: + SEASONS = 'seasons' + UNIQUE_EPISODES = 'unique_episodes' + VALUE = 'value' + + +class TranscriptionContextKeys: + CONTEXT = 'context' + OVERALL_END_TIME = 'overall_end_time' + OVERALL_START_TIME = 'overall_start_time' + TARGET = 'target' + + +class ElasticsearchQueryKeys: + AGGS = 'aggs' + ASC = 'asc' + AUTO = 'AUTO' + BOOL = 'bool' + CARDINALITY = 'cardinality' + DESC = 'desc' + FIELD = 'field' + FILTER = 'filter' + FUZZINESS = 'fuzziness' + GT = 'gt' + INCLUDES = 'includes' + KEY = '_key' + LT = 'lt' + MATCH = 'match' + MUST = 'must' + ORDER = 'order' + QUERY = 'query' + RANGE = 'range' + SIZE = 'size' + SORT = 'sort' + SOURCE = '_source' + TERM = 'term' + TERMS = 'terms' + TOP_HITS = 'top_hits' + + +class EpisodesDataKeys: + EPISODES = 'episodes' + SEASONS = 'seasons' + SEASON_NUMBER = 'season_number' + + +class FfprobeKeys: + FORMAT = 'format' + STREAMS = 'streams' + + +class FfprobeStreamKeys: + BIT_RATE = 'bit_rate' + CODEC_NAME = 'codec_name' + DURATION = 'duration' + HEIGHT = 'height' + R_FRAME_RATE = 'r_frame_rate' + WIDTH = 'width' + + +class FfprobeFormatKeys: + DURATION = 'duration' + SIZE = 'size' + + +class DetectionKeys: + CHARACTERS = 'characters' + DETECTIONS = 'detections' + FRAME = 'frame' + FRAME_FILE = 'frame_file' + FRAME_NAME = 'frame_name' + FRAME_NUMBER = 'frame_number' + + +class CharacterDetectionKeys: + BBOX = 'bbox' + CONFIDENCE = 'confidence' + EMOTION = 'emotion' + NAME = 'name' + + +class EmotionKeys: + CONFIDENCE = 
'confidence' + LABEL = 'label' + + +class ObjectDetectionKeys: + BBOX = 'bbox' + CLASS_ID = 'class_id' + CLASS_NAME = 'class_name' + CONFIDENCE = 'confidence' + + +class SceneKeys: + END = 'end' + SCENES = 'scenes' + SCENE_END_FRAME = 'scene_end_frame' + SCENE_END_TIME = 'scene_end_time' + SCENE_NUMBER = 'scene_number' + SCENE_START_FRAME = 'scene_start_frame' + SCENE_START_TIME = 'scene_start_time' + START = 'start' + + +class SceneTimeKeys: + FRAME = 'frame' + SECONDS = 'seconds' + + +class ElasticDocKeys: + CHARACTER_APPEARANCES = 'character_appearances' + DETECTED_OBJECTS = 'detected_objects' + PERCEPTUAL_HASH = 'perceptual_hash' + PERCEPTUAL_HASH_INT = 'perceptual_hash_int' + SCENE_INFO = 'scene_info' + + +class EmbeddingKeys: + EMBEDDING = 'embedding' + EPISODE_ID = 'episode_id' + EPISODE_METADATA = 'episode_metadata' + FRAME_NUMBER = 'frame_number' + FRAME_PATH = 'frame_path' + PERCEPTUAL_HASH = 'perceptual_hash' + SCENE_NUMBER = 'scene_number' + TIMESTAMP = 'timestamp' + TITLE = 'title' + TITLE_EMBEDDING = 'title_embedding' + + +class ValidationMetadataKeys: + CODEC = 'codec' + DURATION = 'duration' + FORMAT = 'format' + HEIGHT = 'height' + LINE_COUNT = 'line_count' + SIZE_BYTES = 'size_bytes' + SIZE_MB = 'size_mb' + WIDTH = 'width' + + +class WordKeys: + END = 'end' + START = 'start' + TEXT = 'text' + TYPE = 'type' + WORD = 'word' + WORDS = 'words' + + +class WordTypeValues: + AUDIO_EVENT = 'audio_event' + SPACING = 'spacing' + + +class GoogleSearchKeys: + API_KEY = 'api_key' + ENGINE = 'engine' + GL = 'gl' + HL = 'hl' + IMAGES_RESULTS = 'images_results' + Q = 'q' + + +class ImageResultKeys: + IMAGE = 'image' + ORIGINAL = 'original' + THUMBNAIL = 'thumbnail' diff --git a/preprocessor/config/types/scene.py b/preprocessor/config/types/scene.py new file mode 100644 index 000000000..ad98498d1 --- /dev/null +++ b/preprocessor/config/types/scene.py @@ -0,0 +1,31 @@ +from typing import ( + List, + NotRequired, + TypedDict, +) + + +class SceneDict(TypedDict): + 
end_frame: int + end_time: float + fps: float + scene_number: int + start_frame: int + start_time: float + + +class SceneTimestampPoint(TypedDict): + frame: int + seconds: float + + +class SceneTimestamp(TypedDict): + end: SceneTimestampPoint + scene_number: int + start: SceneTimestampPoint + + +class SceneTimestampsData(TypedDict): + fps: NotRequired[float] + scenes: List[SceneTimestamp] + total_scenes: NotRequired[int] diff --git a/preprocessor/config/types/search.py b/preprocessor/config/types/search.py new file mode 100644 index 000000000..390a30364 --- /dev/null +++ b/preprocessor/config/types/search.py @@ -0,0 +1,60 @@ +from typing import ( + Any, + Dict, + List, + NotRequired, + TypedDict, + Union, +) + + +class SearchSegment(TypedDict): + end_time: float + episode_number: int + season: int + start_time: float + title: str + + +class ElasticsearchSegment(TypedDict): + end_time: float + episode_number: int + season: int + start_time: float + title: str + + +class ElasticsearchHit(TypedDict): + _score: float + _source: ElasticsearchSegment + + +class ElasticsearchHits(TypedDict): + hits: List[ElasticsearchHit] + max_score: float + total: Dict[str, Any] + + +class ElasticsearchResponse(TypedDict): + aggregations: NotRequired[Dict[str, Any]] + hits: ElasticsearchHits + timed_out: bool + took: int + + +class EpisodeBucket(TypedDict): + doc_count: int + episode_metadata: Dict[str, Any] + key: int + + +class SeasonBucket(TypedDict): + doc_count: int + key: int + unique_episodes: Dict[str, int] + + +class ElasticsearchAggregations(TypedDict): + buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] + seasons: Dict[str, Union[List[SeasonBucket], int]] + unique_episodes: Dict[str, Union[List[EpisodeBucket], int]] diff --git a/preprocessor/config/types/transcription.py b/preprocessor/config/types/transcription.py new file mode 100644 index 000000000..c3c68463e --- /dev/null +++ b/preprocessor/config/types/transcription.py @@ -0,0 +1,56 @@ +from typing import ( 
+ Any, + List, + NotRequired, + TypedDict, + Union, +) + +from preprocessor.config.types.episode import EpisodeMetadata + + +class ClipSegment(TypedDict): + end_time: float + start_time: float + video_path: Union[str, Any] + + +class BaseSegment(TypedDict): + end: float + id: int + start: float + text: str + + +class SegmentWithTimes(TypedDict): + end_time: float + episode_metadata: EpisodeMetadata + segment_id: int + start_time: float + text: str + video_path: NotRequired[str] + + +class SegmentWithScore(SegmentWithTimes): + _score: float + + +class ElasticsearchSegment(TypedDict): + _score: NotRequired[float] + end: NotRequired[float] + end_time: NotRequired[float] + episode_info: NotRequired[EpisodeMetadata] + episode_metadata: NotRequired[EpisodeMetadata] + id: NotRequired[int] + segment_id: NotRequired[int] + start: NotRequired[float] + start_time: NotRequired[float] + text: str + video_path: NotRequired[str] + + +class TranscriptionContext(TypedDict): + context: List[BaseSegment] + overall_end_time: float + overall_start_time: float + target: ElasticsearchSegment diff --git a/preprocessor/config/types/video.py b/preprocessor/config/types/video.py new file mode 100644 index 000000000..5dbb0af4f --- /dev/null +++ b/preprocessor/config/types/video.py @@ -0,0 +1,20 @@ +from typing import ( + NotRequired, + TypedDict, +) + + +class HashResult(TypedDict): + file_path: NotRequired[str] + frame_number: int + hash: str + timestamp: float + + +class VideoMetadata(TypedDict): + bitrate: NotRequired[int] + codec: NotRequired[str] + duration: float + fps: float + height: int + width: int diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py new file mode 100644 index 000000000..e269ecd7c --- /dev/null +++ b/preprocessor/core/artifacts.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +from dataclasses import ( + dataclass, + field, +) +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from 
@dataclass(frozen=True)
class Artifact:
    """Root marker type for every immutable value passed between pipeline steps."""


@dataclass(frozen=True)
class EpisodeArtifact(Artifact):
    """Artifact tied to one episode; subclasses append their own fields after these two."""

    episode_id: str
    episode_info: EpisodeInfo


@dataclass(frozen=True)
class SourceVideo(EpisodeArtifact):
    """Episode artifact carrying the path of a source video."""

    path: Path


@dataclass(frozen=True)
class TranscodedVideo(EpisodeArtifact):
    """Episode artifact describing a transcoded video and the source it came from."""

    codec: str
    path: Path
    resolution: str
    source_video_path: Path


@dataclass(frozen=True)
class SceneCollection(EpisodeArtifact):
    """Episode artifact holding a list of scene dicts plus the detection parameters."""

    min_scene_len: int
    path: Path
    scenes: List[Dict[str, Any]]
    threshold: float
    video_path: Path
    source_video_path: Path


@dataclass(frozen=True)
class FrameCollection(EpisodeArtifact):
    """Episode artifact pointing at a frame directory and its metadata file."""

    directory: Path
    frame_count: int
    metadata_path: Path


@dataclass(frozen=True)
class TranscriptionData(EpisodeArtifact):
    """Episode artifact describing a transcription file (format, language, model)."""

    format: str
    language: str
    model: str
    path: Path


@dataclass(frozen=True)
class EmbeddingCollection(EpisodeArtifact):
    """Episode artifact describing a stored set of embeddings."""

    embedding_count: int
    embedding_type: str
    model_name: str
    path: Path


@dataclass(frozen=True)
class DetectionResults(EpisodeArtifact):
    """Episode artifact describing stored detection results."""

    detection_count: int
    detection_type: str
    path: Path


@dataclass(frozen=True)
class ElasticDocuments(EpisodeArtifact):
    """Episode artifact describing a file of documents and how many it holds."""

    document_count: int
    path: Path


@dataclass(frozen=True)
class TextAnalysisResults(EpisodeArtifact):
    """Episode artifact with text statistics and optional extra metadata."""

    path: Path
    statistics: Dict[str, Any]
    # Optional; absent metadata is represented by None.
    metadata: Optional[Dict[str, Any]] = None


@dataclass(frozen=True)
class AudioArtifact(EpisodeArtifact):
    """Episode artifact carrying an audio file path and its format."""

    format: str
    path: Path


@dataclass(frozen=True)
class IndexingResult(Artifact):
    """Series-level (not per-episode) outcome of an indexing run."""

    document_count: int
    index_name: str
    success: bool


@dataclass(frozen=True)
class ImageHashCollection(EpisodeArtifact):
    """Episode artifact describing a stored set of image hashes."""

    hash_count: int
    path: Path


@dataclass(frozen=True)
class EmotionData(EpisodeArtifact):
    """Episode artifact carrying the path of emotion data."""

    path: Path


@dataclass(frozen=True)
class ObjectDetectionData(EpisodeArtifact):
    """Episode artifact carrying the path of object-detection data."""

    path: Path


@dataclass(frozen=True)
class ArchiveArtifact(EpisodeArtifact):
    """Episode artifact carrying the path of an archive."""

    path: Path


@dataclass(frozen=True)
class CharacterMetadata(EpisodeArtifact):
    """Episode artifact carrying a character metadata path and the character count."""

    path: Path
    character_count: int


@dataclass(frozen=True)
class EpisodeMetadata(EpisodeArtifact):
    """Episode artifact carrying the path of episode metadata."""

    path: Path


@dataclass(frozen=True)
class ValidationResult(Artifact):
    """Season-level validation outcome: the season and its report directory."""

    season: str
    validation_report_dir: Path


@dataclass(frozen=True)
class ResolutionAnalysisResult(Artifact):
    """Aggregate resolution analysis: file count and upscaling percentage."""

    total_files: int
    upscaling_percentage: float


# Alias: a "processed episode" is represented by its ElasticDocuments artifact.
ProcessedEpisode = ElasticDocuments
pylint: disable=import-outside-toplevel - self.progress = args.get("progress_tracker", ProgressTracker()) - - @classmethod - def get_video_glob_patterns(cls) -> List[str]: - return [f"*{ext}" for ext in cls.SUPPORTED_VIDEO_EXTENSIONS] - - @abstractmethod - def _validate_args(self, args: Dict[str, Any]) -> None: - pass - - def work(self) -> int: - try: - self._execute() - except KeyboardInterrupt: - console.print("\n[yellow]Process interrupted by user[/yellow]") - self.cleanup() - self.logger.finalize() - return 130 - except Exception as e: - self.logger.error(f"{self.__class__.__name__} failed: {e}") - - self.cleanup() - return self.logger.finalize() - - def cleanup(self) -> None: - pass - - def _load_resources(self) -> bool: - return True - - def _get_processing_info(self) -> List[str]: - return [] - - @staticmethod - def _get_episode_processing_items_from_metadata( - metadata_pattern: str, - base_dir: Path, - episode_manager: "EpisodeManager", - ) -> List[ProcessingItem]: - all_metadata_files = list(base_dir.glob(metadata_pattern)) - items = [] - - for metadata_file in all_metadata_files: - episode_info = episode_manager.parse_filename(metadata_file) - if not episode_info: - continue - - episode_id = episode_manager.get_episode_id_for_state(episode_info) - - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=metadata_file, - metadata={ - "episode_info": episode_info, - "series_name": episode_manager.series_name, - }, - ), - ) - - return items - - def _get_processing_items(self) -> List[ProcessingItem]: - raise NotImplementedError( - f"{self.__class__.__name__} must implement _get_processing_items() " - "or override _execute() directly (legacy mode)", - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - raise NotImplementedError( - f"{self.__class__.__name__} must implement _get_expected_outputs() " - "or override _execute() directly (legacy mode)", - ) - - def _process_item(self, item: ProcessingItem, 
missing_outputs: List[OutputSpec]) -> None: - raise NotImplementedError( - f"{self.__class__.__name__} must implement _process_item() " - "or override _execute() directly (legacy mode)", - ) - - def __get_step_name(self) -> str: - class_name = self.__class__.__name__ - name = class_name.replace("Processor", "").replace("Generator", "").replace("Detector", "") - name = name.replace("Transcoder", "").replace("Importer", "").replace("Indexer", "") - return self.__to_snake_case(name) - - @staticmethod - def __to_snake_case(name: str) -> str: - name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() - - def _should_skip_item(self, item: ProcessingItem) -> Tuple[bool, List[OutputSpec], str]: - expected_outputs = self._get_expected_outputs(item) - - if not expected_outputs: - return False, [], "" - - missing_outputs = [ - output for output in expected_outputs - if not output.path.exists() or output.path.stat().st_size == 0 - ] - - step_name = self.__get_step_name() - state_completed = ( - self.state_manager and - self.state_manager.is_step_completed(step_name, item.episode_id) - ) - - if not missing_outputs and state_completed: - return True, [], f"[yellow]Skipping (completed): {item.episode_id}[/yellow]" - - if not missing_outputs and not state_completed: - if self.state_manager: - self.state_manager.mark_step_completed(step_name, item.episode_id) - return True, [], f"[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]" - - if missing_outputs and state_completed: - console.print( - f"[yellow]Warning: State marked complete but outputs missing for {item.episode_id}[/yellow]", - ) - return False, missing_outputs, "" - - return False, missing_outputs, "" - - def _execute(self) -> None: - all_items = self._get_processing_items() - - if not all_items: - console.print("[yellow]No items to process[/yellow]") - return - - items_to_process = [] - skipped_count = 0 - skip_messages = [] - - for item in 
all_items: - should_skip, missing_outputs, skip_message = self._should_skip_item(item) - - if should_skip: - if skip_message: - skip_messages.append(skip_message) - skipped_count += 1 - else: - item.metadata['missing_outputs'] = missing_outputs - items_to_process.append(item) - - if not items_to_process: - console.print( - f"[yellow]All items already processed ({len(all_items)} total, {skipped_count} skipped)[/yellow]", - ) - return - - for skip_message in skip_messages: - console.print(skip_message) - - console.print( - f"[blue]Processing {len(items_to_process)} items " - f"(of {len(all_items)} total, {skipped_count} skipped)[/blue]", - ) - - self.__execute_processing(items_to_process) - - def __execute_processing(self, items: List[ProcessingItem]) -> None: - if not items: - console.print("[yellow]No items to process, skipping resource loading[/yellow]") - return - - for info_line in self._get_processing_info(): - console.print(info_line) - - if not self._load_resources(): - return - - step_name = self.__get_step_name() - - try: - with create_progress() as progress: - task = progress.add_task( - self._get_progress_description(), - total=len(items), - ) - - for item in items: - try: - if self.state_manager: - temp_files = self._get_temp_files(item) - self.state_manager.mark_step_started( - step_name, - item.episode_id, - temp_files, - ) - - missing_outputs = item.metadata.get('missing_outputs', []) - self._process_item(item, missing_outputs) - - if self.state_manager: - self.state_manager.mark_step_completed(step_name, item.episode_id) - - except Exception as e: - self.logger.error(f"Failed to process {item.episode_id}: {e}") - finally: - progress.advance(task) - except KeyboardInterrupt: - console.print("\n[yellow]Processing interrupted[/yellow]") - raise - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: # pylint: disable=unused-argument - return [] - - def _get_progress_description(self) -> str: - return f"Processing {self.__class__.__name__}" - - 
def _create_video_processing_items( - self, - source_path: Path, - extensions: List[str], - episode_manager: "EpisodeManager", - skip_unparseable: bool = True, - subdirectory_filter: Optional[str] = None, - ) -> List[ProcessingItem]: - from preprocessor.core.episode_manager import EpisodeManager # pylint: disable=import-outside-toplevel - - video_files = [] - - if source_path.is_file(): - video_files = [source_path] - else: - for ext in extensions: - if subdirectory_filter: - pattern = f"**/{subdirectory_filter}/{ext}" - else: - pattern = f"**/{ext}" - video_files.extend(source_path.glob(pattern)) - - items = [] - for video_file in sorted(video_files): - episode_info = episode_manager.parse_filename(video_file) - - if not episode_info: - if skip_unparseable: - self.logger.error(f"Cannot parse episode info from {video_file.name}") - continue - episode_id = video_file.stem - else: - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=video_file, - metadata={ - "episode_info": episode_info, - }, - ), - ) - - return items - - def _create_transcription_processing_item(self, transcription_file: Path) -> ProcessingItem: - from preprocessor.core.episode_manager import EpisodeManager # pylint: disable=import-outside-toplevel - - base_name = transcription_file.stem.replace(FILE_SUFFIXES["segmented"], "").replace(FILE_SUFFIXES["simple"], "") - - episode_info = self.episode_manager.parse_filename(transcription_file) if hasattr(self, 'episode_manager') else None - if episode_info: - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - else: - episode_id = base_name - - return ProcessingItem( - episode_id=episode_id, - input_path=transcription_file, - metadata={ - "base_name": base_name, - }, - ) diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py new file mode 100644 index 000000000..3d5235431 --- /dev/null +++ b/preprocessor/core/base_step.py @@ 
class PipelineStep(ABC, Generic[InputT, OutputT, ConfigT]):
    """Abstract base class for a single pipeline stage.

    A step turns an ``InputT`` into an ``OutputT`` in ``_process()``. When
    ``uses_caching`` is true, ``execute()`` first asks ``_get_cache_path()``
    for the expected output file and, if that file exists and a rerun is not
    forced, returns ``_load_from_cache()`` instead of recomputing. Step
    start/completion is reported through the ``ExecutionContext``.
    """

    def __init__(self, config: ConfigT) -> None:
        self.__config: ConfigT = config

    @property
    def name(self) -> str:
        """Return the class name without a trailing ``Step``, in snake_case."""
        class_name = self.__class__.__name__.removesuffix('Step')

        # NOTE(review): the pattern was truncated after "(?" in the flattened
        # diff this block was recovered from; this is the usual
        # "underscore before every non-leading capital" form - confirm
        # against the real file.
        return re.sub(r'(?<!^)(?=[A-Z])', '_', class_name).lower()

    @property
    def config(self) -> ConfigT:
        """Return the configuration object given to the constructor."""
        return self.__config

    @property
    def is_global(self) -> bool:
        """Flag read by the pipeline runner; ``False`` by default."""
        return False

    @property
    def uses_caching(self) -> bool:
        """Whether ``execute()`` goes through the cache-aware flow (default ``True``)."""
        return True

    @property
    def uses_global_completion(self) -> bool:
        """Flag read by the pipeline runner; ``True`` by default."""
        return True

    @property
    def supports_batch_processing(self) -> bool:
        """Flag read by the pipeline runner; ``False`` by default."""
        return False

    def execute(self, input_data: InputT, context: ExecutionContext) -> OutputT:
        """Run the step for one input, consulting the cache when caching is enabled."""
        if not self.uses_caching:
            return self._process(input_data, context)

        return self.__execute_managed_flow(input_data, context)

    def execute_batch(
        self, input_data: List[InputT], context: ExecutionContext,
    ) -> List[OutputT]:
        """Run ``execute()`` for every input, in order, and return the results."""
        return [self.execute(item, context) for item in input_data]

    def load_all_from_cache(
        self, input_list: List[InputT], context: ExecutionContext,
    ) -> List[OutputT]:
        """Load the cached output for every input.

        When the loaded value is falsy the input itself is put in its place.
        """
        results = []
        for inp in input_list:
            result = self._load_from_cache(self._get_cache_path(inp, context), inp, context)
            # NOTE(review): this is a truthiness test, so a falsy-but-valid
            # cached value is also replaced by the input - confirm intended.
            results.append(result if result else inp)
        return results

    def all_outputs_exist(
        self, input_list: List[InputT], context: ExecutionContext,
    ) -> bool:
        """Return ``True`` when every input's cache path exists and no rerun is forced.

        Steps that do not implement ``_get_cache_path()`` report ``False``.
        """
        if context.force_rerun:
            return False
        try:
            return all(self._get_cache_path(inp, context).exists() for inp in input_list)
        except NotImplementedError:
            return False

    def should_skip_execution(
        self,
        episode_id: str,
        context: ExecutionContext,
        context_vars: Optional[Dict[str, str]] = None,
    ) -> bool:
        """Return ``True`` when the step is recorded as completed and its outputs validate.

        Always ``False`` when a rerun is forced or the step is not marked
        completed for ``episode_id``.
        """
        if context.force_rerun:
            return False

        if not context.is_step_completed(self.name, episode_id):
            return False

        return self.__validate_all_descriptors(context, context_vars, episode_id)

    def setup_resources(self, context: ExecutionContext) -> None:
        """Hook for acquiring resources; no-op by default."""

    def teardown_resources(self, context: ExecutionContext) -> None:
        """Hook for releasing resources; no-op by default."""

    def cleanup(self) -> None:
        """Hook for final cleanup; no-op by default."""

    @abstractmethod
    def _process(self, input_data: InputT, context: ExecutionContext) -> OutputT:
        """Compute the step's output for ``input_data``; must be overridden."""
        raise NotImplementedError(
            f'{self.__class__.__name__} must implement _process()',
        )

    def get_output_descriptors(self) -> List[OutputDescriptor]:
        """Return the descriptors of the files this step produces (none by default)."""
        return []

    def _get_cache_path(self, input_data: InputT, context: ExecutionContext) -> Path:
        """Return the cache file path for ``input_data``; required when caching is enabled."""
        raise NotImplementedError(
            f'{self.__class__.__name__} must implement _get_cache_path() when caching is enabled',
        )

    def _load_from_cache(
        self, cache_path: Path, input_data: InputT, context: ExecutionContext,
    ) -> OutputT:
        """Rebuild the output from ``cache_path``; required when caching is enabled."""
        raise NotImplementedError(
            f'{self.__class__.__name__} must implement _load_from_cache() when caching is enabled',
        )

    def _resolve_output_path(
        self,
        descriptor_index: int,
        context: ExecutionContext,
        context_vars: Optional[Dict[str, str]] = None,
    ) -> Path:
        """Resolve the path of the output descriptor at ``descriptor_index``.

        Raises:
            ValueError: if the step has no descriptor at that index.
        """
        descriptors = self.get_output_descriptors()
        if not descriptors or descriptor_index >= len(descriptors):
            raise ValueError(
                f'Step {self.name} has no output descriptor at index {descriptor_index}',
            )

        descriptor = descriptors[descriptor_index]

        if descriptor.subdir:
            return descriptor.resolve_path(context.base_output_dir, context_vars)

        # Without an explicit subdir the step's own name is the directory.
        formatted_pattern = descriptor.format_pattern(context_vars)
        return context.base_output_dir / self.name / formatted_pattern

    def _get_standard_cache_path(
        self,
        input_data: InputT,
        context: ExecutionContext,
        descriptor_index: int = 0,
    ) -> Path:
        """Resolve a descriptor path using the input's season and episode codes."""
        episode_info = input_data.episode_info
        context_vars = {
            'season': episode_info.season_code(),
            'episode': episode_info.episode_code(),
        }
        return self._resolve_output_path(descriptor_index, context, context_vars)

    @staticmethod
    def _execute_with_threadpool(
        input_data: List[InputT],
        context: ExecutionContext,
        max_workers: int,
        executor_fn: Callable[[InputT, ExecutionContext], OutputT],
    ) -> List[OutputT]:
        """Run ``executor_fn`` for every input on a thread pool.

        Results are returned in input order. They are keyed by position rather
        than by ``id()`` of the input object, so an input that appears twice
        keeps one result per occurrence. An exception raised by a worker
        propagates to the caller; on ``KeyboardInterrupt`` the not-yet-started
        tasks are cancelled before re-raising.
        """
        context.logger.info(
            f"Batch processing {len(input_data)} episodes with {max_workers} workers",
        )

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {
                executor.submit(executor_fn, artifact, context): index
                for index, artifact in enumerate(input_data)
            }

            results: List[OutputT] = [None] * len(input_data)  # type: ignore[list-item]
            try:
                for future in as_completed(future_to_index):
                    results[future_to_index[future]] = future.result()
            except KeyboardInterrupt:
                context.logger.warning("Batch processing interrupted - cancelling remaining tasks")
                for future in future_to_index:
                    future.cancel()
                raise

        return results

    @staticmethod
    def _execute_sequential(
        input_data: List[InputT],
        context: ExecutionContext,
        executor_fn: Callable[[InputT, ExecutionContext], OutputT],
    ) -> List[OutputT]:
        """Run ``executor_fn`` for every input one after another, in order."""
        context.logger.info(
            f"Batch processing {len(input_data)} episodes sequentially",
        )

        return [executor_fn(artifact, context) for artifact in input_data]

    @staticmethod
    def _atomic_write(
        final_path: Path,
        write_func: Callable[[Path], None],
        temp_suffix: str = '.tmp',
    ) -> None:
        """Call ``write_func`` with the temporary path managed by ``StepTempFile``."""
        with StepTempFile(final_path, temp_suffix) as temp_path:
            write_func(temp_path)

    @staticmethod
    def __episode_label(input_data: InputT) -> str:
        """Return the input's ``episode_id``, or ``'all'`` when there is no input."""
        return 'all' if input_data is None else input_data.episode_id

    def __execute_managed_flow(
        self, input_data: InputT, context: ExecutionContext,
    ) -> OutputT:
        """Return the cached result when one is usable, otherwise compute a new one."""
        cache_path = self._get_cache_path(input_data, context)

        if self.__should_restore_from_cache(cache_path, input_data, context):
            return self.__restore_result(cache_path, input_data, context)

        return self.__compute_new_result(input_data, context)

    def __should_restore_from_cache(
        self, cache_path: Path, input_data: InputT, context: ExecutionContext,
    ) -> bool:
        """Tell whether ``cache_path`` holds a result that may be reused."""
        return self._check_cache_validity(
            cache_path, context, self.__episode_label(input_data), 'cached',
        )

    def __restore_result(
        self, cache_path: Path, input_data: InputT, context: ExecutionContext,
    ) -> OutputT:
        """Log and load the cached result."""
        context.logger.info(f'Loading {self.__episode_label(input_data)} from cache')
        return self._load_from_cache(cache_path, input_data, context)

    def __compute_new_result(
        self, input_data: InputT, context: ExecutionContext,
    ) -> OutputT:
        """Run ``_process()`` bracketed by started/completed bookkeeping.

        Completion is recorded only when ``_process()`` returns normally.
        """
        episode_id = self.__episode_label(input_data)
        context.logger.info(f'Processing {episode_id}')
        context.mark_step_started(self.name, episode_id)

        result = self._process(input_data, context)

        context.mark_step_completed(self.name, episode_id)
        return result

    def _check_cache_validity(
        self,
        output_path: Path,
        context: ExecutionContext,
        episode_id: str,
        cache_description: str,
    ) -> bool:
        """Return ``True`` when ``output_path`` exists and no rerun is forced.

        As a side effect the step is marked completed if the state did not
        record it yet, and the skip is logged.
        """
        if context.force_rerun or not output_path.exists():
            return False

        if not context.is_step_completed(self.name, episode_id):
            # The file is there but the state is not: bring the state in sync.
            context.mark_step_completed(self.name, episode_id)
        context.logger.info(f'Skipping {episode_id} ({cache_description})')
        return True

    def __validate_all_descriptors(
        self,
        context: ExecutionContext,
        context_vars: Optional[Dict[str, str]],
        episode_id: str,
    ) -> bool:
        """Return ``True`` when every output descriptor validates (or there are none).

        Validation stops at the first invalid descriptor.
        """
        descriptors = self.get_output_descriptors()
        if not descriptors:
            return True

        return all(
            self.__validate_single_descriptor(descriptor, context, context_vars, episode_id)
            for descriptor in descriptors
        )

    @staticmethod
    def __validate_single_descriptor(
        descriptor: OutputDescriptor,
        context: ExecutionContext,
        context_vars: Optional[Dict[str, str]],
        episode_id: str,
    ) -> bool:
        """Validate one descriptor, logging a warning when it is invalid."""
        result = descriptor.validate(context.base_output_dir, context_vars)
        if result.is_valid:
            return True

        context.logger.warning(
            f'{episode_id} - output invalid: {result.message}',
        )
        return False
class ExecutionContext:
    """Per-run state shared by all pipeline steps.

    Holds the series name, the series-specific output directory
    (``base_output_dir / series_name``), the logger, the optional state
    manager, the run flags, the active settings and a model pool. All
    attributes are exposed read-only through properties.
    """

    def __init__(
        self,
        series_name: str,
        base_output_dir: Path,
        logger: ErrorHandlingLogger,
        state_manager: Optional[StateManager] = None,
        force_rerun: bool = False,
        disable_parallel: bool = False,
        settings_instance: Optional[Settings] = None,
    ) -> None:
        self.__series_name: str = series_name
        # Every output of this run lives below a directory named after the series.
        self.__base_output_dir: Path = base_output_dir / series_name
        self.__state_manager: Optional[StateManager] = state_manager
        self.__force_rerun: bool = force_rerun
        self.__disable_parallel: bool = disable_parallel
        self.__logger: ErrorHandlingLogger = logger
        # Fall back to the factory-provided settings when none are injected.
        self.__settings: Settings = settings_instance or SettingsFactory.get_settings()
        self.__model_pool: ModelPool = ModelPool()

    @property
    def base_output_dir(self) -> Path:
        """Output root for this run; already includes the series name."""
        return self.__base_output_dir

    @property
    def disable_parallel(self) -> bool:
        """Value of the ``disable_parallel`` constructor flag."""
        return self.__disable_parallel

    @property
    def force_rerun(self) -> bool:
        """Value of the ``force_rerun`` constructor flag."""
        return self.__force_rerun

    @property
    def logger(self) -> ErrorHandlingLogger:
        """Logger passed to the constructor."""
        return self.__logger

    @property
    def model_pool(self) -> ModelPool:
        """Model pool created together with this context."""
        return self.__model_pool

    @property
    def series_name(self) -> str:
        """Series name passed to the constructor."""
        return self.__series_name

    @property
    def settings(self) -> Settings:
        """Get active Settings instance for this context."""
        return self.__settings

    @property
    def state_manager(self) -> Optional[StateManager]:
        """State manager passed to the constructor, or ``None``."""
        return self.__state_manager

    def get_output_path(
        self, episode_info: EpisodeInfo, subdir: str, filename: str,
    ) -> Path:
        """Return ``<base>/<subdir>/<season_code>/<episode_num>/<filename>``.

        The parent directory is created if it does not exist; the file itself
        is not touched.
        """
        season_code: str = episode_info.season_code()
        episode_num: str = episode_info.episode_num()

        return self.__with_parent_dir(
            self.__base_output_dir / subdir / season_code / episode_num / filename,
        )

    def get_season_output_path(
        self, episode_info: EpisodeInfo, subdir: str, filename: str,
    ) -> Path:
        """Return ``<base>/<subdir>/<season_code>/<filename>``.

        The parent directory is created if it does not exist; the file itself
        is not touched.
        """
        season_code: str = episode_info.season_code()

        return self.__with_parent_dir(
            self.__base_output_dir / subdir / season_code / filename,
        )

    def is_step_completed(self, step_name: str, episode_id: str) -> bool:
        """Ask the state manager whether the step is completed; ``False`` without one."""
        if not self.__state_manager:
            return False
        return self.__state_manager.is_step_completed(step_name, episode_id)

    def mark_step_completed(self, step_name: str, episode_id: str) -> None:
        """Record step completion in the state manager; no-op without one."""
        if self.__state_manager:
            self.__state_manager.mark_step_completed(step_name, episode_id)

    def mark_step_started(
        self, step_name: str, episode_id: str, temp_files: Optional[List[str]] = None,
    ) -> None:
        """Record step start (and its temp files) in the state manager; no-op without one."""
        if self.__state_manager:
            self.__state_manager.mark_step_started(step_name, episode_id, temp_files)

    @staticmethod
    def __with_parent_dir(path: Path) -> Path:
        """Create ``path``'s parent directory if needed and return ``path`` unchanged."""
        path.parent.mkdir(parents=True, exist_ok=True)
        return path
__init__(self, series_name: str): - self.file_naming = FileNamingConventions(series_name) - - @staticmethod - def find_video_file(episode_info, search_dir: Path) -> Optional[Path]: - if not search_dir.exists(): - return None - - if search_dir.is_file(): - return search_dir - - episode_code = episode_info.episode_code() - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - search_dirs = [search_dir / season_dir_name, search_dir] - - for dir_path in search_dirs: - if not dir_path.exists(): - continue - - for ext in SUPPORTED_VIDEO_EXTENSIONS: - for video_file in dir_path.glob(f"*{ext}"): - if re.search(episode_code, video_file.name, re.IGNORECASE): - return video_file - - return None - - def find_transcription_file( - self, - episode_info, - search_dir: Path, - prefer_segmented: bool = True, - ) -> Optional[Path]: - if not search_dir.exists(): - return None - - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - season_dir = search_dir / season_dir_name - if not season_dir.exists(): - return None - - if prefer_segmented: - segmented = season_dir / self.file_naming.build_filename( - episode_info, - extension="json", - suffix="segmented", - ) - if segmented.exists(): - return segmented - - regular = season_dir / self.file_naming.build_filename(episode_info, extension="json") - if regular.exists(): - return regular - - return None - - @staticmethod - def find_scene_timestamps_file(episode_info, search_dir: Path) -> Optional[Path]: - if not search_dir.exists(): - return None - - episode_code = episode_info.episode_code() - pattern = f"**/*{episode_code}*_scenes.json" - - for scene_file in search_dir.glob(pattern): - return scene_file - - return None - - @staticmethod - def load_scene_timestamps( - episode_info, - search_dir: Optional[Path], - _logger=None, - ) -> Optional[List[Dict[str, Any]]]: - if not search_dir: - return None - - finder = EpisodeFileFinder("") - scene_file = finder.find_scene_timestamps_file(episode_info, search_dir) - if 
not scene_file: - return None - - try: - with open(scene_file, "r", encoding="utf-8") as f: - return json.load(f) - except (OSError, json.JSONDecodeError) as e: - if _logger: - _logger.error(f"Failed to load scene timestamps: {e}") - return None diff --git a/preprocessor/core/episode_manager.py b/preprocessor/core/episode_manager.py deleted file mode 100644 index e910e9b94..000000000 --- a/preprocessor/core/episode_manager.py +++ /dev/null @@ -1,173 +0,0 @@ -from dataclasses import dataclass -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION -from preprocessor.core.episode_file_finder import EpisodeFileFinder -from preprocessor.core.episode_parser import EpisodeInfoParser -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.utils.constants import ( - EpisodeMetadataKeys, - EpisodesDataKeys, -) - -logger = logging.getLogger(__name__) - - -@dataclass -class EpisodeInfo: - absolute_episode: int - season: int - relative_episode: int - title: str - series_name: Optional[str] = None - premiere_date: Optional[str] = None - viewership: Optional[str] = None - - def episode_code(self) -> str: - return f"S{self.season:02d}E{self.relative_episode:02d}" - - def season_dir_name(self) -> str: - return f"S{self.season:02d}" - - def is_special(self) -> bool: - return self.season == 0 - - -class EpisodeManager: - def __init__(self, episodes_info_json: Optional[Path], series_name: str): - self.series_name = series_name.lower() - self.episodes_data: Optional[Dict[str, Any]] = None - self.file_naming = FileNamingConventions(self.series_name) - self.file_finder = EpisodeFileFinder(self.series_name) - self.parser = EpisodeInfoParser() - - if episodes_info_json and episodes_info_json.exists(): - with open(episodes_info_json, "r", encoding="utf-8") as f: - 
self.episodes_data = json.load(f) - - def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: - return self.parser.parse_filename(file_path, self) - - def get_episode_by_season_and_relative(self, season: int, relative_episode: int) -> Optional[EpisodeInfo]: - if not self.episodes_data: - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=f"S{season:02d}E{relative_episode:02d}", - series_name=self.series_name, - ) - - for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): - if season_data.get(EpisodesDataKeys.SEASON_NUMBER) == season: - episodes = sorted( - season_data.get(EpisodesDataKeys.EPISODES, []), - key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0), - ) - - if 0 < relative_episode <= len(episodes): - ep_data = episodes[relative_episode - 1] - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=ep_data.get(EpisodeMetadataKeys.TITLE, f"S{season:02d}E{relative_episode:02d}"), - series_name=self.series_name, - premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), - viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), - ) - - logger.warning( - f"Season {season} not found in episodes_info_json! " - f"Processing S{season:02d}E{relative_episode:02d} with filename-only metadata. 
" - f"Scrape episode info for season {season} to get title, premiere date, etc.", - ) - - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=f"S{season:02d}E{relative_episode:02d}", - series_name=self.series_name, - ) - - def build_output_path(self, episode_info: EpisodeInfo, base_dir: Path, extension: str = DEFAULT_VIDEO_EXTENSION) -> Path: - filename = self.file_naming.build_filename(episode_info, extension=extension.lstrip('.')) - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - season_dir = base_dir / season_dir_name - season_dir.mkdir(parents=True, exist_ok=True) - return season_dir / filename - - @staticmethod - def get_episode_subdir(episode_info: EpisodeInfo, subdir: str) -> Path: - return OutputPathBuilder.get_episode_dir(episode_info, subdir) - - @staticmethod - def build_episode_output_path(episode_info: EpisodeInfo, subdir: str, filename: str) -> Path: - return OutputPathBuilder.build_output_path(episode_info, subdir, filename) - - def build_video_path_for_elastic(self, episode_info: EpisodeInfo) -> str: - return OutputPathBuilder.build_elastic_video_path(episode_info, self.series_name) - - def find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool = True) -> Optional[Path]: - return self.file_finder.find_transcription_file(episode_info, search_dir, prefer_segmented) - - @staticmethod - def find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: - finder = EpisodeFileFinder("") - return finder.find_scene_timestamps_file(episode_info, search_dir) - - @staticmethod - def load_scene_timestamps(episode_info: EpisodeInfo, search_dir: Optional[Path], _logger=None) -> Optional[List[Dict[str, Any]]]: - return EpisodeFileFinder.load_scene_timestamps(episode_info, search_dir, _logger) - - @staticmethod - def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]: - return { - "season": episode_info.season, - 
"episode_number": episode_info.relative_episode, - "title": episode_info.title, - "premiere_date": episode_info.premiere_date, - "viewership": episode_info.viewership, - } - - @staticmethod - def get_episode_id_for_state(episode_info: EpisodeInfo) -> str: - return EpisodeInfoParser.get_episode_id(episode_info) - - def list_all_episodes(self) -> List[EpisodeInfo]: - episodes = [] - - if not self.episodes_data: - return episodes - - for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): - season_num = season_data.get(EpisodesDataKeys.SEASON_NUMBER, 1) - season_episodes = sorted( - season_data.get(EpisodesDataKeys.EPISODES, []), - key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0), - ) - - for idx, ep_data in enumerate(season_episodes): - episodes.append( - EpisodeInfo( - absolute_episode=0, - season=season_num, - relative_episode=idx + 1, - title=ep_data.get(EpisodeMetadataKeys.TITLE, f"S{season_num:02d}E{idx + 1:02d}"), - series_name=self.series_name, - premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), - viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), - ), - ) - - return episodes diff --git a/preprocessor/core/episode_parser.py b/preprocessor/core/episode_parser.py deleted file mode 100644 index b0fbdd710..000000000 --- a/preprocessor/core/episode_parser.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging -from pathlib import Path -import re -from typing import Optional - -logger = logging.getLogger(__name__) - - -class EpisodeInfoParser: - @staticmethod - def parse_filename(file_path: Path, episode_manager) -> Optional: - full_path_str = str(file_path) - - match_season_episode = re.search(r'S(\d+)[/\\]?E(\d+)', full_path_str, re.IGNORECASE) - if match_season_episode: - season = int(match_season_episode.group(1)) - episode = int(match_season_episode.group(2)) - return episode_manager.get_episode_by_season_and_relative(season, episode) - - logger.error( - f"Cannot parse episode from filename: {file_path.name}. 
" - f"Expected format: S##E## (e.g., S01E05, S10E13). " - f"Absolute episode numbers (E## without season) are not supported.", - ) - return None - - @staticmethod - def get_episode_id(episode_info) -> str: - return episode_info.episode_code() diff --git a/preprocessor/core/file_naming.py b/preprocessor/core/file_naming.py deleted file mode 100644 index 8bb05deb3..000000000 --- a/preprocessor/core/file_naming.py +++ /dev/null @@ -1,44 +0,0 @@ -from pathlib import Path -from typing import Optional - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) - - -class FileNamingConventions: - def __init__(self, series_name: str): - self.series_name = series_name.lower() - - def build_base_filename(self, episode_info) -> str: - return f"{self.series_name}_{episode_info.episode_code()}" - - def build_filename( - self, - episode_info, - extension: str = "json", - suffix: Optional[str] = None, - ) -> str: - base = self.build_base_filename(episode_info) - suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" - ext = FILE_EXTENSIONS.get(extension, f".{extension}") - return f"{base}{suffix_str}{ext}" - - @staticmethod - def parse_base_filename(filename: str) -> str: - name = Path(filename).stem - for suffix_value in FILE_SUFFIXES.values(): - if name.endswith(suffix_value): - return name[:-len(suffix_value)] - return name - - @staticmethod - def add_suffix_to_filename(filename: str, suffix: str) -> str: - path = Path(filename) - suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" - return str(path.parent / f"{path.stem}{suffix_str}{path.suffix}") - - @staticmethod - def get_suffix(suffix_key: str) -> str: - return FILE_SUFFIXES.get(suffix_key, "") diff --git a/preprocessor/core/model_pool.py b/preprocessor/core/model_pool.py new file mode 100644 index 000000000..c7e6932cf --- /dev/null +++ b/preprocessor/core/model_pool.py @@ -0,0 +1,49 @@ +import threading +from typing import ( + Any, + Callable, + Dict, + Optional, +) + +from 
preprocessor.services.core.logging import ErrorHandlingLogger + + +class ModelPool: + def __init__(self) -> None: + self._models: Dict[str, Any] = {} + self._lock = threading.Lock() + self._ref_counts: Dict[str, int] = {} + + def get_or_load( + self, + model_id: str, + loader: Callable[[], Any], + logger: Optional[ErrorHandlingLogger] = None, + ) -> Any: + with self._lock: + if model_id not in self._models: + if logger: + logger.info(f"Loading model to pool: {model_id}") + self._models[model_id] = loader() + self._ref_counts[model_id] = 0 + + self._ref_counts[model_id] += 1 + return self._models[model_id] + + def release(self, model_id: str, logger: Optional[ErrorHandlingLogger] = None) -> None: + with self._lock: + if model_id in self._ref_counts: + self._ref_counts[model_id] -= 1 + if self._ref_counts[model_id] <= 0: + if logger: + logger.info(f"Removing model from pool: {model_id}") + del self._models[model_id] + del self._ref_counts[model_id] + + def cleanup_all(self, logger: Optional[ErrorHandlingLogger] = None) -> None: + with self._lock: + if logger and self._models: + logger.info(f"Cleaning up {len(self._models)} models from pool") + self._models.clear() + self._ref_counts.clear() diff --git a/preprocessor/core/models/__init__.py b/preprocessor/core/models/__init__.py new file mode 100644 index 000000000..97147acbb --- /dev/null +++ b/preprocessor/core/models/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.core.models.analysis_models import AnalysisData + +__all__ = ['AnalysisData'] diff --git a/preprocessor/core/models/analysis_models.py b/preprocessor/core/models/analysis_models.py new file mode 100644 index 000000000..09d8f478d --- /dev/null +++ b/preprocessor/core/models/analysis_models.py @@ -0,0 +1,22 @@ +from collections import Counter +from dataclasses import dataclass +from typing import ( + Any, + Dict, + List, +) + + +@dataclass(frozen=True) +class AnalysisData: + video_info: List[Dict[str, Any]] + resolution_counts: Counter + total_episodes: int 
+ target_width: int + target_height: int + target_pixels: int + upscaling_count: int + upscaling_pct: float + progressive_count: int + needs_deinterlace_count: int + metadata_mismatch_count: int diff --git a/preprocessor/core/output_descriptors.py b/preprocessor/core/output_descriptors.py new file mode 100644 index 000000000..f41bb40c1 --- /dev/null +++ b/preprocessor/core/output_descriptors.py @@ -0,0 +1,282 @@ +from abc import ( + ABC, + abstractmethod, +) +from dataclasses import dataclass +import json +from pathlib import Path +from typing import ( + Callable, + Dict, + Optional, +) + + +@dataclass +class ValidationResult: + is_valid: bool + message: str = '' + file_count: int = 0 + total_size_bytes: int = 0 + + +class OutputDescriptor(ABC): + def __init__(self, pattern: str, subdir: str = "") -> None: + self._pattern = pattern + self._subdir = subdir + + @property + def pattern(self) -> str: + return self._pattern + + @property + def subdir(self) -> str: + return self._subdir + + @abstractmethod + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + pass + + @abstractmethod + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + pass + + def format_pattern(self, context_vars: Optional[Dict[str, str]] = None) -> str: + if not context_vars: + return self._pattern + return self._pattern.format(**context_vars) + + +class FileOutput(OutputDescriptor): + def __init__( + self, + pattern: str, + subdir: str = "", + min_size_bytes: int = 1, + expected_count: int = 1, + ) -> None: + super().__init__(pattern, subdir) + self._min_size_bytes = min_size_bytes + self._expected_count = expected_count + + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + formatted_pattern = self.format_pattern(context_vars) + return base_dir / self._subdir / formatted_pattern + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = 
None) -> ValidationResult: + file_path = self.resolve_path(base_dir, context_vars) + + if not file_path.exists(): + return ValidationResult( + is_valid=False, + message=f'File does not exist: {file_path}', + ) + + if not file_path.is_file(): + return ValidationResult( + is_valid=False, + message=f'Path exists but is not a file: {file_path}', + ) + + file_size = file_path.stat().st_size + + if file_size < self._min_size_bytes: + return ValidationResult( + is_valid=False, + message=f'File too small ({file_size} bytes < {self._min_size_bytes}): {file_path}', + file_count=1, + total_size_bytes=file_size, + ) + + return ValidationResult( + is_valid=True, + message=f'File valid: {file_path}', + file_count=1, + total_size_bytes=file_size, + ) + + +class DirectoryOutput(OutputDescriptor): + def __init__( + self, + pattern: str, + subdir: str = "", + expected_file_pattern: Optional[str] = None, + min_files: int = 1, + min_size_per_file_bytes: int = 1, + ) -> None: + super().__init__(pattern, subdir) + self._expected_file_pattern = expected_file_pattern + self._min_files = min_files + self._min_size_per_file_bytes = min_size_per_file_bytes + + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + formatted_pattern = self.format_pattern(context_vars) + return base_dir / self._subdir / formatted_pattern + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + dir_path = self.resolve_path(base_dir, context_vars) + + if not dir_path.exists(): + return ValidationResult( + is_valid=False, + message=f'Directory does not exist: {dir_path}', + ) + + if not dir_path.is_dir(): + return ValidationResult( + is_valid=False, + message=f'Path exists but is not a directory: {dir_path}', + ) + + if self._expected_file_pattern: + files = list(dir_path.glob(self._expected_file_pattern)) + else: + files = [f for f in dir_path.iterdir() if f.is_file()] + + if len(files) < self._min_files: + return 
ValidationResult( + is_valid=False, + message=( + f'Not enough files in directory ({len(files)} < {self._min_files}): ' + f'{dir_path}' + ), + file_count=len(files), + ) + + total_size = 0 + for file_path in files: + file_size = file_path.stat().st_size + total_size += file_size + + if file_size < self._min_size_per_file_bytes: + return ValidationResult( + is_valid=False, + message=( + f'File too small ({file_size} bytes < {self._min_size_per_file_bytes}): ' + f'{file_path}' + ), + file_count=len(files), + total_size_bytes=total_size, + ) + + return ValidationResult( + is_valid=True, + message=f'Directory valid: {dir_path} ({len(files)} files, {total_size} bytes)', + file_count=len(files), + total_size_bytes=total_size, + ) + + +class JsonFileOutput(FileOutput): + def __init__( + self, + pattern: str, + subdir: str = "", + min_size_bytes: int = 2, + schema_validator: Optional[Callable[[Dict], bool]] = None, + ) -> None: + super().__init__(pattern, subdir, min_size_bytes) + self._schema_validator = schema_validator + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + basic_validation = super().validate(base_dir, context_vars) + + if not basic_validation.is_valid: + return basic_validation + + file_path = self.resolve_path(base_dir, context_vars) + + try: + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + except json.JSONDecodeError as e: + return ValidationResult( + is_valid=False, + message=f'Invalid JSON in {file_path}: {e}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + except Exception as e: + return ValidationResult( + is_valid=False, + message=f'Failed to read JSON from {file_path}: {e}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + + if self._schema_validator: + try: + if not self._schema_validator(data): + return ValidationResult( + is_valid=False, + message=f'JSON schema validation failed: {file_path}', + 
file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + except Exception as e: + return ValidationResult( + is_valid=False, + message=f'Schema validation error for {file_path}: {e}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + + return ValidationResult( + is_valid=True, + message=f'JSON file valid: {file_path}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + + +class GlobalOutput(OutputDescriptor): + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + formatted_pattern = self.format_pattern(context_vars) + if self._subdir: + return base_dir / self._subdir / formatted_pattern + return base_dir / formatted_pattern + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + file_path = self.resolve_path(base_dir, context_vars) + + if not file_path.exists(): + return ValidationResult( + is_valid=False, + message=f'Global output does not exist: {file_path}', + ) + + if file_path.is_file(): + file_size = file_path.stat().st_size + return ValidationResult( + is_valid=True, + message=f'Global file valid: {file_path}', + file_count=1, + total_size_bytes=file_size, + ) + + if file_path.is_dir(): + files = [f for f in file_path.rglob('*') if f.is_file()] + total_size = sum(f.stat().st_size for f in files) + return ValidationResult( + is_valid=True, + message=f'Global directory valid: {file_path} ({len(files)} files)', + file_count=len(files), + total_size_bytes=total_size, + ) + + return ValidationResult( + is_valid=False, + message=f'Global output path is neither file nor directory: {file_path}', + ) + + +def create_frames_output() -> DirectoryOutput: + """Create standard DirectoryOutput descriptor for exported frames.""" + return DirectoryOutput( + pattern="{season}/{episode}", + subdir="frames", + expected_file_pattern="*.png", + min_files=1, + min_size_per_file_bytes=1024, + ) diff --git 
a/preprocessor/core/output_path_builder.py b/preprocessor/core/output_path_builder.py deleted file mode 100644 index 6bfba6456..000000000 --- a/preprocessor/core/output_path_builder.py +++ /dev/null @@ -1,72 +0,0 @@ -from pathlib import Path - -from preprocessor.config.config import ( - BASE_OUTPUT_DIR, - settings, -) -from preprocessor.core.constants import ( - DEFAULT_VIDEO_EXTENSION, - FILE_EXTENSIONS, -) - - -class OutputPathBuilder: - @staticmethod - def get_episode_dir(episode_info, base_subdir: str) -> Path: - season_code = f"S{episode_info.season:02d}" - episode_code = f"E{episode_info.relative_episode:02d}" - return BASE_OUTPUT_DIR / base_subdir / season_code / episode_code - - @staticmethod - def get_season_dir(episode_info) -> str: - return f"S{episode_info.season:02d}" - - @staticmethod - def build_transcription_path(episode_info, filename: str, subdir: str = "raw") -> Path: - season_code = f"S{episode_info.season:02d}" - episode_code = f"E{episode_info.relative_episode:02d}" - path = BASE_OUTPUT_DIR / settings.output_subdirs.transcriptions / season_code / episode_code / subdir / filename - path.parent.mkdir(parents=True, exist_ok=True) - return path - - @staticmethod - def build_output_path(episode_info, subdir: str, filename: str) -> Path: - path = OutputPathBuilder.get_episode_dir(episode_info, subdir) / filename - path.parent.mkdir(parents=True, exist_ok=True) - return path - - @staticmethod - def build_video_path(episode_info, series_name: str, extension: str = DEFAULT_VIDEO_EXTENSION) -> Path: - filename = f"{series_name.lower()}_{episode_info.episode_code()}{extension}" - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - season_dir = BASE_OUTPUT_DIR / settings.output_subdirs.video / season_dir_name - season_dir.mkdir(parents=True, exist_ok=True) - return season_dir / filename - - @staticmethod - def build_elastic_video_path(episode_info, series_name: str) -> str: - filename = 
f"{series_name.lower()}_{episode_info.episode_code()}{FILE_EXTENSIONS['mp4']}" - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - path = Path("bot") / f"{series_name.upper()}-WIDEO" / season_dir_name / filename - return path.as_posix() - - @staticmethod - def build_embedding_path(episode_info, filename: str) -> Path: - return OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.embeddings, - filename, - ) - - @staticmethod - def build_scene_path(episode_info, filename: str) -> Path: - return OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.scenes, - filename, - ) - - @staticmethod - def build_elastic_document_path(episode_info, subdoc_type: str, filename: str) -> Path: - full_subdir = f"{settings.output_subdirs.elastic_documents}/{subdoc_type}" - return OutputPathBuilder.build_output_path(episode_info, full_subdir, filename) diff --git a/preprocessor/core/processing_metadata.py b/preprocessor/core/processing_metadata.py index 8eb2ab96d..502bd1bdc 100644 --- a/preprocessor/core/processing_metadata.py +++ b/preprocessor/core/processing_metadata.py @@ -3,7 +3,6 @@ field, ) from datetime import datetime -import json from pathlib import Path from typing import ( Any, @@ -17,104 +16,125 @@ class StepMetadata: name: str step_num: str - start_time: Optional[datetime] = None - end_time: Optional[datetime] = None duration_seconds: Optional[float] = None - status: str = "pending" + end_time: Optional[datetime] = None exit_code: Optional[int] = None extra_info: Dict[str, Any] = field(default_factory=dict) + start_time: Optional[datetime] = None + status: str = 'pending' - def start(self): - self.start_time = datetime.now() - self.status = "running" - - def finish(self, exit_code: int): - self.end_time = datetime.now() - self.exit_code = exit_code - if self.start_time: - self.duration_seconds = (self.end_time - self.start_time).total_seconds() - self.status = "success" if exit_code == 0 else "failed" + 
def skip(self) -> None: + self.status = 'skipped' - def skip(self): - self.status = "skipped" + def start(self) -> None: + self.start_time = datetime.now() + self.status = 'running' def to_dict(self) -> Dict[str, Any]: return { - "name": self.name, - "step_num": self.step_num, - "start_time": self.start_time.isoformat() if self.start_time else None, - "end_time": self.end_time.isoformat() if self.end_time else None, - "duration_seconds": round(self.duration_seconds, 2) if self.duration_seconds else None, - "status": self.status, - "exit_code": self.exit_code, - "extra_info": self.extra_info, + 'name': self.name, + 'step_num': self.step_num, + 'start_time': self.start_time.isoformat() if self.start_time else None, + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'duration_seconds': ( + round(self.duration_seconds, 2) if self.duration_seconds else None + ), + 'status': self.status, + 'exit_code': self.exit_code, + 'extra_info': self.extra_info, } class ProcessingMetadata: - def __init__(self, series_name: str, params: Dict[str, Any]): - self.series_name = series_name - self.params = self.__sanitize_params(params) - self.start_time = datetime.now() - self.end_time: Optional[datetime] = None - self.total_duration_seconds: Optional[float] = None - self.steps: List[StepMetadata] = [] - self.final_status = "running" - - @staticmethod - def __sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: - sanitized = {} - for key, value in params.items(): - if key in set("state_manager"): - continue - if isinstance(value, Path): - sanitized[key] = str(value) - elif isinstance(value, (str, int, float, bool, list, dict, type(None))): - sanitized[key] = value - else: - sanitized[key] = str(value) - return sanitized + def __init__(self, series_name: str, params: Dict[str, Any]) -> None: + self.__series_name = series_name + self.__params = self.__sanitize_params(params) + self.__start_time = datetime.now() + self.__end_time: Optional[datetime] = None + 
self.__total_duration_seconds: Optional[float] = None + self.__steps: List[StepMetadata] = [] + self.__final_status = 'running' + + @property + def final_status(self) -> str: + return self.__final_status + + @final_status.setter + def final_status(self, value: str) -> None: + self.__final_status = value + + @property + def end_time(self) -> Optional[datetime]: + return self.__end_time + + @end_time.setter + def end_time(self, value: datetime) -> None: + self.__end_time = value + + @property + def total_duration_seconds(self) -> Optional[float]: + return self.__total_duration_seconds + + @total_duration_seconds.setter + def total_duration_seconds(self, value: float) -> None: + self.__total_duration_seconds = value def add_step(self, name: str, step_num: str) -> StepMetadata: step = StepMetadata(name=name, step_num=step_num) - self.steps.append(step) + self.__steps.append(step) return step - def finish_processing(self, final_exit_code: int, additional_stats: Optional[Dict[str, Any]] = None): - self.end_time = datetime.now() - self.total_duration_seconds = (self.end_time - self.start_time).total_seconds() - self.final_status = "success" if final_exit_code == 0 else "failed" - if additional_stats: - self.params["additional_statistics"] = additional_stats + def to_dict(self) -> Dict[str, Any]: + return { + 'series_name': self.__series_name, + 'start_time': self.__start_time.isoformat(), + 'end_time': self.__end_time.isoformat() if self.__end_time else None, + 'final_status': self.__final_status, + 'parameters': self.__params, + 'steps': [step.to_dict() for step in self.__steps], + 'statistics': self.__get_statistics(), + } def __get_statistics(self) -> Dict[str, Any]: - completed_steps = [s for s in self.steps if s.status == "success"] - failed_steps = [s for s in self.steps if s.status == "failed"] - skipped_steps = [s for s in self.steps if s.status == "skipped"] + completed_steps = [s for s in self.__steps if s.status == 'success'] + failed_steps = [s for s in 
self.__steps if s.status == 'failed'] + skipped_steps = [s for s in self.__steps if s.status == 'skipped'] - step_durations = [s.duration_seconds for s in self.steps if s.duration_seconds is not None] + step_durations = [ + s.duration_seconds for s in self.__steps if s.duration_seconds is not None + ] - return { - "total_steps": len(self.steps), - "completed_steps": len(completed_steps), - "failed_steps": len(failed_steps), - "skipped_steps": len(skipped_steps), - "total_duration_seconds": round(self.total_duration_seconds, 2) if self.total_duration_seconds else None, - "average_step_duration_seconds": round(sum(step_durations) / len(step_durations), 2) if step_durations else None, - } + avg_duration = ( + round(sum(step_durations) / len(step_durations), 2) + if step_durations else None + ) - def to_dict(self) -> Dict[str, Any]: return { - "series_name": self.series_name, - "start_time": self.start_time.isoformat(), - "end_time": self.end_time.isoformat() if self.end_time else None, - "final_status": self.final_status, - "parameters": self.params, - "steps": [step.to_dict() for step in self.steps], - "statistics": self.__get_statistics(), + 'total_steps': len(self.__steps), + 'completed_steps': len(completed_steps), + 'failed_steps': len(failed_steps), + 'skipped_steps': len(skipped_steps), + 'total_duration_seconds': ( + round(self.__total_duration_seconds, 2) + if self.__total_duration_seconds else None + ), + 'average_step_duration_seconds': avg_duration, } - def save_to_file(self, output_path: Path): - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, "w", encoding="utf-8") as f: - json.dump(self.to_dict(), f, indent=2, ensure_ascii=False) + @staticmethod + def __sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: + sanitized: Dict[str, Any] = {} + ignored_keys = {'state_manager'} + + for key, value in params.items(): + if key in ignored_keys: + continue + + if isinstance(value, Path): + sanitized[key] = str(value) + elif 
isinstance(value, (str, int, float, bool, list, dict, type(None))): + sanitized[key] = value + else: + sanitized[key] = str(value) + + return sanitized diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 9929748e1..61be585fa 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -6,8 +6,7 @@ from datetime import datetime import json from pathlib import Path -import signal -import sys +import threading from typing import ( Any, Dict, @@ -15,180 +14,176 @@ Optional, ) -from preprocessor.utils.console import console +from preprocessor.core.temp_files import StepTempFile +from preprocessor.services.ui.console import console -@dataclass +@dataclass(frozen=True) class StepCheckpoint: - step: str - episode: str completed_at: str + episode: str + step: str -@dataclass +@dataclass(frozen=True) class InProgressStep: - step: str episode: str started_at: str + step: str temp_files: List[str] = field(default_factory=list) @dataclass class ProcessingState: + last_checkpoint: str series_name: str started_at: str - last_checkpoint: str completed_steps: List[StepCheckpoint] = field(default_factory=list) - in_progress: Optional[InProgressStep] = None + in_progress: List[InProgressStep] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return { - "series_name": self.series_name, - "started_at": self.started_at, - "last_checkpoint": self.last_checkpoint, - "completed_steps": [asdict(step) for step in self.completed_steps], - "in_progress": asdict(self.in_progress) if self.in_progress else None, + 'series_name': self.series_name, + 'started_at': self.started_at, + 'last_checkpoint': self.last_checkpoint, + 'completed_steps': [asdict(step) for step in self.completed_steps], + 'in_progress': [asdict(step) for step in self.in_progress], } @classmethod - def _from_dict(cls, data: Dict[str, Any]) -> "ProcessingState": + def from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': completed_steps = 
[ - StepCheckpoint(**step) for step in data.get("completed_steps", []) + StepCheckpoint(**step) for step in data.get('completed_steps', []) ] - in_progress_data = data.get("in_progress") - in_progress = InProgressStep(**in_progress_data) if in_progress_data else None + in_progress_data = data.get('in_progress', []) + in_progress = ( + [InProgressStep(**step) for step in in_progress_data] + if isinstance(in_progress_data, list) + else [] + ) return cls( - series_name=data["series_name"], - started_at=data["started_at"], - last_checkpoint=data["last_checkpoint"], + series_name=data['series_name'], + started_at=data['started_at'], + last_checkpoint=data['last_checkpoint'], completed_steps=completed_steps, in_progress=in_progress, ) class StateManager: - STATE_FILE: str = ".preprocessing_state.json" + __STATE_FILE_TEMPLATE: str = '.preprocessing_state_{series}.json' + __lock = threading.Lock() + + def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: + self.__series_name = series_name - def __init__(self, series_name: str, working_dir: Path = Path(".")) -> None: - self.__series_name: str = series_name - self.__state_file: Path = working_dir / self.STATE_FILE + state_filename = self.__STATE_FILE_TEMPLATE.format(series=series_name) + self.__state_file: Path = working_dir / state_filename self.__state: Optional[ProcessingState] = None - self.__cleanup_registered: bool = False - self.__interrupted: bool = False + + def cleanup(self) -> None: + with self.__lock: + if self.__state_file.exists(): + console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') + self.__state_file.unlink() + + def is_step_completed(self, step: str, episode: str) -> bool: + if self.__state is None: + return False + + return any( + s.step == step and s.episode == episode + for s in self.__state.completed_steps + ) def load_or_create_state(self) -> ProcessingState: if self.__state_file.exists(): - console.print(f"[yellow]Found existing state file: 
{self.__state_file}[/yellow]") - with open(self.__state_file, "r", encoding="utf-8") as f: - data = json.load(f) - self.__state = ProcessingState._from_dict(data) - console.print(f"[green]Loaded state for series: {self.__state.series_name}[/green]") - console.print(f"[green]Completed steps: {len(self.__state.completed_steps)}[/green]") - return self.__state - else: - console.print("[blue]Creating new processing state...[/blue]") - now = datetime.now().isoformat() - self.__state = ProcessingState( - series_name=self.__series_name, - started_at=now, - last_checkpoint=now, - ) - self.__save_state() - return self.__state + return self.__load_existing_state() + return self.__create_new_state() - def __save_state(self) -> None: - if self.__state is None: - return + def mark_step_completed(self, step: str, episode: str) -> None: + with self.__lock: + self.__ensure_state_initialized() - self.__state.last_checkpoint = datetime.now().isoformat() - with open(self.__state_file, "w", encoding="utf-8") as f: - json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) + checkpoint = StepCheckpoint( + step=step, + episode=episode, + completed_at=datetime.now().isoformat(), + ) - def mark_step_started(self, step: str, episode: str, temp_files: Optional[List[str]] = None) -> None: - if self.__state is None: - raise RuntimeError("State not initialized") + self.__state.completed_steps.append(checkpoint) + self.__state.in_progress = [ + s for s in self.__state.in_progress + if not (s.step == step and s.episode == episode) + ] + self.__save_state() - self.__state.in_progress = InProgressStep( - step=step, - episode=episode, - started_at=datetime.now().isoformat(), - temp_files=temp_files or [], - ) - self.__save_state() - console.print(f"[cyan]Started: {step} for {episode}[/cyan]") + console.print(f'[green]Completed: {step} for {episode}[/green]') - def mark_step_completed(self, step: str, episode: str) -> None: - if self.__state is None: - raise RuntimeError("State not 
initialized") + def mark_step_started( + self, step: str, episode: str, temp_files: Optional[List[str]] = None, + ) -> None: + with self.__lock: + self.__ensure_state_initialized() - checkpoint = StepCheckpoint( - step=step, - episode=episode, - completed_at=datetime.now().isoformat(), - ) - self.__state.completed_steps.append(checkpoint) - self.__state.in_progress = None - self.__save_state() - console.print(f"[green]✓ Completed: {step} for {episode}[/green]") + in_progress_step = InProgressStep( + step=step, + episode=episode, + started_at=datetime.now().isoformat(), + temp_files=temp_files or [], + ) + self.__state.in_progress.append(in_progress_step) + self.__save_state() - def is_step_completed(self, step: str, episode: str) -> bool: - if self.__state is None: - return False + console.print(f'[cyan]Started: {step} for {episode}[/cyan]') - return any( - s.step == step and s.episode == episode - for s in self.__state.completed_steps - ) + def __load_existing_state(self) -> ProcessingState: + console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') - def __rollback_in_progress(self) -> None: - if self.__state is None or self.__state.in_progress is None: - return + with open(self.__state_file, 'r', encoding='utf-8') as f: + data = json.load(f) - console.print(f"[yellow]Rolling back in-progress step: {self.__state.in_progress.step}[/yellow]") + self.__state = ProcessingState.from_dict(data) - for temp_file in self.__state.in_progress.temp_files: - temp_path = Path(temp_file) - if temp_path.exists(): - try: - temp_path.unlink() - console.print(f"[yellow]Removed temp file: {temp_file}[/yellow]") - except OSError as e: - console.print(f"[red]Failed to remove {temp_file}: {e}[/red]") + console.print(f'[green]Loaded state for series: {self.__state.series_name}[/green]') + console.print(f'[green]Completed steps: {len(self.__state.completed_steps)}[/green]') + return self.__state - self.__state.in_progress = None + def __create_new_state(self) -> 
ProcessingState: + console.print('[blue]Creating new processing state...[/blue]') + now = datetime.now().isoformat() + + self.__state = ProcessingState( + series_name=self.__series_name, + started_at=now, + last_checkpoint=now, + ) self.__save_state() + return self.__state - def cleanup(self) -> None: - if self.__state_file.exists(): - console.print(f"[blue]Cleaning up state file: {self.__state_file}[/blue]") - self.__state_file.unlink() + def __ensure_state_initialized(self) -> None: + if self.__state is None: + raise RuntimeError('State not initialized. Call load_or_create_state() first.') - def register_interrupt_handler(self) -> None: - if self.__cleanup_registered: + def __save_state(self) -> None: + if self.__state is None: return - def _signal_handler(_sig: int, _frame: Any) -> None: - if self.__interrupted: - console.print("\n[red]Force quit! Not cleaning up.[/red]") - sys.exit(1) - - self.__interrupted = True - console.print("\n[yellow]Interrupt received (Ctrl+C)...[/yellow]") - console.print("[yellow]Rolling back incomplete work...[/yellow]") - self.__rollback_in_progress() - console.print("[green]Cleanup complete. 
You can resume later.[/green]") - console.print("[blue]To resume: run the same command again[/blue]") - sys.exit(0) - - signal.signal(signal.SIGINT, _signal_handler) - signal.signal(signal.SIGTERM, _signal_handler) - self.__cleanup_registered = True - console.print("[blue]Interrupt handler registered (Ctrl+C to safely stop)[/blue]") - - def get_resume_info(self) -> Optional[str]: - if self.__state is None or not self.__state.completed_steps: - return None - - last_step = self.__state.completed_steps[-1] - return f"Resuming from: {last_step.step} ({last_step.episode}) at {last_step.completed_at}" + self.__state.last_checkpoint = datetime.now().isoformat() + with StepTempFile(self.__state_file) as temp_path: + with open(temp_path, 'w', encoding='utf-8') as f: + json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) + + def rebuild_state(self, completed_steps: List[StepCheckpoint]) -> ProcessingState: + now = datetime.now().isoformat() + self.__state = ProcessingState( + series_name=self.__series_name, + started_at=now, + last_checkpoint=now, + completed_steps=completed_steps, + in_progress=[], + ) + self.__save_state() + console.print(f'[green]State rebuilt with {len(completed_steps)} completed steps[/green]') + return self.__state diff --git a/preprocessor/core/state_reconstruction.py b/preprocessor/core/state_reconstruction.py new file mode 100644 index 000000000..0d2e0cb47 --- /dev/null +++ b/preprocessor/core/state_reconstruction.py @@ -0,0 +1,100 @@ +from datetime import datetime +from pathlib import Path +from typing import ( + Dict, + List, +) + +from preprocessor.app.pipeline import PipelineDefinition +from preprocessor.core.state_manager import StepCheckpoint +from preprocessor.services.episodes.types import EpisodeInfo +from preprocessor.services.ui.console import console + + +class StateReconstructor: + @staticmethod + def scan_filesystem( + pipeline: PipelineDefinition, + episodes_list: List[EpisodeInfo], + base_output_dir: Path, + 
series_name: str, + ) -> List[StepCheckpoint]: + console.print('[cyan]Reconstructing state from filesystem...[/cyan]') + + now = datetime.now().isoformat() + completed_steps: List[StepCheckpoint] = [] + + total_checked = 0 + total_completed = 0 + + for step_id, step_def in pipeline.get_all_steps().items(): + step_instance = step_def.step_class(step_def.config) + step_name = step_instance.name + + if step_instance.is_global: + if StateReconstructor.__check_global_step_outputs(step_def, base_output_dir): + checkpoint = StepCheckpoint( + step=step_name, + episode='all', + completed_at=now, + ) + completed_steps.append(checkpoint) + total_completed += 1 + console.print(f'[green]✓ {step_id} ({step_name}) - global[/green]') + else: + console.print(f'[yellow]✗ {step_id} ({step_name}) - global - outputs missing[/yellow]') + total_checked += 1 + else: + for episode_info in episodes_list: + episode_id = f'S{episode_info.season:02d}E{episode_info.relative_episode:02d}' + context_vars = { + 'season': episode_info.season_code(), + 'episode': episode_info.episode_code(), + 'episode_num': episode_info.episode_num(), + 'series_name': series_name, + } + + if StateReconstructor.__check_episode_step_outputs( + step_def, base_output_dir, context_vars, + ): + checkpoint = StepCheckpoint( + step=step_name, + episode=episode_id, + completed_at=now, + ) + completed_steps.append(checkpoint) + total_completed += 1 + total_checked += 1 + + console.print('\n[green]Filesystem scan complete:[/green]') + console.print(f' Checked: {total_checked} step-episode combinations') + console.print(f' Found completed: {total_completed}') + console.print(f' Missing: {total_checked - total_completed}') + + return completed_steps + + @staticmethod + def __check_global_step_outputs(step_def, base_output_dir: Path) -> bool: + descriptors = step_def.get_output_descriptors() + if not descriptors: + return True + + return all( + descriptor.validate(base_output_dir).is_valid + for descriptor in descriptors + ) + + 
@staticmethod + def __check_episode_step_outputs( + step_def, + base_output_dir: Path, + context_vars: Dict[str, str], + ) -> bool: + descriptors = step_def.get_output_descriptors() + if not descriptors: + return True + + return all( + descriptor.validate(base_output_dir, context_vars).is_valid + for descriptor in descriptors + ) diff --git a/preprocessor/core/temp_files.py b/preprocessor/core/temp_files.py new file mode 100644 index 000000000..3efd86c17 --- /dev/null +++ b/preprocessor/core/temp_files.py @@ -0,0 +1,37 @@ +from pathlib import Path +from typing import Optional + + +class StepTempFile: + def __init__(self, final_path: Path, temp_suffix: str = '.tmp') -> None: + self.__final_path: Path = final_path + self.__temp_suffix: str = temp_suffix + self.__temp_path: Optional[Path] = None + + @property + def final_path(self) -> Path: + return self.__final_path + + @property + def temp_path(self) -> Path: + if self.__temp_path is None: + raise RuntimeError('Context manager not entered yet') + return self.__temp_path + + def __enter__(self) -> Path: + self.__temp_path = self.__final_path.with_suffix( + f'{self.__final_path.suffix}{self.__temp_suffix}', + ) + self.__temp_path.parent.mkdir(parents=True, exist_ok=True) + return self.__temp_path + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + if self.__temp_path is None: + return False + + if exc_type is None: + self.__temp_path.replace(self.__final_path) + elif self.__temp_path.exists(): + self.__temp_path.unlink() + + return False diff --git a/preprocessor/embeddings/embedding_generator.py b/preprocessor/embeddings/embedding_generator.py deleted file mode 100644 index 81b2f00ef..000000000 --- a/preprocessor/embeddings/embedding_generator.py +++ /dev/null @@ -1,813 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, -) - -import numpy as np -import torch - -from preprocessor.config.config import settings -from 
preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import FILE_SUFFIXES -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder -from preprocessor.utils.batch_processing_utils import compute_embeddings_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.constants import EpisodeMetadataKeys -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode -from preprocessor.utils.metadata_utils import create_processing_metadata - -# pylint: disable=duplicate-code - - - -class EmbeddingGenerator(BaseProcessor): # pylint: disable=too-many-instance-attributes - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=9, - loglevel=logging.DEBUG, - ) - - self.transcription_jsons: Path = self._args["transcription_jsons"] - self.frames_dir: Path = self._args.get("frames_dir", settings.frame_export.output_dir) - self.output_dir: Path = self._args.get("output_dir", settings.embedding.default_output_dir) - - self.model_name: str = self._args.get("model", settings.embedding_model.model_name) - self.model_revision: str = self._args.get("model_revision", settings.embedding_model.model_revision) - self.batch_size: int = self._args.get("batch_size", settings.embedding.batch_size) - self.device: str = "cuda" - - self.segments_per_embedding: int = self._args.get("segments_per_embedding", settings.text_chunking.segments_per_embedding) - self.text_sentences_per_chunk: int = self._args.get("text_sentences_per_chunk", 
settings.text_chunking.text_sentences_per_chunk) - self.text_chunk_overlap: int = self._args.get("text_chunk_overlap", settings.text_chunking.text_chunk_overlap) - self.generate_text: bool = self._args.get("generate_text", True) - self.generate_video: bool = self._args.get("generate_video", True) - self.generate_episode_names: bool = self._args.get("generate_episode_names", True) - self.generate_full_episode: bool = self._args.get("generate_full_episode", settings.embedding.generate_full_episode_embedding) - self.generate_sound_events: bool = self._args.get("generate_sound_events", True) - - self.image_hashes_dir: Path = Path(self._args.get("image_hashes_dir", settings.image_hash.output_dir)) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.model = None - self.processor = None - self.gpu_processor: Optional[GPUBatchProcessor] = None - self.episode_name_embedder: Optional[EpisodeNameEmbedder] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "transcription_jsons" not in args: - raise ValueError("transcription_jsons is required") - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
This application requires GPU.") - - def cleanup(self) -> None: - console.print("[cyan]Unloading embedding model...[/cyan]") - self.model = None - self.processor = None - self._cleanup_memory() - console.print("[green]✓ Model unloaded[/green]") - - def _get_processing_items(self) -> List[ProcessingItem]: - all_transcription_files = list(self.transcription_jsons.glob("**/*.json")) - items = [] - seen_episodes = set() - - for trans_file in all_transcription_files: - if "_simple.json" in trans_file.name or "_text_stats.json" in trans_file.name: - continue - - if trans_file.parent.name in {"clean", "sound_events"}: - continue - - if not trans_file.name.endswith("_segmented.json"): - segmented_version = trans_file.parent / f"{trans_file.stem}_segmented.json" - if segmented_version.exists(): - continue - - episode_info = self.episode_manager.parse_filename(trans_file) - if episode_info: - episode_key = (episode_info.season, episode_info.relative_episode) - if episode_key in seen_episodes: - continue - seen_episodes.add(episode_key) - - items.append(self._create_transcription_processing_item(trans_file)) - - return items - - def _should_skip_item(self, item: ProcessingItem): - trans_file = item.input_path - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" - - if clean_transcription_file.exists(): - try: - with open(clean_transcription_file, "r", encoding="utf-8") as f: - data = json.load(f) - segments = data.get("segments", []) - if not segments: - episode_id = item.episode_id - self.logger.warning( - f"Empty clean transcription (no text segments) for {episode_id}, " - f"will skip text embeddings but generate other types (sound 
events, episode names, etc.)", - ) - except Exception as e: - self.logger.error(f"Failed to read {clean_transcription_file}: {e}") - - return super()._should_skip_item(item) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - outputs = [] - episode_info = self.episode_manager.parse_filename(item.input_path) - if not episode_info: - return outputs - - if self.generate_text: - text_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_text", - ) - text_output = OutputPathBuilder.build_embedding_path(episode_info, text_filename) - outputs.append(OutputSpec(path=text_output, required=True)) - - if self.generate_episode_names: - episode_name_filename = f"{FILE_SUFFIXES['episode_name']}.json" - episode_name_output = OutputPathBuilder.build_embedding_path(episode_info, episode_name_filename) - outputs.append(OutputSpec(path=episode_name_output, required=True)) - - if self.generate_video: - video_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_video", - ) - video_output = OutputPathBuilder.build_embedding_path(episode_info, video_filename) - outputs.append(OutputSpec(path=video_output, required=True)) - - if self.generate_full_episode: - full_episode_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_full_episode", - ) - full_episode_output = OutputPathBuilder.build_embedding_path(episode_info, full_episode_filename) - outputs.append(OutputSpec(path=full_episode_output, required=True)) - - if self.generate_sound_events: - sound_events_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_sound_events", - ) - sound_events_output = OutputPathBuilder.build_embedding_path(episode_info, sound_events_filename) - outputs.append(OutputSpec(path=sound_events_output, required=True)) - - return 
outputs - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - temp_files = [] - expected_outputs = self._get_expected_outputs(item) - for output in expected_outputs: - temp_path = output.path.with_suffix('.json.tmp') - temp_files.append(str(temp_path)) - return temp_files - - def _get_processing_info(self) -> List[str]: - return [ - f"[cyan]Loading model: {self.model_name}[/cyan]", - f"[cyan]Device: {self.device}[/cyan]", - f"[cyan]Batch size: {self.batch_size}[/cyan]", - ] - - def _load_resources(self) -> bool: - self.__load_model() - self.gpu_processor = GPUBatchProcessor( - self.model, - self.batch_size, - self.logger, - self.device, - progress_sub_batch_size=settings.embedding.progress_sub_batch_size, - ) - self.episode_name_embedder = EpisodeNameEmbedder( - model=self.model, - episode_manager=self.episode_manager, - series_name=self.series_name, - logger=self.logger, - ) - return True - - def __load_model(self) -> None: - try: - self.model = Qwen3VLEmbedder( - model_name_or_path=self.model_name, - torch_dtype=torch.bfloat16, - ) - console.print("[green]Qwen3-VL-Embedding model loaded successfully (vLLM)[/green]") - except Exception as e: - self.logger.error(f"Failed to load model: {e}") - raise - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals,too-many-statements - trans_file = item.input_path - - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" - - if not clean_transcription_file.exists(): - self.logger.warning(f"Clean transcription not found: {clean_transcription_file}, skipping text embeddings generation") - with 
open(trans_file, "r", encoding="utf-8") as f: - data = json.load(f) - data["segments"] = [] - else: - with open(clean_transcription_file, "r", encoding="utf-8") as f: - data = json.load(f) - - has_segments = bool(data.get("segments")) - segmented_file = trans_file.parent / f"{trans_file.stem}_segmented.json" - - if not has_segments and segmented_file.exists(): - return - - need_text = any("embeddings_text.json" in str(o.path) for o in missing_outputs) - need_video = any("embeddings_video.json" in str(o.path) for o in missing_outputs) - need_episode_name = any("episode_name_embedding.json" in str(o.path) for o in missing_outputs) - need_full_episode = any("embeddings_full_episode.json" in str(o.path) for o in missing_outputs) - need_sound_events = any("embeddings_sound_events.json" in str(o.path) for o in missing_outputs) - - text_embeddings = [] - if need_text: - text_embeddings = self.__generate_text_embeddings(data) - - sound_event_embeddings = [] - if need_sound_events: - sound_event_embeddings = self.__generate_sound_event_embeddings(trans_file) - - video_embeddings = [] - if need_video: - episode_info = data.get("episode_info", {}) - frame_metadata = self.__load_frame_metadata(episode_info) - if frame_metadata: - video_embeddings = self.__generate_video_embeddings(episode_info, frame_metadata) - - if need_episode_name and self.episode_name_embedder: - self.episode_name_embedder.generate_and_save_for_transcription(data) - - full_episode_embedding = None - if need_full_episode: - full_episode_embedding = self.__generate_full_episode_embedding(trans_file) - - episode_dir = self.__get_episode_output_dir(trans_file) - episode_info_dict = data.get("episode_info", {}) - season = episode_info_dict.get("season", 0) - episode_num = episode_info_dict.get("episode_number", 0) - - episode_info_temp = self.episode_manager.get_episode_by_season_and_relative(season, episode_num) - if episode_info_temp: - episode_code = episode_info_temp.episode_code() - else: - episode_code = 
f"S{season:02d}E{episode_num:02d}" - - text_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_text.json" - video_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_video.json" - full_episode_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_full_episode.json" - sound_events_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_sound_events.json" - self.__save_embeddings( - data, - text_embeddings, - video_embeddings, - full_episode_embedding, - sound_event_embeddings, - text_output, - video_output, - full_episode_output, - sound_events_output, - ) - self._cleanup_memory() - - def __generate_text_embeddings(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals - segments = data.get("segments", []) - if not segments: - return [] - - text_chunks = [] - chunk_metadata = [] - - if True: # Always use sentence-based chunking for text # pylint: disable=using-constant-test - full_text = " ".join([seg.get("text", "") for seg in segments]) - sentences = self.__split_into_sentences(full_text) - - sentences_per_chunk = self.text_sentences_per_chunk - overlap = self.text_chunk_overlap - step = sentences_per_chunk - overlap - - for i in range(0, len(sentences), step): - chunk_sentences = sentences[i:i + sentences_per_chunk] - if not chunk_sentences: - continue - - chunk_text = " ".join(chunk_sentences).strip() - if not chunk_text: - continue - - char_start = sum(len(s) + 1 for s in sentences[:i]) - char_end = char_start + len(chunk_text) - - start_seg_id = self.__find_segment_at_position(segments, char_start) - end_seg_id = self.__find_segment_at_position(segments, char_end) - - text_chunks.append(chunk_text) - chunk_metadata.append({ - "segment_range": [start_seg_id, end_seg_id], - "text": chunk_text, - }) - else: - for i in range(0, len(segments), self.segments_per_embedding): - chunk = segments[i: i + 
self.segments_per_embedding] - combined_text = " ".join([seg.get("text", "") for seg in chunk]) - - if combined_text.strip(): - text_chunks.append(combined_text) - chunk_metadata.append({ - "segment_range": [i, i + len(chunk) - 1], - "text": combined_text, - }) - - if not text_chunks: - return [] - - embeddings = [] - text_batch_size = settings.embedding.text_batch_size - - with self.progress.track_operation( - f"Text embeddings ({len(text_chunks)} chunks)", - (len(text_chunks) + text_batch_size - 1) // text_batch_size, - ) as tracker: - for batch_idx in range(0, len(text_chunks), text_batch_size): - batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] - batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] - - try: - batch_embeddings = self.__encode_text_batch(batch_texts) - for meta, embedding in zip(batch_meta, batch_embeddings): - embeddings.append({ - **meta, - "embedding": embedding.tolist(), - }) - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Failed text embedding batch {batch_idx}: {e}") - - tracker.update((batch_idx // text_batch_size) + 1, interval=5) - - return embeddings - - def __generate_sound_event_embeddings(self, trans_file: Path) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - sound_events_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - base_name = self.__remove_all_suffixes(trans_file.stem) - sound_events_file = sound_events_dir / f"{base_name}_sound_events.json" - - if not sound_events_file.exists(): - self.logger.warning(f"Sound events file not found: {sound_events_file}, skipping sound event embeddings generation") - return [] - - try: - with open(sound_events_file, "r", encoding="utf-8") as f: - sound_events_data = json.load(f) - except Exception as e: - 
self.logger.error(f"Failed to load sound events file {sound_events_file}: {e}") - return [] - - segments = sound_events_data.get("segments", []) - if not segments: - return [] - - text_chunks = [] - chunk_metadata = [] - - for i in range(0, len(segments), self.segments_per_embedding): - chunk = segments[i: i + self.segments_per_embedding] - combined_text = " ".join([seg.get("text", "") for seg in chunk]) - - if combined_text.strip(): - sound_types = set() - for seg in chunk: - sound_type = seg.get("sound_type", "sound") - sound_types.add(sound_type) - - start_time = chunk[0].get("start", 0.0) if chunk else 0.0 - end_time = chunk[-1].get("end", 0.0) if chunk else 0.0 - - text_chunks.append(combined_text) - chunk_metadata.append({ - "segment_range": [i, i + len(chunk) - 1], - "text": combined_text, - "sound_types": list(sound_types), - "start_time": start_time, - "end_time": end_time, - }) - - if not text_chunks: - return [] - - embeddings = [] - text_batch_size = settings.embedding.text_batch_size - - with self.progress.track_operation( - f"Sound event embeddings ({len(text_chunks)} chunks)", - (len(text_chunks) + text_batch_size - 1) // text_batch_size, - ) as tracker: - for batch_idx in range(0, len(text_chunks), text_batch_size): - batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] - batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] - - try: - batch_embeddings = self.__encode_text_batch(batch_texts) - for meta, embedding in zip(batch_meta, batch_embeddings): - embeddings.append({ - **meta, - "embedding": embedding.tolist(), - }) - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Failed sound event embedding batch {batch_idx}: {e}") - - tracker.update((batch_idx // text_batch_size) + 1, interval=5) - - return embeddings - - @staticmethod - def __remove_all_suffixes(base_name: str) -> str: - suffixes = (FILE_SUFFIXES["segmented"], FILE_SUFFIXES["sound_events"], FILE_SUFFIXES["clean"], 
FILE_SUFFIXES["clean_alt"]) - while True: - removed = False - for suffix in suffixes: - if base_name.endswith(suffix): - base_name = base_name[:-len(suffix)] - removed = True - break - if not removed: - break - return base_name - - @staticmethod - def __split_into_sentences(text: str) -> List[str]: - normalized_text = re.sub(r'\.{2,}', '.', text) - normalized_text = re.sub(r'!{2,}', '!', normalized_text) - normalized_text = re.sub(r'\?{2,}', '?', normalized_text) - - sentences = re.split(r'([.!?]+(?:\s+|$))', normalized_text) - raw_sentences = [] - for i in range(0, len(sentences) - 1, 2): - sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "") - sentence = sentence.strip() - if sentence: - raw_sentences.append(sentence) - if len(sentences) % 2 == 1 and sentences[-1].strip(): - raw_sentences.append(sentences[-1].strip()) - - result = [] - buffer = "" - min_sentence_length = 30 - - for sentence in raw_sentences: - buffer = (buffer + " " + sentence).strip() if buffer else sentence - - if len(buffer) >= min_sentence_length: - result.append(buffer) - buffer = "" - - if buffer: - if result: - result[-1] = result[-1] + " " + buffer - else: - result.append(buffer) - - return result - - @staticmethod - def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: - cumulative_length = 0 - for idx, seg in enumerate(segments): - seg_text = seg.get("text", "") - seg_length = len(seg_text) + 1 - if cumulative_length <= char_pos < cumulative_length + seg_length: - return idx - cumulative_length += seg_length - return len(segments) - 1 if segments else 0 - - def __encode_text_batch(self, texts: List[str]) -> List[np.ndarray]: - inputs = [{"text": text} for text in texts] - embeddings_tensor = self.model.process(inputs, normalize=True) - embeddings = [emb.cpu().numpy() for emb in embeddings_tensor] - del embeddings_tensor - return embeddings - - def __generate_full_episode_embedding(self, trans_file: Path) -> Optional[Dict[str, 
Any]]: # pylint: disable=too-many-locals,too-many-statements - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_txt_file = clean_dir / f"{base_name}_clean_transcription.txt" - - if not clean_txt_file.exists(): - self.logger.warning(f"Clean transcript file not found: {clean_txt_file}") - return None - - try: # pylint: disable=too-many-try-statements - with open(clean_txt_file, "r", encoding="utf-8") as f: - full_text = f.read().strip() - - if not full_text: - self.logger.warning(f"Empty clean transcript file: {clean_txt_file}") - return None - - console.print(f"[cyan]Generating full episode embedding ({len(full_text)} chars)...[/cyan]") - - max_chars_per_chunk = 6000 - overlap_chars = 4500 - - if len(full_text) > max_chars_per_chunk: - console.print( - f"[yellow]Text too long ({len(full_text)} chars), " - f"using sliding window (chunk={max_chars_per_chunk}, overlap={overlap_chars})...[/yellow]", - ) - - chunks = [] - step_size = max_chars_per_chunk - overlap_chars - - for i in range(0, len(full_text), step_size): - chunk_end = min(i + max_chars_per_chunk, len(full_text)) - chunk = full_text[i:chunk_end] - - if len(chunk.strip()) < 100: - continue - - chunks.append(chunk) - - if chunk_end >= len(full_text): - break - - console.print(f"[cyan]Processing {len(chunks)} overlapping chunks...[/cyan]") - chunk_embeddings = [] - chunk_weights = [] - - for idx, chunk in enumerate(chunks): - inputs = [{"text": chunk}] - embeddings_tensor = self.model.process(inputs, normalize=True) - chunk_embedding = embeddings_tensor[0].cpu().numpy() - chunk_embeddings.append(chunk_embedding) - del embeddings_tensor - - weight = len(chunk) / max_chars_per_chunk - chunk_weights.append(weight) - - if (idx + 1) % 5 == 0 or 
idx == len(chunks) - 1: - console.print(f"[cyan]Processed chunk {idx + 1}/{len(chunks)}[/cyan]") - - chunk_weights_array = np.array(chunk_weights) - chunk_weights_normalized = chunk_weights_array / chunk_weights_array.sum() - - embedding = np.average(chunk_embeddings, axis=0, weights=chunk_weights_normalized) - embedding = embedding / np.linalg.norm(embedding) - - console.print(f"[green]✓ Weighted-averaged {len(chunks)} overlapping chunks[/green]") - else: - inputs = [{"text": full_text}] - embeddings_tensor = self.model.process(inputs, normalize=True) - embedding = embeddings_tensor[0].cpu().numpy() - del embeddings_tensor - - return { - "text": full_text, - "embedding": embedding.tolist(), - "transcript_length": len(full_text), - } - - except Exception as e: - self.logger.error(f"Failed to generate full episode embedding: {e}") - return None - - def __load_frame_metadata(self, episode_info_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]: - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - if season is None or episode is None: - return None - - episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) - if not episode_info_obj: - return None - - frames_episode_dir = self.episode_manager.get_episode_subdir(episode_info_obj, settings.output_subdirs.frames) - metadata_file = frames_episode_dir / f"{self.episode_manager.series_name}_{episode_info_obj.episode_code()}_frame_metadata.json" - - if not metadata_file.exists(): - self.logger.warning(f"Frame metadata not found: {metadata_file}") - return None - - with open(metadata_file, "r", encoding="utf-8") as f: - return json.load(f) - - def __load_image_hashes(self, episode_info_dict: Dict[str, Any]) -> Dict[int, str]: - return load_image_hashes_for_episode(episode_info_dict, self.logger) - - def __generate_video_embeddings(self, episode_info_dict: Dict[str, Any], frame_metadata: Dict[str, Any]) -> List[Dict[str, Any]]: - frame_requests = 
frame_metadata.get("frames", []) - if not frame_requests: - return [] - - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - - episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) - if not episode_info_obj: - return [] - - frames_episode_dir = self.episode_manager.get_episode_subdir(episode_info_obj, settings.output_subdirs.frames) - episode_output_dir = self.episode_manager.get_episode_subdir(episode_info_obj, settings.output_subdirs.embeddings) - checkpoint_file = episode_output_dir / "embeddings_video_checkpoint.json" - - image_hashes = self.__load_image_hashes(episode_info_dict) - embeddings = compute_embeddings_in_batches( - frames_episode_dir, - frame_requests, - self.gpu_processor, - self.batch_size, - image_hashes, - checkpoint_file=checkpoint_file, - checkpoint_interval=20, - prefetch_count=settings.embedding.prefetch_chunks, - ) - self._cleanup_memory() - return embeddings - - def __get_episode_output_dir(self, transcription_file: Path) -> Path: - episode_info_from_file = self.episode_manager.parse_filename(transcription_file) - if episode_info_from_file: - return self.episode_manager.get_episode_subdir(episode_info_from_file, settings.output_subdirs.embeddings) - return self.episode_manager.get_episode_subdir(None, settings.output_subdirs.embeddings) - - def __save_embeddings( - self, - data, - text_embeddings, - video_embeddings, - full_episode_embedding, - sound_event_embeddings, - text_output, - video_output, - full_episode_output, - sound_events_output, - ): - episode_info = data.get(EpisodeMetadataKeys.EPISODE_INFO, {}) - text_output.parent.mkdir(parents=True, exist_ok=True) - - if text_embeddings: - text_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": 
self.model_name, - "model_revision": self.model_revision, - "segments_per_embedding": self.segments_per_embedding, - "use_sentence_based_chunking": True, - "text_sentences_per_chunk": self.text_sentences_per_chunk, - "text_chunk_overlap": self.text_chunk_overlap, - "device": self.device, - }, - statistics={ - "total_embeddings": len(text_embeddings), - "embedding_dimension": len(text_embeddings[0]["embedding"]) if text_embeddings else 0, - }, - results_key="text_embeddings", - results_data=text_embeddings, - ) - atomic_write_json(text_output, text_data, indent=2, ensure_ascii=False) - - if video_embeddings: - video_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "batch_size": self.batch_size, - "device": self.device, - }, - statistics={ - "total_embeddings": len(video_embeddings), - "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, - "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), - }, - results_key="video_embeddings", - results_data=video_embeddings, - ) - atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False) - - if full_episode_embedding: - full_episode_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "device": self.device, - }, - statistics={ - "transcript_length": full_episode_embedding.get("transcript_length", 0), - "embedding_dimension": len(full_episode_embedding["embedding"]) if "embedding" in full_episode_embedding else 0, - }, - 
results_key="full_episode_embedding", - results_data=full_episode_embedding, - ) - atomic_write_json(full_episode_output, full_episode_data, indent=2, ensure_ascii=False) - console.print(f"[green]✓ Saved full episode embedding to: {full_episode_output}[/green]") - - if sound_event_embeddings: - sound_events_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "segments_per_embedding": self.segments_per_embedding, - "use_sentence_based_chunking": True, - "text_sentences_per_chunk": self.text_sentences_per_chunk, - "text_chunk_overlap": self.text_chunk_overlap, - "device": self.device, - }, - statistics={ - "total_embeddings": len(sound_event_embeddings), - "embedding_dimension": len(sound_event_embeddings[0]["embedding"]) if sound_event_embeddings else 0, - }, - results_key="sound_event_embeddings", - results_data=sound_event_embeddings, - ) - atomic_write_json(sound_events_output, sound_events_data, indent=2, ensure_ascii=False) - console.print(f"[green]✓ Saved sound event embeddings to: {sound_events_output}[/green]") - - @staticmethod - def _cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/embeddings/episode_name_embedder.py b/preprocessor/embeddings/episode_name_embedder.py deleted file mode 100644 index 46256818c..000000000 --- a/preprocessor/embeddings/episode_name_embedder.py +++ /dev/null @@ -1,158 +0,0 @@ -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - Optional, -) - -import numpy as np - -from preprocessor.config.config import settings -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.console import console -from 
preprocessor.utils.constants import EmbeddingKeys -from preprocessor.utils.file_utils import atomic_write_json - - -class EpisodeNameEmbedder: - def __init__( - self, - model, - episode_manager: EpisodeManager, - series_name: str, - output_dir: Optional[Path] = None, - logger: Optional[logging.Logger] = None, - ): - self.model = model - self.episode_manager = episode_manager - self.series_name = series_name - self.output_dir = output_dir or settings.embedding.default_output_dir - self.logger = logger or logging.getLogger(__name__) - - def __generate_episode_name_embeddings( - self, - transcription_data: Dict[str, Any], - ) -> Optional[Dict[str, Any]]: - episode_info_dict = transcription_data.get("episode_info", {}) - season = episode_info_dict.get("season") - episode_number = episode_info_dict.get("episode_number") - - if season is None or episode_number is None: - self.logger.warning( - f"Missing season or episode_number in transcription data: episode_info={episode_info_dict}", - ) - return None - - episode_info = self.episode_manager.get_episode_by_season_and_relative( - season, - episode_number, - ) - if not episode_info: - self.logger.warning(f"Cannot find episode info for S{season:02d}E{episode_number:02d}") - return None - - metadata = self.episode_manager.get_metadata(episode_info) - title = metadata.get("title") - - if not title: - self.logger.warning(f"No title found for {episode_info.episode_code()}") - return None - - embedding = self.__generate_title_embedding(title) - if embedding is None: - return None - - episode_id = episode_info.episode_code() - - result = { - EmbeddingKeys.EPISODE_ID: episode_id, - EmbeddingKeys.TITLE: title, - EmbeddingKeys.TITLE_EMBEDDING: embedding.tolist(), - EmbeddingKeys.EPISODE_METADATA: { - "season": season, - "episode_number": episode_number, - "title": title, - "premiere_date": metadata.get("premiere_date"), - "series_name": self.series_name, - "viewership": metadata.get("viewership"), - }, - } - - return result - - def 
__generate_title_embedding(self, title: str) -> Optional[np.ndarray]: - try: - embeddings_tensor = self.model.get_text_embeddings(texts=[title]) - embedding = embeddings_tensor[0].cpu().numpy() - del embeddings_tensor - return embedding - except Exception as e: - self.logger.error(f"Failed to generate embedding for title '{title}': {e}") - return None - - @staticmethod - def __save_episode_name_embedding( - season: int, - episode: int, - embedding_data: Dict[str, Any], - ) -> Path: - from preprocessor.core.episode_manager import EpisodeInfo # pylint: disable=import-outside-toplevel - from preprocessor.core.output_path_builder import OutputPathBuilder # pylint: disable=import-outside-toplevel - - episode_info = EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", - ) - output_file = OutputPathBuilder.build_embedding_path(episode_info, "episode_name_embedding.json") - - atomic_write_json(output_file, embedding_data, indent=2, ensure_ascii=False) - - return output_file - - def generate_and_save_for_transcription( - self, - transcription_data: Dict[str, Any], - ) -> Optional[Path]: - embedding_data = self.__generate_episode_name_embeddings(transcription_data) - if not embedding_data: - return None - - season = embedding_data[EmbeddingKeys.EPISODE_METADATA]["season"] - episode = embedding_data[EmbeddingKeys.EPISODE_METADATA]["episode_number"] - - output_file = self.__save_episode_name_embedding(season, episode, embedding_data) - console.print( - f"[green]Generated episode name embedding for {embedding_data[EmbeddingKeys.EPISODE_ID]}: {embedding_data[EmbeddingKeys.TITLE]}[/green]", - ) - - return output_file - - @staticmethod - def load_episode_name_embedding( - season: int, - episode: int, - output_dir: Optional[Path] = None, - ) -> Optional[Dict[str, Any]]: - from preprocessor.core.episode_manager import EpisodeInfo # pylint: disable=import-outside-toplevel - from preprocessor.core.output_path_builder import OutputPathBuilder # 
pylint: disable=import-outside-toplevel - - if output_dir is None: - output_dir = settings.embedding.default_output_dir - - episode_info = EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", - ) - embedding_file = OutputPathBuilder.build_embedding_path(episode_info, "episode_name_embedding.json") - - if not embedding_file.exists(): - return None - - with open(embedding_file, "r", encoding="utf-8") as f: - return json.load(f) diff --git a/preprocessor/embeddings/gpu_batch_processor.py b/preprocessor/embeddings/gpu_batch_processor.py deleted file mode 100644 index 99d25a881..000000000 --- a/preprocessor/embeddings/gpu_batch_processor.py +++ /dev/null @@ -1,114 +0,0 @@ -import time -from typing import ( - Any, - Dict, - List, -) - -from PIL import Image -import torch - -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class GPUBatchProcessor: - def __init__( - self, - model, - batch_size: int, - logger: ErrorHandlingLogger, - device: str, - progress_sub_batch_size: int = 100, - ): - self.model = model - self.batch_size = batch_size - self.progress_sub_batch_size = progress_sub_batch_size - self.logger = logger - self.device = device - self.max_vram_used = 0.0 - self.vram_samples = [] - - def __log_vram_usage(self) -> None: - if torch.cuda.is_available(): - vram_reserved = torch.cuda.memory_reserved(self.device) / 1024**3 - self.max_vram_used = max(self.max_vram_used, vram_reserved) - self.vram_samples.append(vram_reserved) - - def get_vram_stats(self) -> Dict[str, Any]: - if not self.vram_samples: - return {} - return { - "max_vram_gb": round(self.max_vram_used, 2), - "avg_vram_gb": round(sum(self.vram_samples) / len(self.vram_samples), 2), - "samples": len(self.vram_samples), - } - - def suggest_optimal_batch_size(self, target_vram_gb: float = 21.0) -> int: - if not self.vram_samples: - return self.batch_size - - avg_vram = sum(self.vram_samples) / 
len(self.vram_samples) - if avg_vram <= 0: - return self.batch_size - - vram_ratio = target_vram_gb / avg_vram - suggested = int(self.batch_size * vram_ratio * 0.9) - - suggested = max(50, min(suggested, 1000)) - - return suggested - - def process_images_batch( # pylint: disable=too-many-locals - self, - pil_images: List[Image.Image], - chunk_idx: int, - ) -> List[List[float]]: - results = [] - total_images = len(pil_images) - effective_batch_size = min(self.batch_size, self.progress_sub_batch_size) - batch_start_time = time.time() - - for sub_idx in range(0, total_images, effective_batch_size): - sub_end = min(sub_idx + effective_batch_size, total_images) - batch_pil = pil_images[sub_idx:sub_end] - current_batch_size = len(batch_pil) - - try: # pylint: disable=too-many-try-statements - sub_batch_start = time.time() - - inputs = [{"image": img} for img in batch_pil] - embeddings_tensor = self.model.process(inputs, normalize=True) - self.__log_vram_usage() - batch_np = embeddings_tensor.cpu().numpy() - del embeddings_tensor - results.extend([emb.tolist() for emb in batch_np]) - del batch_np - torch.cuda.empty_cache() - - if total_images > self.progress_sub_batch_size: - elapsed = time.time() - sub_batch_start - rate = current_batch_size / elapsed if elapsed > 0 else 0 - console.print( - f" [dim cyan]→ {sub_idx + 1}-{sub_end}/{total_images} " - f"({sub_end / total_images * 100:.0f}%) - {elapsed:.1f}s ({rate:.3f} img/s)[/dim cyan]", - ) - - elapsed_total = time.time() - batch_start_time - if sub_end < total_images: - remaining_images = total_images - sub_end - eta = remaining_images / (sub_end / elapsed_total) if elapsed_total > 0 else 0 - console.print(f" [dim]Batch ETA: {eta:.0f}s[/dim]") - except RuntimeError as e: - if "out of memory" in str(e).lower(): - torch.cuda.empty_cache() - self.logger.error( - f"OOM in chunk {chunk_idx} with batch_size={current_batch_size}. 
" - f"Try reducing progress_sub_batch_size in config.", - ) - raise e - except Exception as e: - self.logger.error(f"Unexpected error in chunk {chunk_idx} sub-batch {sub_idx}-{sub_end}: {e}") - raise e - - return results diff --git a/preprocessor/embeddings/qwen3_vl_embedding.py b/preprocessor/embeddings/qwen3_vl_embedding.py deleted file mode 100644 index e6edd5309..000000000 --- a/preprocessor/embeddings/qwen3_vl_embedding.py +++ /dev/null @@ -1,112 +0,0 @@ -import logging -from typing import ( - Any, - Dict, - List, - Optional, -) - -from PIL import Image -import torch -import torch.nn.functional as F -from vllm import LLM - -from preprocessor.config.config import settings - -logger = logging.getLogger(__name__) - - -class Qwen3VLEmbedder: - def __init__( - self, - model_name_or_path: str, - max_length: Optional[int] = None, - tensor_parallel_size: Optional[int] = None, - gpu_memory_utilization: Optional[float] = None, - **kwargs, - ): - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is required but not available. 
This pipeline requires GPU.") - - self.max_length = max_length or settings.embedding_model.max_model_len - self.model_name_or_path = model_name_or_path - self.image_placeholder = settings.embedding_model.image_placeholder - - dtype = kwargs.pop("torch_dtype", torch.bfloat16) - dtype_str = "bfloat16" if dtype == torch.bfloat16 else "float16" - - self.model = LLM( - model=model_name_or_path, - runner="pooling", - dtype=dtype_str, - trust_remote_code=True, - max_model_len=self.max_length, - gpu_memory_utilization=gpu_memory_utilization or settings.embedding_model.gpu_memory_utilization, - tensor_parallel_size=tensor_parallel_size or settings.embedding_model.tensor_parallel_size, - enable_chunked_prefill=settings.embedding_model.enable_chunked_prefill, - max_num_batched_tokens=settings.embedding_model.max_num_batched_tokens, - enforce_eager=settings.embedding_model.enforce_eager, - disable_log_stats=True, - ) - - logger.info(f"vLLM Qwen3-VL-Embedding loaded: {model_name_or_path}") - - def process(self, inputs: List[Dict[str, Any]], normalize: bool = True) -> torch.Tensor: - vllm_inputs = [] - - for item in inputs: - text = item.get("text") - image = item.get("image") - video = item.get("video") - - if image: - if isinstance(image, str): - img = Image.open(image).convert("RGB") - elif isinstance(image, Image.Image): - img = image - else: - raise TypeError(f"Unsupported image type: {type(image)}") - - vllm_inputs.append({ - "prompt": self.image_placeholder, - "multi_modal_data": {"image": img}, - }) - elif text: - vllm_inputs.append({ - "prompt": text, - }) - elif video: - if isinstance(video, list): - frames = [] - for frame in video: - if isinstance(frame, str): - frames.append(Image.open(frame).convert("RGB")) - elif isinstance(frame, Image.Image): - frames.append(frame) - else: - raise TypeError(f"Unsupported frame type: {type(frame)}") - - vllm_inputs.append({ - "prompt": self.image_placeholder, - "multi_modal_data": {"image": frames[0] if frames else None}, - }) - 
else: - raise TypeError(f"Unsupported video type: {type(video)}") - else: - vllm_inputs.append({"prompt": "NULL"}) - - outputs = self.model.embed(vllm_inputs) - - embeddings = torch.stack([ - torch.tensor(output.outputs.embedding, dtype=torch.float32) - for output in outputs - ]) - - if normalize: - embeddings = F.normalize(embeddings, p=2, dim=-1) - - return embeddings - - def get_text_embeddings(self, texts: List[str], normalize: bool = True) -> torch.Tensor: - inputs = [{"text": text} for text in texts] - return self.process(inputs, normalize=normalize) diff --git a/preprocessor/embeddings/strategies/__init__.py b/preprocessor/embeddings/strategies/__init__.py deleted file mode 100644 index 7cfb75d6b..000000000 --- a/preprocessor/embeddings/strategies/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from preprocessor.embeddings.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.embeddings.strategies.scene_changes_strategy import SceneChangesStrategy - -__all__ = [ - "BaseKeyframeStrategy", - "SceneChangesStrategy", -] diff --git a/preprocessor/embeddings/strategies/scene_changes_strategy.py b/preprocessor/embeddings/strategies/scene_changes_strategy.py deleted file mode 100644 index 9244ad68e..000000000 --- a/preprocessor/embeddings/strategies/scene_changes_strategy.py +++ /dev/null @@ -1,67 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.core.enums import FrameType -from preprocessor.embeddings.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.utils.console import console - - -class SceneChangesStrategy(BaseKeyframeStrategy): - def __init__(self, frames_per_scene: int): - self.frames_per_scene = frames_per_scene - - def extract_frame_requests( - self, - video_path: Path, - data: Dict[str, Any], - ) -> List[Dict[str, Any]]: - scene_timestamps = data.get("scene_timestamps", {}) - scenes = scene_timestamps.get("scenes", []) - - if not scenes: - console.print("[yellow]No 
scene timestamps found[/yellow]") - return [] - - video_info = scene_timestamps.get("video_info", {}) - fps = video_info.get("fps") - if fps is None: - raise ValueError("FPS not found in scene_timestamps video_info") - frame_requests = [] - - for i, scene in enumerate(scenes): - start_frame = scene.get("start", {}).get("frame", 0) - frame_count = scene.get("frame_count", 1) - - if frame_count <= 1: - frame_requests.append(self.__create_request(start_frame, fps, FrameType.SCENE_SINGLE, i)) - continue - - for frame_idx in range(self.frames_per_scene): - position = frame_idx / (self.frames_per_scene - 1) if self.frames_per_scene > 1 else 0.0 - frame_number = int(start_frame + position * (frame_count - 1)) - - if frame_idx == 0: - frame_type = FrameType.SCENE_START - elif frame_idx == self.frames_per_scene - 1: - frame_type = FrameType.SCENE_END - else: - frame_type = FrameType.scene_mid(frame_idx) - - frame_requests.append(self.__create_request(frame_number, fps, frame_type, i)) - - return frame_requests - - @staticmethod - def __create_request(frame: int, fps: float, type_name: str, scene_num: int = None) -> Dict[str, Any]: - req = { - "frame_number": int(frame), - "timestamp": float(frame / fps), - "type": type_name, - } - if scene_num is not None: - req["scene_number"] = scene_num - return req diff --git a/preprocessor/embeddings/strategies/strategy_factory.py b/preprocessor/embeddings/strategies/strategy_factory.py deleted file mode 100644 index 400e18624..000000000 --- a/preprocessor/embeddings/strategies/strategy_factory.py +++ /dev/null @@ -1,14 +0,0 @@ -from preprocessor.core.enums import KeyframeStrategy -from preprocessor.embeddings.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.embeddings.strategies.scene_changes_strategy import SceneChangesStrategy - - -class KeyframeStrategyFactory: - @staticmethod - def create( - strategy_type: KeyframeStrategy, - frames_per_scene: int = 1, - ) -> BaseKeyframeStrategy: - if strategy_type == 
KeyframeStrategy.SCENE_CHANGES: - return SceneChangesStrategy(frames_per_scene=frames_per_scene) - raise ValueError(f"Unknown keyframe strategy: {strategy_type}") diff --git a/preprocessor/entrypoint.sh b/preprocessor/entrypoint.sh index 15033bb39..71388463b 100755 --- a/preprocessor/entrypoint.sh +++ b/preprocessor/entrypoint.sh @@ -1,10 +1,5 @@ #!/bin/bash set -e -echo "Ensuring global output directories exist..." -mkdir -p /app/output_data/characters -mkdir -p /app/output_data/scraped_pages -mkdir -p /app/output_data/processing_metadata - echo "Starting application..." exec python -m preprocessor.cli "$@" diff --git a/preprocessor/hashing/__init__.py b/preprocessor/hashing/__init__.py deleted file mode 100644 index a9bc298b9..000000000 --- a/preprocessor/hashing/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = ["ImageHashProcessor", "PerceptualHasher"] # pylint: disable=undefined-all-variable diff --git a/preprocessor/hashing/image_hash_processor.py b/preprocessor/hashing/image_hash_processor.py deleted file mode 100644 index 23f2949cd..000000000 --- a/preprocessor/hashing/image_hash_processor.py +++ /dev/null @@ -1,146 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.hashing.image_hasher import PerceptualHasher -from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.metadata_utils import create_processing_metadata - -# pylint: disable=duplicate-code - - - -class ImageHashProcessor(BaseProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - 
args=args, - class_name=self.__class__.__name__, - error_exit_code=11, - loglevel=logging.DEBUG, - ) - - self.frames_dir: Path = Path(self._args.get("frames_dir", settings.frame_export.output_dir)) - self.output_dir: Path = Path(self._args.get("output_dir", settings.image_hash.output_dir)) - self.batch_size: int = self._args.get("batch_size", settings.embedding.batch_size) - self.device: str = "cuda" - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.hasher: Optional[PerceptualHasher] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. This application requires GPU.") - - def cleanup(self) -> None: - console.print("[cyan]Unloading image hasher...[/cyan]") - self.hasher = None - self.__cleanup_memory() - console.print("[green]✓ Hasher unloaded[/green]") - - # pylint: disable=duplicate-code - def _get_processing_items(self) -> List[ProcessingItem]: - return self._get_episode_processing_items_from_metadata( - "**/*_frame_metadata.json", - self.frames_dir, - self.episode_manager, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - hash_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.image_hashes, - hash_filename, - ) - return [OutputSpec(path=hash_output, required=True)] - # pylint: enable=duplicate-code - - def _get_processing_info(self) -> List[str]: - return [ - f"[cyan]Device: {self.device}[/cyan]", - f"[cyan]Batch size: {self.batch_size}[/cyan]", - ] - - def _load_resources(self) -> bool: - self.hasher = PerceptualHasher(device=self.device, hash_size=8) - return True - - def _process_item(self, item: ProcessingItem, 
missing_outputs: List[OutputSpec]) -> None: - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return - - frames_dir = metadata_file.parent - hash_results = compute_hashes_in_batches(frames_dir, frame_requests, self.hasher, self.batch_size) - - episode_dir = self.__get_episode_output_dir(episode_info) - self.__save_hashes(episode_dir, episode_info, hash_results) - self.__cleanup_memory() - - def __get_episode_output_dir(self, episode_info) -> Path: - return self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.image_hashes) - - def __save_hashes(self, episode_dir: Path, episode_info, hash_results: List[Dict[str, Any]]) -> None: - episode_dir.mkdir(parents=True, exist_ok=True) - - hash_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "device": self.device, - "batch_size": self.batch_size, - "hash_size": 8, - }, - statistics={ - "total_hashes": len(hash_results), - "unique_hashes": len(set(h.get("perceptual_hash") for h in hash_results if "perceptual_hash" in h)), - }, - results_key="image_hashes", - results_data=hash_results, - ) - - hash_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - with open(hash_output, "w", encoding="utf-8") as f: - json.dump(hash_data, f, indent=2, ensure_ascii=False) - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/hashing/image_hasher.py b/preprocessor/hashing/image_hasher.py deleted file mode 100644 index 3981c4703..000000000 --- a/preprocessor/hashing/image_hasher.py +++ /dev/null @@ -1,76 
+0,0 @@ -import logging -from typing import List - -from PIL import Image -import numpy as np -import torch -import torch.nn.functional as F - - -class PerceptualHasher: - def __init__(self, device: str = "cuda", hash_size: int = 8): - self.device = device - self.hash_size = hash_size - self.resize_size = hash_size * 4 - self.logger = logging.getLogger(__name__) - - def compute_phash_batch(self, pil_images: List[Image.Image]) -> List[str]: - if not pil_images: - return [] - - try: - images_tensor = self.__pil_to_tensor_batch(pil_images) - hashes = self.__compute_phash_tensor(images_tensor) - return hashes - except Exception as e: - self.logger.error(f"Failed to compute pHash: {e}") - return ["0" * 16] * len(pil_images) - - def __pil_to_tensor_batch(self, pil_images: List[Image.Image]) -> torch.Tensor: - tensors = [] - for img in pil_images: - if img.mode != 'L': - img = img.convert('L') - img_resized = img.resize((self.resize_size, self.resize_size), Image.Resampling.LANCZOS) - img_array = np.array(img_resized, dtype=np.float32) - tensor = torch.from_numpy(img_array) - tensors.append(tensor) - - batch_tensor = torch.stack(tensors).unsqueeze(1).to(self.device) - return batch_tensor - - def __compute_phash_tensor(self, images: torch.Tensor) -> List[str]: - dct_coeffs = self.__batch_dct2d(images) - - top_left = dct_coeffs[:, :, :self.hash_size, :self.hash_size] - - top_left_flat = top_left.reshape(top_left.size(0), -1) - - median_vals = torch.median(top_left_flat, dim=1, keepdim=True)[0] - - hash_bits = (top_left_flat > median_vals).long() - - hashes = [] - for bits in hash_bits: - hash_int = 0 - for i, bit in enumerate(bits): - if bit: - hash_int |= (1 << i) - hash_hex = f"{hash_int:016x}" - hashes.append(hash_hex) - - return hashes - - # noinspection PyPep8Naming - def __batch_dct2d(self, images: torch.Tensor) -> torch.Tensor: - N, C, H, W = images.shape # pylint: disable=unused-variable - - if H != W or H != self.resize_size: - images = F.interpolate(images, 
size=(self.resize_size, self.resize_size), mode='bilinear', align_corners=False) - - freq_h = torch.fft.fft(images, dim=2) - freq_hw = torch.fft.fft(freq_h, dim=3) - - dct_coeffs = freq_hw.real - - return dct_coeffs diff --git a/preprocessor/indexing/archive_generator.py b/preprocessor/indexing/archive_generator.py deleted file mode 100644 index 711c9c670..000000000 --- a/preprocessor/indexing/archive_generator.py +++ /dev/null @@ -1,169 +0,0 @@ -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) -import zipfile - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.console import console - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -class ArchiveGenerator(BaseProcessor): - FOLDER_TO_FILE_SUFFIX = { - ELASTIC_SUBDIRS.text_segments: "text_segments", - ELASTIC_SUBDIRS.text_embeddings: "text_embeddings", - ELASTIC_SUBDIRS.video_frames: "video_frames", - ELASTIC_SUBDIRS.episode_names: "episode_name", - ELASTIC_SUBDIRS.text_statistics: "text_statistics", - ELASTIC_SUBDIRS.full_episode_embeddings: "full_episode_embedding", - ELASTIC_SUBDIRS.sound_events: "sound_events", - ELASTIC_SUBDIRS.sound_event_embeddings: "sound_event_embeddings", - } - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=11, - loglevel=logging.DEBUG, - ) - - self.elastic_documents_dir: Path = self._args["elastic_documents_dir"] - self.output_dir: Path = self._args.get("output_dir", Path("/app/output_data/archives")) - self.allow_partial: bool = self._args.get("allow_partial", False) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = 
EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "elastic_documents_dir" not in args: - raise ValueError("elastic_documents_dir is required") - - def _get_processing_items(self) -> List[ProcessingItem]: - segments_dir = self.elastic_documents_dir / ELASTIC_SUBDIRS.text_segments - if not segments_dir.exists(): - console.print(f"[yellow]Text segments directory not found: {segments_dir}[/yellow]") - return [] - - all_segment_files = list(segments_dir.glob(f"**/*{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}")) - items = [] - - for segment_file in all_segment_files: - episode_info = self.episode_manager.parse_filename(segment_file) - if not episode_info: - self.logger.warning(f"Cannot parse episode info from {segment_file}") - continue - - base_name = segment_file.stem.replace(FILE_SUFFIXES["text_segments"], "") - items.append( - ProcessingItem( - episode_id=episode_info.episode_code(), - input_path=segment_file, - metadata={ - "base_name": base_name, - "episode_info": episode_info, - }, - ), - ) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - base_name = item.metadata["base_name"] - - archive_name = f"{base_name}.zip" - archive_path = ( - self.output_dir - / f"S{episode_info.season:02d}" - / f"E{episode_info.relative_episode:02d}" - / archive_name - ) - - return [OutputSpec(path=archive_path, required=True)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - episode_info = item.metadata["episode_info"] - base_name = item.metadata["base_name"] - - console.print(f"[cyan]Archiving documents for: {item.episode_id}[/cyan]") - - episode_files = self.__collect_episode_files(episode_info, base_name) - - if not episode_files: - self.logger.warning(f"No files found for {item.episode_id}") - return - - expected_count = len(self.FOLDER_TO_FILE_SUFFIX) - 
found_count = len(episode_files) - - if found_count < expected_count and not self.allow_partial: - console.print( - f"[yellow]Skipping {item.episode_id}: incomplete files " - f"({found_count}/{expected_count}). Use --allow-partial to archive anyway.[/yellow]", - ) - return - - for output_spec in missing_outputs: - self.__create_archive(output_spec.path, episode_files) - - console.print(f"[green]Completed archive for: {item.episode_id}[/green]") - - def __collect_episode_files(self, episode_info, base_name: str) -> Dict[str, Path]: - collected_files = {} - - for folder_name, file_suffix in self.FOLDER_TO_FILE_SUFFIX.items(): - file_name = f"{base_name}_{file_suffix}.jsonl" - file_path = ( - self.elastic_documents_dir - / folder_name - / f"S{episode_info.season:02d}" - / f"E{episode_info.relative_episode:02d}" - / file_name - ) - - if file_path.exists(): - collected_files[folder_name] = file_path - else: - self.logger.warning(f"File not found: {file_path}") - - return collected_files - - def __create_archive(self, archive_path: Path, files: Dict[str, Path]) -> None: - archive_path.parent.mkdir(parents=True, exist_ok=True) - - temp_path = archive_path.with_suffix(archive_path.suffix + ".tmp") - - try: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as zipf: - for _, file_path in files.items(): - zipf.write(file_path, arcname=file_path.name) - self.logger.debug(f"Added to archive: {file_path.name}") - - temp_path.replace(archive_path) - - archive_size_mb = archive_path.stat().st_size / (1024 * 1024) - console.print( - f"[green]Created archive: {archive_path.name} " - f"({len(files)} files, {archive_size_mb:.2f} MB)[/green]", - ) - - except Exception as e: - if temp_path.exists(): - temp_path.unlink() - raise RuntimeError(f"Failed to create archive {archive_path}: {e}") from e diff --git a/preprocessor/indexing/elastic_document_generator.py b/preprocessor/indexing/elastic_document_generator.py deleted file mode 100644 index c238cf2e2..000000000 --- 
a/preprocessor/indexing/elastic_document_generator.py +++ /dev/null @@ -1,901 +0,0 @@ -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from bot.types import ( - CharacterDetectionInFrame, - EpisodeMetadata, - ObjectDetectionInFrame, - SceneTimestampsData, -) -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder -from preprocessor.utils.console import console -from preprocessor.utils.constants import ( - CharacterDetectionKeys, - DetectionKeys, - ElasticDocKeys, - EmbeddingKeys, - EmotionKeys, - EpisodeMetadataKeys, - ObjectDetectionKeys, - SceneKeys, - SceneTimeKeys, -) - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -class ElasticDocumentGenerator(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=10, - loglevel=logging.DEBUG, - ) - - self.transcription_jsons: Path = self._args["transcription_jsons"] - self.embeddings_dir: Optional[Path] = self._args.get("embeddings_dir") - self.scene_timestamps_dir: Optional[Path] = self._args.get("scene_timestamps_dir") - self.character_detections_dir: Optional[Path] = self._args.get("character_detections_dir") - self.object_detections_dir: Optional[Path] = self._args.get("object_detections_dir") - self.output_dir: Path = self._args.get("output_dir", Path("/app/output_data/elastic_documents")) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, 
args: Dict[str, Any]) -> None: - if "transcription_jsons" not in args: - raise ValueError("transcription_jsons is required") - - def _get_processing_items(self) -> List[ProcessingItem]: - all_transcription_files = list(self.transcription_jsons.glob("**/raw/*_segmented.json")) - items = [] - - for trans_file in all_transcription_files: - items.append(self._create_transcription_processing_item(trans_file)) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # pylint: disable=too-many-locals,too-many-statements - base_name = item.metadata["base_name"] - episode_info = self.episode_manager.parse_filename(item.input_path) - - outputs = [] - - if episode_info: - segments_filename = f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - segments_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.text_segments, - segments_filename, - ) - outputs.append(OutputSpec(path=segments_file, required=True)) - - trans_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) - sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events - sound_events_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="sound_events", - ) - sound_events_json = sound_events_dir / sound_events_filename - if sound_events_json.exists(): - sound_events_elastic = f"{base_name}_sound_events.jsonl" - sound_events_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.sound_events, - sound_events_elastic, - ) - outputs.append(OutputSpec(path=sound_events_file, required=False)) - else: - season_dir = item.input_path.parent.name - filename = f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - path = self.output_dir / ELASTIC_SUBDIRS.text_segments / season_dir / filename - outputs.append( - OutputSpec( - path=path, - required=True, - ), - ) 
- - if self.embeddings_dir and episode_info: - episode_emb_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.embeddings) - text_emb_files = list(episode_emb_dir.glob("*_embeddings_text.json")) - text_emb_file = text_emb_files[0] if text_emb_files else None - video_emb_files = list(episode_emb_dir.glob("*_embeddings_video.json")) - video_emb_file = video_emb_files[0] if video_emb_files else None - - if text_emb_file and text_emb_file.exists(): - text_embeddings_filename = f"{base_name}_text_embeddings.jsonl" - text_embeddings_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.text_embeddings, - text_embeddings_filename, - ) - outputs.append(OutputSpec(path=text_embeddings_file, required=True)) - - if video_emb_file and video_emb_file.exists(): - video_frames_filename = f"{base_name}_video_frames.jsonl" - video_frames_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.video_frames, - video_frames_filename, - ) - outputs.append(OutputSpec(path=video_frames_file, required=True)) - - episode_name_emb = EpisodeNameEmbedder.load_episode_name_embedding( - episode_info.season, - episode_info.relative_episode, - output_dir=self.embeddings_dir, - ) - if episode_name_emb: - episode_name_filename = f"{base_name}_episode_name.jsonl" - episode_name_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.episode_names, - episode_name_filename, - ) - outputs.append(OutputSpec(path=episode_name_file, required=True)) - - trans_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) - clean_dir = trans_dir / settings.output_subdirs.transcription_subdirs.clean - text_stats_filename = f"{base_name}_text_stats.json" - text_stats_file = clean_dir / text_stats_filename - if text_stats_file.exists(): - text_stats_elastic_filename = f"{base_name}_text_statistics.jsonl" - text_stats_elastic_file = 
OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.text_statistics, - text_stats_elastic_filename, - ) - outputs.append(OutputSpec(path=text_stats_elastic_file, required=True)) - - full_episode_emb_file = episode_emb_dir / f"{base_name}_embeddings_full_episode.json" - if full_episode_emb_file.exists(): - full_episode_elastic_filename = f"{base_name}_full_episode_embedding.jsonl" - full_episode_elastic_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.full_episode_embeddings, - full_episode_elastic_filename, - ) - outputs.append(OutputSpec(path=full_episode_elastic_file, required=True)) - - sound_event_emb_file = episode_emb_dir / f"{base_name}_embeddings_sound_events.json" - if sound_event_emb_file.exists(): - sound_event_elastic_filename = f"{base_name}_sound_event_embeddings.jsonl" - sound_event_elastic_file = OutputPathBuilder.build_elastic_document_path( - episode_info, - ELASTIC_SUBDIRS.sound_event_embeddings, - sound_event_elastic_filename, - ) - outputs.append(OutputSpec(path=sound_event_elastic_file, required=False)) - - return outputs - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals,too-many-statements - trans_file = item.input_path - base_name = item.metadata["base_name"] - season_dir = trans_file.parent.name - - console.print(f"[cyan]Processing: {trans_file.name}[/cyan]") - - episode_dir = trans_file.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name_for_clean = trans_file.stem - suffixes = (FILE_SUFFIXES["segmented"], FILE_SUFFIXES["sound_events"], FILE_SUFFIXES["clean"], FILE_SUFFIXES["clean_alt"]) - while True: - removed = False - for suffix in suffixes: - if base_name_for_clean.endswith(suffix): - base_name_for_clean = base_name_for_clean[:-len(suffix)] - removed = True - break - if not removed: - break - - clean_transcription_file = clean_dir / 
f"{base_name_for_clean}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" - - if not clean_transcription_file.exists(): - self.logger.warning(f"Clean transcription not found: {clean_transcription_file}, skipping") - return - trans_file_for_segments = clean_transcription_file - - with open(trans_file_for_segments, "r", encoding="utf-8") as f: - transcription_data = json.load(f) - - episode_info_dict = transcription_data.get(EpisodeMetadataKeys.EPISODE_INFO, {}) - season = episode_info_dict.get(EpisodeMetadataKeys.SEASON) - episode_number = episode_info_dict.get(EpisodeMetadataKeys.EPISODE_NUMBER) - - if season is None or episode_number is None: - console.print(f"[red]Missing episode info in {trans_file.name}[/red]") - return - - episode_info = self.episode_manager.get_episode_by_season_and_relative(season, episode_number) - if not episode_info: - console.print(f"[red]Cannot find episode info for S{season:02d}E{episode_number:02d}[/red]") - return - - episode_metadata = self.__build_episode_metadata(episode_info) - episode_id = episode_info.episode_code() - video_path = self.episode_manager.build_video_path_for_elastic(episode_info) - - scene_timestamps = self.__load_scene_timestamps(episode_info) - character_detections = self.__load_character_detections(episode_info) - object_detections = self.__load_object_detections(episode_info) - - if any(f"{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" in str(o.path) for o in missing_outputs): - self.__generate_segments( - transcription_data, - episode_id, - episode_metadata, - video_path, - scene_timestamps, - season_dir, - base_name, - ) - - trans_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.transcriptions) - sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events - sound_events_json = sound_events_dir / f"{base_name}_sound_events.json" - if sound_events_json.exists() and any("_sound_events.jsonl" in str(o.path) for o in missing_outputs): - 
with open(sound_events_json, "r", encoding="utf-8") as f: - sound_events_data = json.load(f) - - self.__generate_sound_events( - sound_events_data, - episode_id, - episode_metadata, - video_path, - scene_timestamps, - episode_info, - base_name, - ) - - if self.embeddings_dir: - episode_emb_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) - text_emb_files = list(episode_emb_dir.glob("*_embeddings_text.json")) - text_emb_file = text_emb_files[0] if text_emb_files else None - - if text_emb_file and text_emb_file.exists() and any("_text_embeddings.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_text_embeddings( - text_emb_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - video_emb_files = list(episode_emb_dir.glob("*_embeddings_video.json")) - video_emb_file = video_emb_files[0] if video_emb_files else None - - if video_emb_file and video_emb_file.exists() and any("_video_frames.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_video_frames( - video_emb_file, - episode_id, - episode_metadata, - video_path, - scene_timestamps, - character_detections, - object_detections, - episode_info, - base_name, - ) - - episode_name_emb = EpisodeNameEmbedder.load_episode_name_embedding( - season, - episode_number, - output_dir=self.embeddings_dir, - ) - if episode_name_emb and any("_episode_name.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_episode_name_document( - episode_name_emb, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - trans_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.transcriptions) - clean_dir = trans_dir / settings.output_subdirs.transcription_subdirs.clean - text_stats_file = clean_dir / f"{base_name}_text_stats.json" - if text_stats_file.exists() and any("_text_statistics.jsonl" in str(o.path) for o in missing_outputs): - 
self.__generate_text_statistics_document( - text_stats_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - if self.embeddings_dir: - episode_emb_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) - full_episode_emb_file = episode_emb_dir / f"{base_name}_embeddings_full_episode.json" - - if full_episode_emb_file.exists() and any("_full_episode_embedding.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_full_episode_embedding_document( - full_episode_emb_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - sound_event_emb_file = episode_emb_dir / f"{base_name}_embeddings_sound_events.json" - - if sound_event_emb_file.exists() and any("_sound_event_embeddings.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_sound_event_embeddings_document( - sound_event_emb_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - console.print(f"[green]Completed: {trans_file.name}[/green]") - - def __build_episode_metadata(self, episode_info) -> EpisodeMetadata: - metadata = self.episode_manager.get_metadata(episode_info) - return { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - "title": metadata.get("title"), - "premiere_date": metadata.get("premiere_date"), - "series_name": self.series_name, - "viewership": metadata.get("viewership"), - } - - def __load_scene_timestamps(self, episode_info) -> Optional[SceneTimestampsData]: - return EpisodeManager.load_scene_timestamps(episode_info, self.scene_timestamps_dir, self.logger) - - def __load_character_detections(self, episode_info) -> Dict[int, List[CharacterDetectionInFrame]]: - if not self.character_detections_dir: - return {} - - detection_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - detection_files = 
list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - return {} - - try: - with open(detection_file, "r", encoding="utf-8") as f: - data = json.load(f) - - detections_dict = {} - for detection in data.get(DetectionKeys.DETECTIONS, []): - frame_number = detection.get(DetectionKeys.FRAME_NUMBER) - if frame_number is not None: - detections_dict[frame_number] = detection.get(DetectionKeys.CHARACTERS, []) - elif DetectionKeys.FRAME in detection: - frame_file = detection[DetectionKeys.FRAME] - detections_dict[frame_file] = detection.get(DetectionKeys.CHARACTERS, []) - - return detections_dict - except Exception as e: - self.logger.error(f"Error loading character detections: {e}") - return {} - - def __load_object_detections(self, episode_info) -> Dict[str, List[ObjectDetectionInFrame]]: - if not self.object_detections_dir: - return {} - - detection_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - return {} - - try: - with open(detection_file, "r", encoding="utf-8") as f: - data = json.load(f) - - detections_dict = {} - for frame_data in data.get(DetectionKeys.DETECTIONS, []): - frame_name = frame_data[DetectionKeys.FRAME_NAME] - detections_dict[frame_name] = frame_data.get(DetectionKeys.DETECTIONS, []) - - return detections_dict - except Exception as e: - self.logger.error(f"Error loading object detections: {e}") - return {} - - @staticmethod - def __get_characters_for_frame( - frame_identifier, - character_detections: Dict[int, List[CharacterDetectionInFrame]], - ) -> List[CharacterDetectionInFrame]: - characters = character_detections.get(frame_identifier, []) - - character_list = [] - for 
char in characters: - char_data = { - CharacterDetectionKeys.NAME: char[CharacterDetectionKeys.NAME], - CharacterDetectionKeys.CONFIDENCE: char.get(CharacterDetectionKeys.CONFIDENCE), - } - - if CharacterDetectionKeys.EMOTION in char: - char_data[CharacterDetectionKeys.EMOTION] = { - EmotionKeys.LABEL: char[CharacterDetectionKeys.EMOTION][EmotionKeys.LABEL], - EmotionKeys.CONFIDENCE: char[CharacterDetectionKeys.EMOTION][EmotionKeys.CONFIDENCE], - } - - character_list.append(char_data) - - return character_list - - @staticmethod - def __get_objects_for_frame(frame_name: str, object_detections: Dict[str, List[ObjectDetectionInFrame]]) -> List[Dict[str, Any]]: - detections = object_detections.get(frame_name, []) - objects_summary = {} - for det in detections: - class_name = det[ObjectDetectionKeys.CLASS_NAME] - if class_name in objects_summary: - objects_summary[class_name] += 1 - else: - objects_summary[class_name] = 1 - - return [{"class": cls, "count": cnt} for cls, cnt in objects_summary.items()] - - @staticmethod - def __find_scene_for_timestamp(timestamp: float, scene_timestamps: Optional[SceneTimestampsData]) -> Optional[Dict[str, Any]]: - if not scene_timestamps or SceneKeys.SCENES not in scene_timestamps: - return None - - scenes = scene_timestamps[SceneKeys.SCENES] - for scene in scenes: - start_time = scene[SceneKeys.START][SceneTimeKeys.SECONDS] - end_time = scene[SceneKeys.END][SceneTimeKeys.SECONDS] - - if start_time is None or end_time is None: - continue - - if start_time <= timestamp < end_time: - return { - SceneKeys.SCENE_NUMBER: scene[SceneKeys.SCENE_NUMBER], - SceneKeys.SCENE_START_TIME: start_time, - SceneKeys.SCENE_END_TIME: end_time, - SceneKeys.SCENE_START_FRAME: scene[SceneKeys.START][SceneTimeKeys.FRAME], - SceneKeys.SCENE_END_FRAME: scene[SceneKeys.END][SceneTimeKeys.FRAME], - } - - return None - - def __generate_segments( # pylint: disable=too-many-locals - self, - transcription_data: Dict[str, Any], - episode_id: str, - episode_metadata: 
EpisodeMetadata, - video_path: str, - scene_timestamps: Optional[SceneTimestampsData], - season_dir: str, - base_name: str, - ) -> None: - segments = transcription_data.get("segments", []) - if not segments: - return - - season = episode_metadata.get("season") - episode = episode_metadata.get("episode_number") - episode_info = self.episode_manager.get_episode_by_season_and_relative(season, episode) - - if episode_info: - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_segments}", - f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}", - ) - else: - filename = f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - output_file = self.output_dir / ELASTIC_SUBDIRS.text_segments / season_dir / filename - - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, segment in enumerate(segments): - text = segment.get("text", "").strip() - if not text: - continue - - words = segment.get("words", []) - if words: - start_time = words[0].get("start") or 0.0 - end_time = words[-1].get("end") or 0.0 - speaker = words[0].get("speaker_id", "unknown") - else: - start_time = segment.get("start", 0.0) - end_time = segment.get("end", 0.0) - speaker = segment.get("speaker", "unknown") - - scene_info = self.__find_scene_for_timestamp(start_time, scene_timestamps) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "segment_id": i, - "text": text, - "start_time": start_time, - "end_time": end_time, - "speaker": speaker, - "video_path": video_path, - } - - if scene_info: - doc[ElasticDocKeys.SCENE_INFO] = scene_info - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(segments)} segment documents → {output_file.name}[/green]") - - def __generate_sound_events( - self, - sound_events_data: Dict[str, Any], - episode_id: 
str, - episode_metadata: EpisodeMetadata, - video_path: str, - scene_timestamps: Optional[SceneTimestampsData], - episode_info, - base_name: str, - ) -> None: - segments = sound_events_data.get("segments", []) - if not segments: - return - - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.sound_events}", - f"{base_name}_sound_events.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, segment in enumerate(segments): - if "text" not in segment: - continue - - words = segment.get("words", []) - if not words: - start_time = segment.get("start") or 0.0 - end_time = segment.get("end") or 0.0 - else: - start_time = words[0].get("start") or 0.0 - end_time = words[-1].get("end") or 0.0 - - scene_info = self.__find_scene_for_timestamp(start_time, scene_timestamps) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "segment_id": i, - "text": segment.get("text", ""), - "sound_type": segment.get("sound_type", "sound"), - "start_time": start_time, - "end_time": end_time, - "video_path": video_path, - } - - if scene_info: - doc[ElasticDocKeys.SCENE_INFO] = scene_info - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(segments)} sound event documents → {output_file.name}[/green]") - - def __generate_text_embeddings( - self, - text_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(text_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - text_embeddings = data.get("text_embeddings", []) - if not text_embeddings: - return - - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_embeddings}", - f"{base_name}_text_embeddings.jsonl", 
- ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, emb in enumerate(text_embeddings): - segment_range = emb.get("segment_range", []) - text = emb.get("text", "") - embedding = emb.get("embedding", []) - - if not embedding: - continue - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "embedding_id": i, - "segment_range": segment_range[0] if segment_range else 0, - "text": text, - "text_embedding": embedding, - "video_path": video_path, - } - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(text_embeddings)} text embedding documents → {output_file.name}[/green]") - - def __generate_video_frames( # pylint: disable=too-many-locals - self, - video_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - scene_timestamps: Optional[SceneTimestampsData], - character_detections: Dict[str, List[Dict[str, Any]]], - object_detections: Dict[str, List[Dict[str, Any]]], - episode_info, - base_name: str, - ) -> None: - with open(video_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - video_embeddings = data.get("video_embeddings", []) - if not video_embeddings: - return - - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.video_frames}", - f"{base_name}_video_frames.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for emb in video_embeddings: - frame_number = emb.get(EmbeddingKeys.FRAME_NUMBER) - timestamp = emb.get(EmbeddingKeys.TIMESTAMP) - embedding = emb.get(EmbeddingKeys.EMBEDDING) - - if embedding is None or timestamp is None: - continue - - scene_info = self.__find_scene_for_timestamp(timestamp, scene_timestamps) - - perceptual_hash = emb.get(EmbeddingKeys.PERCEPTUAL_HASH) - frame_path = 
emb.get(EmbeddingKeys.FRAME_PATH, f"frame_{frame_number:06d}.jpg" if frame_number is not None else "") - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "frame_number": frame_number, - "timestamp": timestamp, - "frame_type": emb.get("type", "unknown"), - "video_path": video_path, - "video_embedding": embedding, - } - - if frame_number is not None: - characters = self.__get_characters_for_frame(frame_number, character_detections) - if characters: - doc[ElasticDocKeys.CHARACTER_APPEARANCES] = characters - - if frame_path: - frame_name = Path(frame_path).name if isinstance(frame_path, str) else frame_path - objects = self.__get_objects_for_frame(frame_name, object_detections) - if objects: - doc[ElasticDocKeys.DETECTED_OBJECTS] = objects - - if perceptual_hash: - doc[ElasticDocKeys.PERCEPTUAL_HASH] = perceptual_hash - try: - doc[ElasticDocKeys.PERCEPTUAL_HASH_INT] = int(perceptual_hash, 16) - except (ValueError, TypeError): - pass - - if EmbeddingKeys.SCENE_NUMBER in emb: - doc[EmbeddingKeys.SCENE_NUMBER] = emb[EmbeddingKeys.SCENE_NUMBER] - - if scene_info: - doc[ElasticDocKeys.SCENE_INFO] = scene_info - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(video_embeddings)} video frame documents → {output_file.name}[/green]") - - def __generate_episode_name_document( - self, - episode_name_emb: Dict[str, Any], - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.episode_names}", - f"{base_name}_episode_name.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - title_embedding = episode_name_emb.get(EmbeddingKeys.TITLE_EMBEDDING, []) - if not title_embedding: - return - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - 
EmbeddingKeys.TITLE: episode_name_emb.get(EmbeddingKeys.TITLE, ""), - EmbeddingKeys.TITLE_EMBEDDING: title_embedding, - "video_path": video_path, - } - - with open(output_file, "w", encoding="utf-8") as f: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated episode name document → {output_file.name}[/green]") - - def __generate_text_statistics_document( - self, - text_stats_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(text_stats_file, "r", encoding="utf-8") as f: - stats_data = json.load(f) - - basic_stats = stats_data.get("basic_statistics", {}) - advanced_stats = stats_data.get("advanced_statistics", {}) - - if not basic_stats: - return - - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_statistics}", - f"{base_name}_text_statistics.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "video_path": video_path, - "language": stats_data.get("metadata", {}).get("language", "pl"), - "analyzed_at": stats_data.get("metadata", {}).get("analyzed_at"), - "basic_statistics": basic_stats, - "advanced_statistics": advanced_stats, - "word_frequency": stats_data.get("word_frequency", [])[:20], - "bigrams": stats_data.get("bigrams", [])[:10], - "trigrams": stats_data.get("trigrams", [])[:10], - } - - with open(output_file, "w", encoding="utf-8") as f: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated text statistics document → {output_file.name}[/green]") - - def __generate_full_episode_embedding_document( - self, - full_episode_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(full_episode_emb_file, "r", 
encoding="utf-8") as f: - data = json.load(f) - - full_episode_embedding_data = data.get("full_episode_embedding", {}) - if not full_episode_embedding_data or "embedding" not in full_episode_embedding_data: - return - - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.full_episode_embeddings}", - f"{base_name}_full_episode_embedding.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "full_transcript": full_episode_embedding_data.get("text", ""), - "transcript_length": full_episode_embedding_data.get("transcript_length", 0), - "full_episode_embedding": full_episode_embedding_data.get("embedding", []), - "video_path": video_path, - } - - with open(output_file, "w", encoding="utf-8") as f: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated full episode embedding document → {output_file.name}[/green]") - - def __generate_sound_event_embeddings_document( # pylint: disable=too-many-locals - self, - sound_event_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(sound_event_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - sound_event_embeddings = data.get("sound_event_embeddings", []) - if not sound_event_embeddings: - return - - output_file = self.episode_manager.build_episode_output_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.sound_event_embeddings}", - f"{base_name}_sound_event_embeddings.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, emb in enumerate(sound_event_embeddings): - segment_range = emb.get("segment_range", []) - text = emb.get("text", "") - embedding = emb.get("embedding", []) - sound_types 
= emb.get("sound_types", []) - start_time = emb.get("start_time", 0.0) - end_time = emb.get("end_time", 0.0) - - if not embedding: - continue - - if isinstance(segment_range, list) and len(segment_range) == 2: - segment_range = {"gte": segment_range[0], "lte": segment_range[1]} - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "embedding_id": i, - "segment_range": segment_range, - "text": text, - "sound_types": sound_types, - "start_time": start_time, - "end_time": end_time, - "sound_event_embedding": embedding, - "video_path": video_path, - } - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(sound_event_embeddings)} sound event embedding documents → {output_file.name}[/green]") diff --git a/preprocessor/indexing/elasticsearch.py b/preprocessor/indexing/elasticsearch.py deleted file mode 100644 index a3e8bbb6d..000000000 --- a/preprocessor/indexing/elasticsearch.py +++ /dev/null @@ -1,275 +0,0 @@ -import asyncio -import json -import logging -from pathlib import Path -from typing import ( - Any, - Awaitable, - Callable, - Dict, - List, -) - -from elasticsearch import exceptions as es_exceptions -from elasticsearch.helpers import ( - BulkIndexError, - async_bulk, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.search.elastic_manager import ElasticSearchManager -from preprocessor.utils.console import console - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -class ElasticSearchIndexer(BaseProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=1, - loglevel=logging.DEBUG, - ) - - self.dry_run = self._args.get("dry_run", False) - self.name = self._args["name"] - self.elastic_documents_dir = 
self._args.get("elastic_documents_dir", Path("/app/output_data/elastic_documents")) - self.transcription_jsons = self._args.get("transcription_jsons") - self.append = self._args.get("append", False) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - self.client = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "name" not in args: - raise ValueError("index name is required") - - @staticmethod - def __sanitize_error_for_logging(error: Dict[str, Any]) -> Dict[str, Any]: - vector_keys = {"text_embedding", "video_embedding", "title_embedding", "embedding"} - - def _truncate_vectors(obj): - if isinstance(obj, dict): - return { - k: f"[vector dim={len(v)}]" if k in vector_keys and isinstance(v, list) else _truncate_vectors(v) - for k, v in obj.items() - } - if isinstance(obj, list) and len(obj) > 10: - return obj[:3] + ["..."] - return obj - - return _truncate_vectors(error) - - def __call__(self) -> None: - asyncio.run(self.__exec_async()) - - def _execute(self) -> None: - asyncio.run(self.__exec_async()) - - def __check_files_exist(self) -> bool: - if not self.elastic_documents_dir.exists(): - return False - - return any([ - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.text_segments}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.text_embeddings}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.video_frames}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.episode_names}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.full_episode_embeddings}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.sound_events}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.sound_event_embeddings}/**/*.jsonl")), - ]) - - async def __exec_async(self) -> None: - if not self.__check_files_exist(): - self.logger.info("No elastic 
documents found to index.") - return - - try: - self.client = await ElasticSearchManager.connect_to_elasticsearch( - settings.elasticsearch.host, - settings.elasticsearch.user, - settings.elasticsearch.password, - self.logger, - ) - except es_exceptions.ConnectionError: - console.print("[red]✗ Failed to connect to Elasticsearch[/red]") - console.print(f"[yellow]Make sure Elasticsearch is running at: {settings.elasticsearch.host}[/yellow]") - console.print("[yellow]Run: docker-compose -f docker-compose.test.yml up -d[/yellow]") - return - - try: - indices = { - ELASTIC_SUBDIRS.text_segments: f"{self.name}_segments", - ELASTIC_SUBDIRS.text_embeddings: f"{self.name}_text_embeddings", - ELASTIC_SUBDIRS.video_frames: f"{self.name}_video_frames", - ELASTIC_SUBDIRS.episode_names: f"{self.name}_episode_names", - ELASTIC_SUBDIRS.full_episode_embeddings: f"{self.name}_full_episode_embeddings", - ELASTIC_SUBDIRS.sound_events: f"{self.name}_sound_events", - ELASTIC_SUBDIRS.sound_event_embeddings: f"{self.name}_sound_event_embeddings", - } - - for doc_type, index_name in indices.items(): - console.print(f"[cyan]Processing {doc_type} → {index_name}[/cyan]") - - if not self.append: - await self.__delete_index(index_name) - await self.__create_index(index_name, doc_type) - elif not await self.client.indices.exists(index=index_name): - self.logger.info(f"Index '{index_name}' does not exist. 
Creating it.") - await self.__create_index(index_name, doc_type) - else: - self.logger.info(f"Append mode: not deleting nor recreating index '{index_name}'.") - - await self.__index_documents(doc_type, index_name) - - if not self.dry_run: - for doc_type, index_name in indices.items(): - if await self.client.indices.exists(index=index_name): - await self.__print_sample_document(index_name) - finally: - await self.client.close() - - async def __create_index(self, index_name: str, doc_type: str) -> None: - mappings = { - ELASTIC_SUBDIRS.text_segments: ElasticSearchManager.SEGMENTS_INDEX_MAPPING, - ELASTIC_SUBDIRS.text_embeddings: ElasticSearchManager.TEXT_EMBEDDINGS_INDEX_MAPPING, - ELASTIC_SUBDIRS.video_frames: ElasticSearchManager.VIDEO_EMBEDDINGS_INDEX_MAPPING, - ELASTIC_SUBDIRS.episode_names: ElasticSearchManager.EPISODE_NAMES_INDEX_MAPPING, - ELASTIC_SUBDIRS.full_episode_embeddings: ElasticSearchManager.FULL_EPISODE_EMBEDDINGS_INDEX_MAPPING, - ELASTIC_SUBDIRS.sound_events: ElasticSearchManager.SOUND_EVENTS_INDEX_MAPPING, - ELASTIC_SUBDIRS.sound_event_embeddings: ElasticSearchManager.SOUND_EVENT_EMBEDDINGS_INDEX_MAPPING, - } - - async def operation(): - if await self.client.indices.exists(index=index_name): - self.logger.info(f"Index '{index_name}' already exists.") - else: - await self.client.indices.create( - index=index_name, - body=mappings[doc_type], - ) - self.logger.info(f"Index '{index_name}' created.") - - await self.__do_crud(operation, index_name) - - async def __delete_index(self, index_name: str) -> None: - async def operation(): - if await self.client.indices.exists(index=index_name): - await self.client.indices.delete(index=index_name) - self.logger.info(f"Deleted index: {index_name}") - else: - self.logger.info(f"Index '{index_name}' does not exist. 
No action taken.") - - await self.__do_crud(operation, index_name) - - async def __do_crud(self, operation: Callable[[], Awaitable[None]], index_name: str) -> None: - try: - await operation() - except es_exceptions.RequestError as e: - self.logger.error(f"Failed operation on index '{index_name}': {e}") - raise - except es_exceptions.ConnectionError as e: - self.logger.error(f"Connection error: {e}") - raise - - async def __index_documents(self, doc_type: str, index_name: str) -> None: - jsonl_files = list(self.elastic_documents_dir.glob(f"{doc_type}/**/*.jsonl")) - - if not jsonl_files: - self.logger.info(f"No {doc_type} documents found. Skipping.") - return - - actions = self.__load_jsonl_files(jsonl_files, index_name) - - if not actions: - self.logger.info(f"No {doc_type} documents to index.") - return - - console.print(f"[cyan]Prepared {len(actions)} {doc_type} documents for indexing[/cyan]") - - if self.dry_run: - self.logger.info(f"Dry-run: would index {len(actions)} documents to '{index_name}'") - if actions: - sample = json.dumps(actions[0], indent=2, ensure_ascii=False)[:500] - self.logger.info(f"Sample document:\n{sample}...") - else: - try: - await async_bulk( - self.client, - actions, - chunk_size=50, - max_chunk_bytes=5 * 1024 * 1024, - ) - console.print(f"[green]✓ Indexed {len(actions)} {doc_type} documents → {index_name}[/green]") - except BulkIndexError as e: - self.logger.error(f"Bulk indexing failed: {len(e.errors)} errors.") - for error in e.errors[:3]: - sanitized = self.__sanitize_error_for_logging(error) - self.logger.error(f"Failed document: {json.dumps(sanitized, indent=2)}") - if len(e.errors) > 10: - self.logger.error(f"... 
and {len(e.errors) - 10} more errors") - - def __load_jsonl_files(self, jsonl_files: List[Path], index_name: str) -> List[Dict[str, Any]]: - actions = [] - - for jsonl_file in jsonl_files: - self.logger.info(f"Loading {jsonl_file.name}") - with open(jsonl_file, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - doc = json.loads(line) - actions.append({ - "_index": index_name, - "_source": doc, - }) - - return actions - - def _load_jsonl_documents(self, doc_dir: Path, index_name: str) -> List[Dict[str, Any]]: - actions = [] - - for jsonl_file in doc_dir.rglob("*.jsonl"): - self.logger.info(f"Loading {jsonl_file.name}") - with open(jsonl_file, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - doc = json.loads(line) - actions.append({ - "_index": index_name, - "_source": doc, - }) - - return actions - - async def __print_sample_document(self, index_name: str) -> None: - try: # pylint: disable=too-many-try-statements - response = await self.client.search(index=index_name, size=1) - if not response["hits"]["hits"]: - self.logger.info(f"No documents found in {index_name}.") - return - - document = response["hits"]["hits"][0]["_source"] - doc_id = response["hits"]["hits"][0]["_id"] - - console.print(f"\n[cyan]Sample document from {index_name}:[/cyan]") - console.print(f" Document ID: {doc_id}") - - if "episode_id" in document: - console.print(f" Episode: {document['episode_id']}") - if "video_path" in document: - console.print(f" Video: {document['video_path']}") - if "text" in document: - text_preview = document['text'][:100] - console.print(f" Text: {text_preview}...") - if "perceptual_hash" in document: - console.print(f" Hash: {document['perceptual_hash']}") - if "timestamp" in document: - console.print(f" Timestamp: {document['timestamp']}") - - except Exception as e: - self.logger.error(f"Failed to retrieve sample document: {e}") diff --git a/preprocessor/prompts/extract_all_seasons_system.py 
b/preprocessor/prompts/extract_all_seasons_system.py deleted file mode 100644 index 7a76f3274..000000000 --- a/preprocessor/prompts/extract_all_seasons_system.py +++ /dev/null @@ -1,54 +0,0 @@ -def get() -> str: - return """You are extracting episode data from TV series wiki pages. -Your task is to find tables or lists containing episode information and extract the EXACT data. - -Look for patterns like: -Nr | Tytuł | Premiera | Oglądalność -1 | _[Episode Title]_ | 05.03.2006 | 4 396 564 - -CRITICAL RULES: -1. Extract EXACT titles from the table - do NOT make up generic titles like "Odcinek 1" -2. Extract EXACT premiere dates as shown - do NOT invent dates -3. If premiere date contains multiple dates separated by "/" (e.g., "31.12.2008"), extract ONLY the FIRST date: "31.12.2008" -4. Extract EXACT viewership numbers - remove spaces: "4 396 564" -> 4396564 -5. If episode number is in format like "E12" or "S01E12", extract just the number: 12 -6. Do NOT hallucinate or make up any data - only extract what you see - -IMPORTANT: Each episode must have TWO numbers: -- episode_in_season: The episode number within its season (resets to 1 for each season) -- overall_episode_number: The absolute episode number across all seasons (continues counting) - -Example extraction from this markdown: -``` -Sezon 1: -Nr | Tytuł | Premiera | Oglądalność -1 | _[Spadek]_ | 05.03.2006 | 4 396 564 -2 | _[Goście z zaświatów]_ | 12.03.2006 | 4 308 423 - -Sezon 2: -Nr | Tytuł | Premiera | Oglądalność -14 | _[Sztuka i władza]_ | 18.03.2007 | 6 993 951 -15 | _[Gmina to ja]_ | 25.03.2007 | 6 754 211 -``` - -Should produce: -{ - "seasons": [ - { - "season_number": 1, - "episodes": [ - {"episode_in_season": 1, "overall_episode_number": 1, "title": "Spadek", "premiere_date": "05.03.2006", "viewership": "4396564"}, - {"episode_in_season": 2, "overall_episode_number": 2, "title": "Goście z zaświatów", "premiere_date": "12.03.2006", "viewership": "4308423"} - ] - }, - { - "season_number": 2, - 
"episodes": [ - {"episode_in_season": 1, "overall_episode_number": 14, "title": "Sztuka i władza", "premiere_date": "18.03.2007", "viewership": "6993951"}, - {"episode_in_season": 2, "overall_episode_number": 15, "title": "Gmina to ja", "premiere_date": "25.03.2007", "viewership": "6754211"} - ] - } - ] -} - -Return ONLY valid JSON. Extract ONLY what you see, do NOT invent data.""" diff --git a/preprocessor/prompts/extract_all_seasons_user.py b/preprocessor/prompts/extract_all_seasons_user.py deleted file mode 100644 index 2c07ac986..000000000 --- a/preprocessor/prompts/extract_all_seasons_user.py +++ /dev/null @@ -1,7 +0,0 @@ -def get() -> str: - return """Extract ALL episodes from ALL {num_sources} sources below. -Return a complete list of ALL seasons found. - -{combined_content} - -Extract ALL seasons and episodes from above sources.""" diff --git a/preprocessor/prompts/extract_characters_system.py b/preprocessor/prompts/extract_characters_system.py deleted file mode 100644 index 5662557f1..000000000 --- a/preprocessor/prompts/extract_characters_system.py +++ /dev/null @@ -1,120 +0,0 @@ -def get() -> str: - return """You are an expert at extracting character information from TV series documentation and wikis. - -Your task is to analyze scraped web pages and extract a COMPLETE list of ALL characters from a TV series. - -For each character, extract ONLY the name (full name if available, otherwise commonly used name). - -### RULES FOR EXTRACTION: - -1. **Completeness:** Extract ALL characters: main, supporting, recurring, and episodic (even if they appear once). -2. **Source:** Extract ONLY what you see in the content. Do NOT invent characters. -3. **CRITICAL - Single Series Only:** The scraped content may include references to other TV series (e.g., in footers, sidebars, "See also" sections, or related links). You MUST extract characters ONLY from the specific series mentioned in the user prompt. IGNORE all characters from any other series. -4. 
**Multi-Source Deduplication:** When processing multiple sources: - - Merge character lists from all sources - - Remove duplicates (same character mentioned in multiple sources) - - If a character has different name variants across sources, use the most complete/formal version - - Combine information to get the most accurate character list -5. **Naming:** Use the Polish name if the series is Polish. If a character has multiple aliases, use the most formal/common one. - -6. **Text Cleaning (CRITICAL):** - - Remove ALL special characters that are not letters (e.g., quotes `"`, brackets `()`, hyphens `-` inside titles, etc.). - - Remove actor names typically found in brackets. - - The final output string must contain **ONLY letters (including Polish diacritics: ą, ć, ę, ł, ń, ó, ś, ź, ż) and spaces**. - - Do not leave trailing periods after expanding titles. - -7. **ABBREVIATION EXPANSION (Mandatory):** - You MUST expand ALL abbreviations to their full Polish forms. - **IMPORTANT:** Process compound abbreviations (2+ words) BEFORE single word abbreviations. - - **Ecclesiastical (Religious):** - - ks. prob. / ks.prob. -> Ksiądz Proboszcz - - ks. wik. / ks.wik. -> Ksiądz Wikariusz - - ks. kan. -> Ksiądz Kanonik - - ks. bp -> Ksiądz Biskup - - ks. kard. -> Ksiądz Kardynał - - ks. -> Ksiądz - - o. -> Ojciec (e.g., Ojciec Mateusz) - - s. -> Siostra - - br. -> Brat - - bp -> Biskup - - abp -> Arcybiskup - - kard. -> Kardynał - - pap. -> Papież - - wik. -> Wikariusz - - prob. -> Proboszcz - - **Academic & Medical:** - - dr hab. -> Doktor habilitowany - - prof. nadzw. -> Profesor nadzwyczajny - - prof. zw. -> Profesor zwyczajny - - prof. -> Profesor - - dr -> Doktor - - mgr -> Magister - - inż. -> Inżynier - - lek. med. / lek. -> Lekarz - - doc. -> Docent - - piel. -> Pielęgniarka / Pielęgniarz - - **Military, Police & Services:** - - nadkom. -> Nadkomisarz - - podkom. -> Podkomisarz - - kom. -> Komisarz - - asp. sztab. -> Aspirant sztabowy - - asp. -> Aspirant - - st. post. 
-> Starszy posterunkowy - - post. -> Posterunkowy - - sierż. -> Sierżant - - gen. -> Generał - - płk -> Pułkownik - - ppłk -> Podpułkownik - - mjr -> Major - - kpt. -> Kapitan - - por. -> Porucznik - - ppor. -> Podporucznik - - **Legal, Political & Administrative:** - - mec. -> Mecenas - - prok. -> Prokurator - - sędz. -> Sędzia - - dyr. -> Dyrektor - - prez. -> Prezydent - - min. -> Minister - - sen. -> Senator - - pos. -> Poseł - - przew. -> Przewodniczący - - z-ca -> Zastępca - - **Other:** - - red. -> Redaktor - - *If you encounter an abbreviation not listed here, expand it to its correct full Polish form based on context.* - -### EXAMPLE EXTRACTION: - -Source 1: -``` -Główni bohaterowie: -- ks. prob. Krzysztof Robert (Artur Żmijewski) -- Lucy Wilska (Ilona Ostrowska) -``` - -Source 2: -``` -Postacie: -- Ksiądz Proboszcz Krzysztof Robert -- dr Cezary Pazura -- kom. Paweł Kozioł -``` - -Should produce (deduplicated and cleaned): -{ - "characters": [ - {"name": "Ksiądz Proboszcz Krzysztof Robert"}, - {"name": "Lucy Wilska"}, - {"name": "Doktor Cezary Pazura"}, - {"name": "Komisarz Paweł Kozioł"} - ] -} - -Return ONLY valid JSON.""" diff --git a/preprocessor/prompts/extract_characters_user.py b/preprocessor/prompts/extract_characters_user.py deleted file mode 100644 index 3b8e738e0..000000000 --- a/preprocessor/prompts/extract_characters_user.py +++ /dev/null @@ -1,14 +0,0 @@ -def get() -> str: - return """Extract ALL characters from the TV series "{series_name}" from ALL {num_sources} source(s) below. - -**CRITICAL:** Multiple sources may have overlapping or complementary character lists. 
-- Merge and deduplicate characters across all sources -- Extract ONLY characters from "{series_name}" (ignore other series mentioned in footers/sidebars) -- Return a single unified list - -Here is the content from all sources combined: - -{combined_content} - ---- -Extract ALL characters from "{series_name}" found in the content above.""" diff --git a/preprocessor/prompts/extract_episode_metadata_system.py b/preprocessor/prompts/extract_episode_metadata_system.py deleted file mode 100644 index 23863e172..000000000 --- a/preprocessor/prompts/extract_episode_metadata_system.py +++ /dev/null @@ -1,21 +0,0 @@ -# pylint: disable=duplicate-code -def get() -> str: - return """Extract episode information from the provided web page content. -Focus on finding: -- Episode title (exact title, not description) -- Episode description (1-2 sentences summarizing the plot) -- Episode summary (detailed summary, 3-5 sentences) -- Season number (if mentioned) -- Episode number (if mentioned) - -If information is missing, use empty string for text fields and null for numbers. -Be precise and extract only factual information from the text. 
- -Return ONLY valid JSON matching this schema: -{ - "title": str, - "description": str, - "summary": str, - "season": int or null, - "episode_number": int or null -}""" diff --git a/preprocessor/prompts/extract_episode_metadata_user.py b/preprocessor/prompts/extract_episode_metadata_user.py deleted file mode 100644 index a12f42fc4..000000000 --- a/preprocessor/prompts/extract_episode_metadata_user.py +++ /dev/null @@ -1,7 +0,0 @@ -def get() -> str: - return """URL: {url} - -Page content: -{page_text} - -Extract the episode metadata from above.""" diff --git a/preprocessor/prompts/extract_season_system.py b/preprocessor/prompts/extract_season_system.py deleted file mode 100644 index 477fe35ba..000000000 --- a/preprocessor/prompts/extract_season_system.py +++ /dev/null @@ -1,26 +0,0 @@ -def get() -> str: - return """You are extracting episode data from a TV series page. -Extract ALL episodes you can find on the page. Look for tables, lists, or any structured data. - -For each episode extract: -- episode_in_season: The episode number within its season (1, 2, 3... resets each season) -- overall_episode_number: The absolute episode number across all seasons (continues counting) -- title: string (clean title without markdown formatting) -- premiere_date: string (date format as found on page; if multiple dates separated by "/" like "31.12.2008", extract ONLY the FIRST date: "31.12.2008") -- viewership: string (remove spaces from numbers like "4 396 564" -> "4396564", use null if not available) - -The season number should be determined from the page content or URL. 
- -Return ONLY valid JSON matching this schema: -{ - "season_number": int, - "episodes": [ - { - "episode_in_season": int, - "overall_episode_number": int, - "title": str, - "premiere_date": str, - "viewership": str - } - ] -}""" diff --git a/preprocessor/prompts/extract_season_user.py b/preprocessor/prompts/extract_season_user.py deleted file mode 100644 index 7f41256f6..000000000 --- a/preprocessor/prompts/extract_season_user.py +++ /dev/null @@ -1,7 +0,0 @@ -def get() -> str: - return """URL: {url} - -Page content (markdown): -{page_text} - -Extract ALL episodes from this page and return as JSON.""" diff --git a/preprocessor/prompts/merge_episode_data_system.py b/preprocessor/prompts/merge_episode_data_system.py deleted file mode 100644 index 12048c1ca..000000000 --- a/preprocessor/prompts/merge_episode_data_system.py +++ /dev/null @@ -1,19 +0,0 @@ -# pylint: disable=duplicate-code -def get() -> str: - return """You are merging episode information from multiple sources. -Create a single, accurate metadata entry by: -- Choosing the most complete and accurate title -- Combining descriptions into a coherent 1-2 sentence description -- Merging summaries into a comprehensive 3-5 sentence summary -- Using the most reliable season/episode numbers - -Prefer longer, more detailed information when merging. 
- -Return ONLY valid JSON matching this schema: -{ - "title": str, - "description": str, - "summary": str, - "season": int or null, - "episode_number": int or null -}""" diff --git a/preprocessor/prompts/merge_episode_data_user.py b/preprocessor/prompts/merge_episode_data_user.py deleted file mode 100644 index 39e5c32af..000000000 --- a/preprocessor/prompts/merge_episode_data_user.py +++ /dev/null @@ -1,6 +0,0 @@ -def get() -> str: - return """Merge the following episode metadata from {num_sources} sources: - -{combined_text} - -Create a single, unified metadata entry.""" diff --git a/preprocessor/providers/llm.py b/preprocessor/providers/llm.py deleted file mode 100644 index 7e29dfd0a..000000000 --- a/preprocessor/providers/llm.py +++ /dev/null @@ -1,295 +0,0 @@ -import json -from typing import ( - Any, - Dict, - List, - Optional, - Type, -) - -from openai import OpenAI -from pydantic import ( - BaseModel, - field_validator, - model_validator, -) -from vllm import ( - LLM, - SamplingParams, -) - -from preprocessor.config.config import settings -from preprocessor.core.enums import ParserMode -from preprocessor.prompts import ( - extract_all_seasons_system, - extract_all_seasons_user, - extract_characters_system, - extract_characters_user, - extract_episode_metadata_system, - extract_episode_metadata_user, - extract_season_system, - extract_season_user, - merge_episode_data_system, - merge_episode_data_user, -) -from preprocessor.utils.console import console - - -class EpisodeInfo(BaseModel): - episode_in_season: int - overall_episode_number: int - title: str - premiere_date: Optional[str] = None - viewership: Optional[str] = None - - @field_validator('viewership', mode='before') - @classmethod - def convert_viewership_to_str(cls, v): - if v is None: - return None - if isinstance(v, int): - return str(v) - return v - - -class SeasonMetadata(BaseModel): - season_number: int - episodes: List[EpisodeInfo] - - @model_validator(mode='before') - @classmethod - def 
convert_old_format(cls, data): - if isinstance(data, dict) and 'episodes' in data: - for idx, episode in enumerate(data['episodes'], start=1): - if isinstance(episode, dict) and 'episode_number' in episode and 'episode_in_season' not in episode: - episode['episode_in_season'] = idx - episode['overall_episode_number'] = episode['episode_number'] - del episode['episode_number'] - return data - - -class AllSeasonsMetadata(BaseModel): - seasons: List[SeasonMetadata] - - -class EpisodeMetadata(BaseModel): - title: str - description: str - summary: str - season: Optional[int] = None - episode_number: Optional[int] = None - - -class CharacterInfo(BaseModel): - name: str - - -class CharactersList(BaseModel): - characters: List[CharacterInfo] - - -class LLMProvider: - __DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct" - __GEMINI_MODEL_NAME = "gemini-2.5-flash" - - __instance = None - __model = None - __openai_client = None - - def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None): - if cls.__instance is None: - cls.__instance = super().__new__(cls) - return cls.__instance - - def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None): - self.parser_mode = parser_mode or ParserMode.NORMAL - - if self.parser_mode == ParserMode.PREMIUM: - if self.__openai_client is None: - self.__init_gemini_client() - elif self.__model is None: - self.model_name = model_name or self.__DEFAULT_MODEL_NAME - self.__load_model() - - def extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: - return self.__process_llm_request( - system_prompt=extract_season_system.get(), - user_prompt=extract_season_user.get().format(url=url, page_text=page_text), - response_model=SeasonMetadata, - error_context=f"extraction failed for {url}", - ) - - def extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: - return self.__process_llm_request( - 
system_prompt=extract_episode_metadata_system.get(), - user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), - response_model=EpisodeMetadata, - error_context=f"extraction failed for {url}", - ) - - def merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: - if not metadata_list: - raise ValueError("No metadata to merge") - - if len(metadata_list) == 1: - return metadata_list[0] - - combined_text = "\n\n---\n\n".join([ - f"Source {i + 1}:\nTitle: {m.title}\nDescription: {m.description}\nSummary: {m.summary}\nSeason: {m.season}\nEpisode: {m.episode_number}" - for i, m in enumerate(metadata_list) - ]) - - result = self.__process_llm_request( - system_prompt=merge_episode_data_system.get(), - user_prompt=merge_episode_data_user.get().format( - num_sources=len(metadata_list), - combined_text=combined_text, - ), - response_model=EpisodeMetadata, - error_context="merge failed", - ) - - return result if result else metadata_list[0] - - def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[List[SeasonMetadata]]: - combined_content = "" - for i, page in enumerate(scraped_pages, 1): - url = page["url"] - markdown = page["markdown"] - combined_content += f"\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n" - - result = self.__process_llm_request( - system_prompt=extract_all_seasons_system.get(), - user_prompt=extract_all_seasons_user.get().format( - num_sources=len(scraped_pages), - combined_content=combined_content, - ), - response_model=AllSeasonsMetadata, - error_context="extraction failed", - ) - - return result.seasons if result else None - - def extract_characters(self, scraped_pages: List[Dict[str, Any]], series_name: str) -> Optional[List[CharacterInfo]]: - combined_content = "" - for i, page in enumerate(scraped_pages, 1): - url = page["url"] - markdown = page["markdown"] - combined_content += f"\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n" - - result = self.__process_llm_request( - 
system_prompt=extract_characters_system.get(), - user_prompt=extract_characters_user.get().format( - num_sources=len(scraped_pages), - series_name=series_name, - combined_content=combined_content, - ), - response_model=CharactersList, - error_context="character extraction failed", - ) - - return result.characters if result else None - - def __process_llm_request( - self, - system_prompt: str, - user_prompt: str, - response_model: Type[BaseModel], - error_context: str, - ) -> Optional[BaseModel]: - try: - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ] - - if self.parser_mode == ParserMode.PREMIUM: - content = self.__generate_with_gemini(messages) - else: - content = self.__generate(messages) - - data = self.__extract_json(content) - return response_model(**data) - - except Exception as e: - console.print(f"[red]LLM {error_context}: {e}[/red]") - return None - - def __init_gemini_client(self) -> None: - console.print("[cyan]Initializing Gemini 2.5 Flash via OpenAI SDK...[/cyan]") - try: - api_key = settings.gemini.api_key - if not api_key: - raise ValueError("GEMINI_API_KEY not set in environment") - - self.__openai_client = OpenAI( - base_url="https://generativelanguage.googleapis.com/v1beta/openai/", - api_key=api_key, - ) - console.print("[green]✓ Gemini 2.5 Flash initialized[/green]") - except Exception as e: - console.print(f"[red]Failed to initialize Gemini client: {e}[/red]") - raise e - - def __load_model(self) -> None: - console.print(f"[cyan]Loading LLM: {self.model_name} (vLLM, 128K context)[/cyan]") - try: - self.__model = LLM( - model=self.model_name, - trust_remote_code=True, - max_model_len=131072, - gpu_memory_utilization=0.95, - tensor_parallel_size=1, - dtype="bfloat16", - enable_chunked_prefill=True, - max_num_batched_tokens=16384, - enforce_eager=True, - disable_log_stats=True, - ) - console.print("[green]✓ LLM loaded successfully (vLLM)[/green]") - except Exception as e: - 
console.print(f"[red]Failed to load model: {e}[/red]") - raise e - - def __generate(self, messages: List[Dict], max_tokens: int = 32768) -> str: - sampling_params = SamplingParams( - temperature=0.7, - top_p=0.8, - top_k=20, - max_tokens=max_tokens, - repetition_penalty=1.05, - ) - - outputs = self.__model.chat( - messages=[messages], - sampling_params=sampling_params, - ) - - return outputs[0].outputs[0].text.strip() - - def __generate_with_gemini(self, messages: List[Dict]) -> str: - response = self.__openai_client.chat.completions.create( - model=self.__GEMINI_MODEL_NAME, - messages=messages, - ) - return response.choices[0].message.content.strip() - - @staticmethod - def __extract_json(content: str) -> Dict[str, Any]: - try: - if "```json" in content: - start = content.find("```json") + 7 - end = content.find("```", start) - json_str = content[start:end].strip() - elif "```" in content: - start = content.find("```") + 3 - end = content.find("```", start) - json_str = content[start:end].strip() - else: - json_str = content.strip() - - return json.loads(json_str) - except json.JSONDecodeError as e: - console.print(f"[red]JSON parse error: {e}[/red]") - console.print(f"[yellow]Raw content:\n{content}[/yellow]") - raise diff --git a/preprocessor/scraping/base_scraper.py b/preprocessor/scraping/base_scraper.py deleted file mode 100644 index 3a1951a94..000000000 --- a/preprocessor/scraping/base_scraper.py +++ /dev/null @@ -1,112 +0,0 @@ -from abc import abstractmethod -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from rich.progress import Progress - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.enums import ( - ParserMode, - ScraperMethod, -) -from preprocessor.providers.llm import LLMProvider -from preprocessor.scraping.clipboard import ScraperClipboard -from preprocessor.scraping.crawl4ai import ScraperCrawl4AI -from 
preprocessor.utils.console import ( - console, - create_progress, -) - - -class BaseScraper(BaseProcessor): - def __init__(self, args: Dict[str, Any], error_exit_code: int = 7): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=error_exit_code, - loglevel=logging.DEBUG, - ) - - self.urls: List[str] = self._args["urls"] - self.output_file: Path = self._args["output_file"] - self.headless: bool = self._args.get("headless", True) - - scraper_method_str = self._args.get("scraper_method", "crawl4ai") - self.scraper_method = ScraperMethod(scraper_method_str) - - parser_mode_str = self._args.get("parser_mode", "normal") - self.parser_mode = ParserMode(parser_mode_str) - - self.llm: Optional[LLMProvider] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "urls" not in args or not args["urls"]: - raise ValueError("At least one URL is required") - if "output_file" not in args: - raise ValueError("output_file is required") - - def _execute(self) -> None: - self.llm = LLMProvider(parser_mode=self.parser_mode) - - console.print(f"[blue]Scraping {len(self.urls)} URLs...[/blue]") - - scraped_pages = self.__scrape_all_urls() - - if not scraped_pages: - console.print("[yellow]No pages scraped[/yellow]") - return - - console.print(f"[blue]Scraped {len(scraped_pages)} pages, processing with LLM...[/blue]") - - try: - self._process_scraped_pages(scraped_pages) - except Exception as e: - self.logger.error(f"LLM processing failed: {e}") - - def __scrape_all_urls(self) -> List[Dict[str, Any]]: - scraped_pages = [] - try: - with create_progress() as progress: - task = progress.add_task("Fetching pages", total=len(self.urls)) - - for url in self.urls: - try: - page_text = self.__scrape_url(url, progress) - if page_text: - scraped_pages.append({ - "url": url, - "markdown": page_text, - }) - progress.console.print(f"[green]✓[/green] {url}: {len(page_text)} chars") - else: - self.logger.error(f"Failed to scrape {url}") - except 
Exception as e: - self.logger.error(f"Error scraping {url}: {e}") - finally: - progress.advance(task) - except KeyboardInterrupt: - console.print("\n[yellow]Scraping interrupted[/yellow]") - raise - - return scraped_pages - - def __scrape_url(self, url: str, progress: "Progress") -> Optional[str]: - progress.console.print(f"[cyan]Scraping method: {self.scraper_method.value}[/cyan]") - - if self.scraper_method == ScraperMethod.CLIPBOARD: - return ScraperClipboard.scrape(url, headless=self.headless) - if self.scraper_method == ScraperMethod.CRAWL4AI: - return ScraperCrawl4AI.scrape(url, save_markdown=True, output_dir=settings.scraper.output_dir) - self.logger.error(f"Unknown scraper method: {self.scraper_method}") - return None - - @abstractmethod - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - pass diff --git a/preprocessor/scraping/character_scraper.py b/preprocessor/scraping/character_scraper.py deleted file mode 100644 index 67c29ebff..000000000 --- a/preprocessor/scraping/character_scraper.py +++ /dev/null @@ -1,33 +0,0 @@ -import json -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.scraping.base_scraper import BaseScraper -from preprocessor.utils.console import console - - -class CharacterScraper(BaseScraper): - def __init__(self, args: Dict[str, Any]): - super().__init__(args) - self.series_name: str = self._args.get("series_name", "") - - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - characters = self.llm.extract_characters(scraped_pages, self.series_name) - if not characters: - self.logger.error("LLM failed to extract any character data") - return - - result = { - "sources": [item["url"] for item in scraped_pages], - "characters": [char.model_dump() for char in characters], - } - - self.output_file.parent.mkdir(parents=True, exist_ok=True) - with open(self.output_file, "w", encoding="utf-8") as f: - json.dump(result, f, indent=2, ensure_ascii=False) - - 
console.print(f"[green]✓ Extracted {len(characters)} characters[/green]") - console.print(f"[green]✓ Saved to: {self.output_file}[/green]") diff --git a/preprocessor/scraping/clipboard.py b/preprocessor/scraping/clipboard.py deleted file mode 100644 index daba5f0ec..000000000 --- a/preprocessor/scraping/clipboard.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -from typing import Optional - -from patchright.sync_api import sync_playwright - -logger = logging.getLogger(__name__) - - -class ScraperClipboard: - @staticmethod - def scrape(url: str, headless: bool = True) -> Optional[str]: - try: - with sync_playwright() as p: - browser = p.chromium.launch( - headless=headless, - args=[ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ], - ) - context = browser.new_context() - page = context.new_page() - - page.goto(url, wait_until="networkidle", timeout=30000) - - page.keyboard.press("Control+A") - page.keyboard.press("Control+C") - - clipboard_text = page.evaluate("navigator.clipboard.readText()") - - browser.close() - return clipboard_text - - except Exception as e: - logger.error(f"Clipboard scraping failed: {e}") - return None diff --git a/preprocessor/scraping/crawl4ai.py b/preprocessor/scraping/crawl4ai.py deleted file mode 100644 index 732ddb81d..000000000 --- a/preprocessor/scraping/crawl4ai.py +++ /dev/null @@ -1,64 +0,0 @@ -import asyncio -import logging -from pathlib import Path -from typing import Optional - -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import ( - BrowserConfig, - CrawlerRunConfig, -) -from pathvalidate import sanitize_filename -import ua_generator - -logger = logging.getLogger(__name__) - - -class ScraperCrawl4AI: - @staticmethod - def scrape(url: str, save_markdown: bool = False, output_dir: Optional[Path] = None) -> Optional[str]: - return asyncio.run(ScraperCrawl4AI.__scrape_async(url, save_markdown, output_dir)) - - @staticmethod - def __sanitize_url_to_filename(url: str) -> str: - return 
sanitize_filename(url.replace("://", "_").replace("/", "_")) - - @staticmethod - def __save_markdown(content: str, url: str, output_dir: Path) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - filename = ScraperCrawl4AI.__sanitize_url_to_filename(url) - md_file = output_dir / f"{filename}.md" - with open(md_file, "w", encoding="utf-8") as f: - f.write(content) - logger.info(f"Saved markdown to: {md_file}") - - @staticmethod - async def __scrape_async(url: str, save_markdown: bool = False, output_dir: Optional[Path] = None) -> Optional[str]: - try: - ua = ua_generator.generate() - browser_config = BrowserConfig( - headless=True, - enable_stealth=True, - viewport_width=1920, - viewport_height=1080, - user_agent=str(ua), - ) - run_config = CrawlerRunConfig( - wait_until="networkidle", - page_timeout=60000, - delay_before_return_html=2.0, - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url=url, config=run_config) - - if result.success: - if save_markdown and output_dir: - ScraperCrawl4AI.__save_markdown(result.markdown, url, output_dir) - return result.markdown - logger.error(f"Crawl4AI failed: {result.error_message}") - return None - - except Exception as e: - logger.error(f"Crawl4AI error: {e}") - return None diff --git a/preprocessor/scraping/episode_scraper.py b/preprocessor/scraping/episode_scraper.py deleted file mode 100644 index a9671860e..000000000 --- a/preprocessor/scraping/episode_scraper.py +++ /dev/null @@ -1,84 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from patchright.sync_api import sync_playwright # noqa: F401 # pylint: disable=unused-import - -from preprocessor.scraping.base_scraper import BaseScraper -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json - - -class EpisodeScraper(BaseScraper): - def __init__(self, args: Dict[str, Any]): - super().__init__(args) - self.merge_sources: 
bool = self._args.get("merge_sources", True) - self.expected_episodes_count: Optional[int] = self._args.get("expected_episodes_count") - self.videos_dir: Optional[Path] = self._args.get("videos_dir") - - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - all_seasons = self.llm.extract_all_seasons(scraped_pages) - if not all_seasons: - self.logger.error("LLM failed to extract any season data") - return - - result = { - "sources": [item["url"] for item in scraped_pages], - "seasons": [season.model_dump() for season in all_seasons], - } - - self.output_file.parent.mkdir(parents=True, exist_ok=True) - atomic_write_json(self.output_file, result, indent=2, ensure_ascii=False) - - total_episodes = sum(len(season.episodes) for season in all_seasons) - console.print(f"[green]✓ Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]") - console.print(f"[green]✓ Saved to: {self.output_file}[/green]") - - self.__validate_episode_coverage(total_episodes) - - def __validate_episode_coverage(self, scraped_episodes_count: int) -> None: - expected_count = self.__get_expected_episodes_count() - - if expected_count is None: - console.print("\n[yellow]⚠ Coverage validation:[/yellow]") - console.print(f" [cyan]Scraped episodes: {scraped_episodes_count}[/cyan]") - console.print(" [yellow]No video directory provided - unable to validate coverage[/yellow]") - console.print(" [dim]Make sure the scraped episodes cover all your video files[/dim]") - console.print(" [dim]You can add more --scrape-urls if needed[/dim]\n") - return - - coverage_percentage = (scraped_episodes_count / expected_count * 100) if expected_count > 0 else 0 - - console.print("\n[yellow]⚠ Episode coverage validation:[/yellow]") - console.print(f" [cyan]Scraped episodes: {scraped_episodes_count}[/cyan]") - console.print(f" [cyan]Video files found: {expected_count}[/cyan]") - console.print(f" [cyan]Coverage: {coverage_percentage:.1f}%[/cyan]") - - if scraped_episodes_count < 
expected_count: - console.print(f"\n[red]✗ WARNING: Missing {expected_count - scraped_episodes_count} episodes![/red]") - console.print(" [yellow]Consider adding more URLs to --scrape-urls[/yellow]") - console.print(" [dim]Not all video files will have metadata available[/dim]\n") - elif scraped_episodes_count > expected_count: - console.print(f"\n[yellow]⚠ Note: Scraped {scraped_episodes_count - expected_count} more episodes than video files[/yellow]") - console.print(" [dim]This is OK if you plan to add more videos later[/dim]\n") - else: - console.print("\n[green]✓ Perfect coverage - all video files have metadata![/green]\n") - - def __get_expected_episodes_count(self) -> Optional[int]: - if self.expected_episodes_count is not None: - return self.expected_episodes_count - - if self.videos_dir and self.videos_dir.exists(): - return self.__count_video_files(self.videos_dir) - - return None - - def __count_video_files(self, directory: Path) -> int: - count = 0 - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - count += len(list(directory.rglob(f"*{ext}"))) - return count diff --git a/preprocessor/characters/__init__.py b/preprocessor/scripts/__init__.py similarity index 100% rename from preprocessor/characters/__init__.py rename to preprocessor/scripts/__init__.py diff --git a/preprocessor/scripts/compare_scribe_models.py b/preprocessor/scripts/compare_scribe_models.py new file mode 100644 index 000000000..95fe43745 --- /dev/null +++ b/preprocessor/scripts/compare_scribe_models.py @@ -0,0 +1,278 @@ +# pylint: skip-file +import argparse +import json +import os +from pathlib import Path +import subprocess +import tempfile +import time +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from elevenlabs.client import ElevenLabs +from elevenlabs.core import ApiError + +_ELEVENLABS_MODELS = ('scribe_v1', 'scribe_v2') +_WHISPER_MODEL = 'large-v3-turbo' +_POLLING_INTERVAL = 20 +_MAX_ATTEMPTS = 120 +_ADDITIONAL_FORMATS: List[Dict[str, Any]] = [ + {'format': 
'srt'}, + { + 'format': 'segmented_json', + 'include_speakers': True, + 'include_timestamps': True, + 'segment_on_silence_longer_than_s': 0.5, + 'max_segment_duration_s': 10.0, + 'max_segment_chars': 200, + }, +] + +_WHISPER_DOCKER_SCRIPT = """ +import json, sys +from pathlib import Path +from faster_whisper import WhisperModel + +audio_path = sys.argv[1] +out_path = sys.argv[2] +model_name = sys.argv[3] + +print(f'[whisper] Loading {model_name} on cuda...') +model = WhisperModel( + model_name, + device='cuda', + compute_type='float16', + download_root='/models/huggingface', +) + +print(f'[whisper] Transcribing {Path(audio_path).name}...') +segments_iter, info = model.transcribe( + audio_path, + language='pl', + beam_size=10, + temperature=0.0, + vad_filter=True, +) + +segments = [] +text_parts = [] +for seg in segments_iter: + text = seg.text.strip() + text_parts.append(text) + segments.append({ + 'text': text, + 'start': round(seg.start, 3), + 'end': round(seg.end, 3), + 'words': [ + {'text': w.word, 'start': round(w.start, 3), 'end': round(w.end, 3)} + for w in (seg.words or []) + ], + }) + +result = {'text': ' '.join(text_parts), 'language_code': info.language, 'segments': segments} +Path(out_path).write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding='utf-8') +print(f'[whisper] Done. 
{len(segments)} segments saved to {out_path}') +""" + + +def _extract_audio(video_path: Path, audio_path: Path) -> None: + print(f'Extracting audio from {video_path.name}...') + subprocess.run( + ['ffmpeg', '-y', '-i', str(video_path), '-vn', '-acodec', 'aac', '-b:a', '192k', str(audio_path)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + print(f'Audio extracted: {audio_path.name} ({audio_path.stat().st_size / 1024 / 1024:.1f} MB)') + + +def _submit_job(client: ElevenLabs, audio_path: Path, model_id: str, language_code: str, diarize: bool) -> str: + print(f'[{model_id}] Submitting transcription job...') + with open(audio_path, 'rb') as f: + audio_data = f.read() + + response = client.speech_to_text.convert( + file=audio_data, + model_id=model_id, + language_code=language_code, + tag_audio_events=True, + timestamps_granularity='character', + diarize=diarize, + use_multi_channel=False, + additional_formats=_ADDITIONAL_FORMATS, + webhook=True, + ) + job_id = response.transcription_id + print(f'[{model_id}] Job submitted. 
ID: {job_id}') + return job_id + + +def _poll_job(client: ElevenLabs, model_id: str, job_id: str) -> Optional[Any]: + print(f'[{model_id}] Polling for results (ID: {job_id})...') + for attempt in range(1, _MAX_ATTEMPTS + 1): + try: + result = client.speech_to_text.transcripts.get(transcription_id=job_id) + print(f'[{model_id}] Done after {attempt} poll(s).') + return result + except ApiError as e: + if e.status_code == 404: + print(f'[{model_id}] Not ready yet (attempt {attempt}/{_MAX_ATTEMPTS}), waiting {_POLLING_INTERVAL}s...') + time.sleep(_POLLING_INTERVAL) + else: + raise + raise TimeoutError(f'[{model_id}] Timeout after {_MAX_ATTEMPTS} attempts') + + +def _elevenlabs_result_to_dict(result: Any) -> Dict[str, Any]: + data: Dict[str, Any] = { + 'text': result.text, + 'language_code': result.language_code, + 'segments': [], + 'srt': None, + } + + if not result.additional_formats: + return data + + for fmt in result.additional_formats: + if fmt.requested_format == 'srt': + data['srt'] = fmt.content + elif fmt.requested_format == 'segmented_json': + segmented = json.loads(fmt.content) + for seg in segmented.get('segments', []): + words = seg.get('words', []) + if not words: + continue + non_spacing = [w for w in words if w.get('type') != 'spacing'] + segment: Dict[str, Any] = {'text': seg.get('text', '').strip(), 'words': words} + if non_spacing: + segment['start'] = non_spacing[0].get('start') + segment['end'] = non_spacing[-1].get('end') + segment['speaker'] = non_spacing[0].get('speaker_id') + data['segments'].append(segment) + + return data + + +def _transcribe_whisper_docker(audio_path: Path, json_out: Path) -> None: + print(f'[whisper_{_WHISPER_MODEL}] Running via Docker (model download may take a moment on first run)...') + output_dir = audio_path.parent.resolve() + + audio_in_container = f'/compare_output/{audio_path.name}' + json_in_container = f'/compare_output/{json_out.name}' + + cmd = [ + 'docker', 'run', '--rm', '--gpus', 'all', + '--entrypoint', 
'python', + '-v', f'ranchbot-ai-models:/models', + '-v', f'{output_dir}:/compare_output', + 'ranczo-preprocessor:latest', + '-c', _WHISPER_DOCKER_SCRIPT, + audio_in_container, json_in_container, _WHISPER_MODEL, + ] + + subprocess.run(cmd, check=True) + + +def _save_elevenlabs_result(data: Dict[str, Any], output_dir: Path, model_label: str, stem: str) -> Tuple[Path, Optional[Path]]: + json_path = output_dir / f'{stem}_{model_label}.json' + srt_content = data.pop('srt', None) + json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding='utf-8') + + srt_path = None + if srt_content: + srt_path = output_dir / f'{stem}_{model_label}.srt' + srt_path.write_text(srt_content, encoding='utf-8') + + return json_path, srt_path + + +def _json_exists(output_dir: Path, model_label: str, stem: str) -> bool: + return (output_dir / f'{stem}_{model_label}.json').exists() + + +def main() -> None: + parser = argparse.ArgumentParser(description='Compare scribe_v1, scribe_v2 and Whisper transcription quality.') + parser.add_argument('video', type=Path, help='Path to the video file') + parser.add_argument('--output-dir', '-o', type=Path, default=None, help='Output directory (default: same as video)') + parser.add_argument('--language', default='pol', help='ElevenLabs language code (default: pol)') + parser.add_argument('--no-diarize', action='store_true', help='Disable speaker diarization') + parser.add_argument('--no-whisper', action='store_true', help='Skip Whisper transcription') + args = parser.parse_args() + + video_path: Path = args.video.resolve() + if not video_path.exists(): + raise FileNotFoundError(f'Video file not found: {video_path}') + + output_dir: Path = (args.output_dir or video_path.parent).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + stem = video_path.stem + diarize = not args.no_diarize + whisper_label = f'whisper_{_WHISPER_MODEL}' + + elevenlabs_to_run = [m for m in _ELEVENLABS_MODELS if not _json_exists(output_dir, m, stem)] + 
whisper_needed = not args.no_whisper and not _json_exists(output_dir, whisper_label, stem) + need_audio = bool(elevenlabs_to_run) or whisper_needed + + for m in _ELEVENLABS_MODELS: + if _json_exists(output_dir, m, stem): + print(f'[{m}] Already exists, skipping API call.') + if not whisper_needed and not args.no_whisper: + print(f'[{whisper_label}] Already exists, skipping.') + + job_ids: Dict[str, str] = {} + audio_temp_dir: Optional[tempfile.TemporaryDirectory] = None # type: ignore[type-arg] + audio_path: Optional[Path] = None + + if need_audio: + audio_temp_dir = tempfile.TemporaryDirectory() + audio_path = Path(audio_temp_dir.name) / f'{stem}.aac' + _extract_audio(video_path, audio_path) + + if elevenlabs_to_run: + api_key = os.getenv('ELEVEN_API_KEY', '') + if not api_key: + raise ValueError('ELEVEN_API_KEY environment variable is not set.') + client = ElevenLabs(api_key=api_key) + + assert audio_path is not None + for model in elevenlabs_to_run: + job_ids[model] = _submit_job(client, audio_path, model, args.language, diarize) + + print(f'\n{len(job_ids)} job(s) submitted. 
Polling for results...\n') + for model in elevenlabs_to_run: + result = _poll_job(client, model, job_ids[model]) + data = _elevenlabs_result_to_dict(result) + json_path, srt_path = _save_elevenlabs_result(data, output_dir, model, stem) + print(f'[{model}] Saved: {json_path.name} ({len(data["segments"])} segments, {len(data["text"])} chars)') + if srt_path: + print(f'[{model}] Saved: {srt_path.name}') + + if whisper_needed: + assert audio_path is not None + whisper_audio = output_dir / f'_whisper_tmp_{stem}.aac' + import shutil + shutil.copy2(audio_path, whisper_audio) + try: + json_out = output_dir / f'{stem}_{whisper_label}.json' + _transcribe_whisper_docker(whisper_audio, json_out) + if json_out.exists(): + data = json.loads(json_out.read_text(encoding='utf-8')) + print(f'[{whisper_label}] Saved: {json_out.name} ({len(data["segments"])} segments, {len(data["text"])} chars)') + finally: + whisper_audio.unlink(missing_ok=True) + + if audio_temp_dir: + audio_temp_dir.cleanup() + + print(f'\nDone. 
Compare files in: {output_dir}') + + +if __name__ == '__main__': + main() diff --git a/preprocessor/scripts/deploy_to_nas.py b/preprocessor/scripts/deploy_to_nas.py new file mode 100644 index 000000000..c86baab87 --- /dev/null +++ b/preprocessor/scripts/deploy_to_nas.py @@ -0,0 +1,213 @@ +# pylint: skip-file +import argparse +from concurrent.futures import ( + ThreadPoolExecutor, + as_completed, +) +import os +from pathlib import Path +import shutil +import sys +from typing import ( + List, + Tuple, +) + +_DEPLOY_SUBDIRS = ("archives", "transcoded_videos") +_DEFAULT_WORKERS = 1 + + +def _resolve_source_base(source_path: str) -> Path: + if source_path: + return Path(source_path) + script_dir = Path(__file__).resolve().parent + return script_dir.parent / "output_data" + + +def _collect_files(source_series_dir: Path, target_series_dir: Path) -> List[Tuple[Path, Path]]: + pairs = [] + for subdir in _DEPLOY_SUBDIRS: + source_subdir = source_series_dir / subdir + if not source_subdir.exists(): + print(f" [SKIP] Source not found: {source_subdir}") + continue + for source_file in source_subdir.rglob("*"): + if source_file.is_file(): + relative = source_file.relative_to(source_subdir) + target_file = target_series_dir / relative + pairs.append((source_file, target_file)) + return pairs + + +def _copy_file(src: Path, dst: Path, dry_run: bool) -> Tuple[Path, Path, bool, str]: + try: + if not dry_run: + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + return src, dst, True, "" + except Exception as e: + return src, dst, False, str(e) + + +def _print_summary(total: int, copied: int, skipped: int, failed: int, dry_run: bool) -> None: + prefix = "[DRY RUN] " if dry_run else "" + print(f"\n{prefix}Summary:") + print(f" Total files : {total}") + print(f" Copied : {copied}") + print(f" Skipped : {skipped}") + print(f" Failed : {failed}") + + +def _is_changed(src: Path, dst: Path) -> bool: + if not dst.exists(): + return True + src_stat = os.stat(src) + 
dst_stat = os.stat(dst) + if src_stat.st_size != dst_stat.st_size: + return True + return src_stat.st_mtime > dst_stat.st_mtime + 1 + + +def _filter_files_to_copy( + pairs: List[Tuple[Path, Path]], overwrite: bool, diff_only: bool, +) -> Tuple[List[Tuple[Path, Path]], int]: + to_copy = [] + skipped = 0 + for src, dst in pairs: + if diff_only: + if _is_changed(src, dst): + to_copy.append((src, dst)) + else: + skipped += 1 + elif not overwrite and dst.exists(): + skipped += 1 + else: + to_copy.append((src, dst)) + return to_copy, skipped + + +def _execute_copy_batch( + to_copy: List[Tuple[Path, Path]], + target_series_dir: Path, + dry_run: bool, + workers: int, +) -> Tuple[int, int]: + copied = 0 + failed = 0 + done = 0 + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = {executor.submit(_copy_file, src, dst, dry_run): src for src, dst in to_copy} + for future in as_completed(futures): + src, result_dst, success, error = future.result() + done += 1 + if success: + copied += 1 + rel = result_dst.relative_to(target_series_dir) + print(f" [{'DRY' if dry_run else 'OK'}] {rel} ({done}/{len(to_copy)})") + else: + failed += 1 + print(f" [FAIL] {src.name} — {error}") + return copied, failed + + +def deploy( + source_base: Path, + target_base: Path, + series: str, + dry_run: bool, + workers: int, + overwrite: bool, + diff_only: bool, +) -> int: + source_series_dir = source_base / series + target_series_dir = target_base / series + + if not source_series_dir.exists(): + print(f"ERROR: Source directory not found: {source_series_dir}") + return 1 + + mode_flags = f"{'DRY RUN' if dry_run else 'COPY'} | workers={workers}" + if diff_only: + mode_flags += " | diff-only" + elif overwrite: + mode_flags += " | overwrite" + + print(f"Source : {source_series_dir}") + print(f"Target : {target_series_dir}") + print(f"Mode : {mode_flags}") + print() + + pairs = _collect_files(source_series_dir, target_series_dir) + if not pairs: + print("No files found to copy.") + 
return 0 + + to_copy, skipped = _filter_files_to_copy(pairs, overwrite, diff_only) + skip_reason = "unchanged (size+mtime)" if diff_only else "already exist, use --overwrite to replace" + print(f"Files to copy : {len(to_copy)}") + print(f"Files skipped : {skipped} ({skip_reason})") + print() + + if not to_copy: + _print_summary(len(pairs), 0, skipped, 0, dry_run) + return 0 + + copied, failed = _execute_copy_batch(to_copy, target_series_dir, dry_run, workers) + _print_summary(len(pairs), copied, skipped, failed, dry_run) + return 1 if failed else 0 + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Copy processed series archives and videos to NAS storage.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="Example:\n python -m preprocessor.scripts.deploy_to_nas" + " --target-path //TRUENAS/RanchBot --series kiepscy", + ) + parser.add_argument( + "--target-path", + required=True, + help="Base NAS path (e.g. //TRUENAS/RanchBot or /mnt/truenas/RanchBot)", + ) + parser.add_argument( + "--series", + required=True, + help="Series name (e.g. 
kiepscy, ranczo)", + ) + parser.add_argument( + "--source-path", + default="", + help="Override local output_data base path (default: auto-detected relative to this script)", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite files that already exist on target", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be copied without actually copying", + ) + parser.add_argument( + "--diff-only", + action="store_true", + help="Only copy files that are missing or differ from target (by size or modification time)", + ) + parser.add_argument( + "--workers", + type=int, + default=_DEFAULT_WORKERS, + help=f"Number of parallel copy workers (default: {_DEFAULT_WORKERS})", + ) + + args = parser.parse_args() + + source_base = _resolve_source_base(args.source_path) + target_base = Path(args.target_path) + + sys.exit(deploy(source_base, target_base, args.series, args.dry_run, args.workers, args.overwrite, args.diff_only)) + + +if __name__ == "__main__": + main() diff --git a/preprocessor/scripts/split_double_episodes.py b/preprocessor/scripts/split_double_episodes.py new file mode 100644 index 000000000..5b856a34d --- /dev/null +++ b/preprocessor/scripts/split_double_episodes.py @@ -0,0 +1,261 @@ +# pylint: skip-file +import argparse +import json +import math +from pathlib import Path +import re +import shutil +import subprocess +import sys +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from preprocessor.services.media.scene_detection import TransNetWrapper # noqa: E402 # pylint: disable=wrong-import-position + +_VIDEO_EXTENSIONS: Tuple[str, ...] 
= ('.mkv', '.mp4', '.avi') +_EP_PATTERN = re.compile(r'(S\d{2})E(\d{2})') +_BLACK_PATTERN = re.compile(r'black_start:([\d.]+)\s+black_end:([\d.]+)') + + +def _probe_duration(video_path: Path) -> float: + result = subprocess.run( + ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', str(video_path)], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + check=True, + text=True, + ) + return float(json.loads(result.stdout)['format']['duration']) + + +def _detect_scenes(video_path: Path, threshold: float, min_scene_len: int) -> List[Dict[str, Any]]: + wrapper = TransNetWrapper() + wrapper.load_model() + try: + return wrapper.detect_scenes(video_path, threshold=threshold, min_scene_len=min_scene_len) + finally: + wrapper.cleanup() + + +def _scene_cut_timestamps(scenes: List[Dict[str, Any]]) -> List[float]: + return [ + float(s['start']['seconds']) if isinstance(s.get('start'), dict) else float(s.get('start', 0)) + for s in scenes[1:] + ] + + +def _nearest_cut(cuts: List[float], target: float) -> float: + return min(cuts, key=lambda t: abs(t - target)) + + +def _detect_black_frames( + video_path: Path, + cut: float, + half_window: float, + black_duration: float = 0.02, + pix_threshold: float = 0.10, +) -> List[Tuple[float, float]]: + scan_start = max(0.0, cut - half_window) + result = subprocess.run( + [ + 'ffmpeg', + '-ss', str(scan_start), + '-t', str(half_window * 2), + '-i', str(video_path), + '-vf', f'blackdetect=d={black_duration}:pix_th={pix_threshold}', + '-an', '-f', 'null', '-', + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + return [ + (float(m.group(1)) + scan_start, float(m.group(2)) + scan_start) + for m in _BLACK_PATTERN.finditer(result.stderr) + ] + + +def _adjust_for_black_frames( + cut: float, + black_intervals: List[Tuple[float, float]], + max_distance: float = 5.0, +) -> float: + best_interval: Optional[Tuple[float, float]] = None + best_dist = math.inf + + for black_start, black_end in 
black_intervals: + if black_start <= cut <= black_end: + dist = 0.0 + elif black_end < cut: + dist = cut - black_end + else: + dist = black_start - cut + + if dist <= max_distance and dist < best_dist: + best_dist = dist + best_interval = (black_start, black_end) + + return best_interval[1] if best_interval is not None else cut + + +def _classify_file(video_path: Path, half_window: float) -> Tuple[bool, float]: + midpoint = _probe_duration(video_path) / 2.0 + black_intervals = _detect_black_frames(video_path, midpoint, half_window) + adjusted = _adjust_for_black_frames(midpoint, black_intervals) + return adjusted != midpoint or bool(black_intervals), adjusted + + +def _rename_episode(filename: str, new_ep: int, special: bool = False) -> str: + match = _EP_PATTERN.search(filename) + if not match: + raise ValueError(f'No SxxExx pattern in filename: {filename}') + season = match.group(1) + suffix = '_SPECIAL' if special else '' + replacement = f'{season}E{new_ep:02d}{suffix}' + return filename[:match.start()] + replacement + filename[match.end():] + + +def _ffmpeg_split(video_path: Path, cut_time: float, ep1_path: Path, ep2_path: Path) -> None: + codec = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-cq', '18', '-c:a', 'copy'] + subprocess.run( + ['ffmpeg', '-y', '-i', str(video_path), '-t', str(cut_time)] + codec + [str(ep1_path)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + subprocess.run( + ['ffmpeg', '-y', '-ss', str(cut_time), '-i', str(video_path)] + codec + [str(ep2_path)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + + +def _split_double( + video: Path, + approx_cut: float, + half_window: float, + threshold: float, + min_scene_len: int, + output_dir: Path, + ep_counter: int, +) -> int: + scenes = _detect_scenes(video, threshold, min_scene_len) + cuts = _scene_cut_timestamps(scenes) + raw_cut = _nearest_cut(cuts, approx_cut) if cuts else approx_cut + black_intervals = _detect_black_frames(video, 
raw_cut, half_window) + final_cut = _adjust_for_black_frames(raw_cut, black_intervals) + + ep1_name = _rename_episode(video.name, ep_counter) + ep2_name = _rename_episode(video.name, ep_counter + 1) + + direction = '' + if final_cut != raw_cut: + arrow = 'forward' if final_cut > raw_cut else 'backward' + direction = f' ({arrow} {raw_cut:.3f}s -> {final_cut:.3f}s)' + + print(f' [SPLIT] {video.name} cut={final_cut:.3f}s{direction}') + print(f' E{ep_counter:02d} -> {ep1_name}') + print(f' E{ep_counter + 1:02d} -> {ep2_name}') + + _ffmpeg_split(video, final_cut, output_dir / ep1_name, output_dir / ep2_name) + return ep_counter + 2 + + +def _process_season( + season_dir: Path, + output_dir: Path, + half_window: float, + threshold: float, + min_scene_len: int, + dry_run: bool, +) -> None: + videos = sorted(p for p in season_dir.iterdir() if p.suffix.lower() in _VIDEO_EXTENSIONS) + if not videos: + print(f'[{season_dir.name}] no videos found') + return + + print(f'\n[{season_dir.name}] classifying {len(videos)} file(s)...') + classifications: List[Tuple[Path, bool, float]] = [] + for video in videos: + is_double, cut = _classify_file(video, half_window) + label = 'DOUBLE' if is_double else 'SPECIAL' + cut_info = f' cut={cut:.3f}s' if is_double else '' + print(f' [{label}] {video.name}{cut_info}') + classifications.append((video, is_double, cut)) + + if dry_run: + specials = [v.name for v, is_double, _ in classifications if not is_double] + if specials: + print(f' --- SPECIALS: {specials}') + return + + output_dir.mkdir(parents=True, exist_ok=True) + ep_counter = 1 + + for video, is_double, approx_cut in classifications: + if is_double: + ep_counter = _split_double(video, approx_cut, half_window, threshold, min_scene_len, output_dir, ep_counter) + else: + special_name = _rename_episode(video.name, ep_counter, special=True) + print(f' [COPY ] {video.name}') + print(f' E{ep_counter:02d} -> {special_name}') + shutil.copy2(str(video), str(output_dir / special_name)) + 
ep_counter += 1 + + +def main() -> None: + parser = argparse.ArgumentParser( + description='Split double-episode files and renumber sequentially per season.', + ) + parser.add_argument( + 'season_dirs', nargs='+', type=Path, + help='Season directory/directories to process', + ) + parser.add_argument( + '--output-dir', '-o', type=Path, required=True, + help='Root output directory (S01/S02/... subdirs created automatically)', + ) + parser.add_argument( + '--threshold', type=float, default=0.5, + help='TransNetV2 scene detection threshold (default: 0.5)', + ) + parser.add_argument( + '--min-scene-len', type=int, default=10, + help='Minimum scene length in frames (default: 10)', + ) + parser.add_argument( + '--black-window', type=float, default=15.0, + help='Half-window in seconds for symmetric black frame scan (default: 15)', + ) + parser.add_argument( + '--dry-run', action='store_true', + help='Classify only — no TransNetV2, no splitting, no copying', + ) + + args = parser.parse_args() + + for season_dir in args.season_dirs: + if not season_dir.is_dir(): + print(f'Not a directory, skipping: {season_dir}', file=sys.stderr) + continue + _process_season( + season_dir, + args.output_dir / season_dir.name, + args.black_window, + args.threshold, + args.min_scene_len, + args.dry_run, + ) + + +if __name__ == '__main__': + main() diff --git a/preprocessor/scripts_temp/import_transcriptions.py b/preprocessor/scripts_temp/import_transcriptions.py deleted file mode 100644 index dbe01f8a5..000000000 --- a/preprocessor/scripts_temp/import_transcriptions.py +++ /dev/null @@ -1,94 +0,0 @@ -import json -from pathlib import Path -import re -import shutil -from typing import ( - Optional, - Tuple, -) - -SOURCE_DIR = Path("/mnt/c/GIT_REPO/RANCZO_KLIPY/sceny-trans") -OUTPUT_DIR = Path("/mnt/c/GIT_REPO/RANCZO_KLIPY/preprocessor/output_data/transcriptions") -SERIES_NAME = "ranczo" - - -def parse_filename(filename: str) -> Optional[Tuple[int, int]]: - match = re.search(r"S(\d{2})E(\d{2})", 
filename, re.IGNORECASE) - if match: - return int(match.group(1)), int(match.group(2)) - return None - - -def _copy_and_fix_file(source_dir: Path, filename_base: str, season: int, episode: int) -> bool: - raw_dir = OUTPUT_DIR / f"S{season:02d}" / f"E{episode:02d}" / "raw" - raw_dir.mkdir(parents=True, exist_ok=True) - - episode_info = { - "season": season, - "episode_number": episode, - } - - segmented_src = source_dir / "segmented_json" / f"{filename_base}_segmented.json" - simple_src = source_dir / "simple_json" / f"{filename_base}_simple.json" - srt_src = source_dir / "srt" / f"{filename_base}.srt" - txt_src = source_dir / "txt" / f"{filename_base}.txt" - - if not segmented_src.exists(): - print(f" ERROR: Missing {segmented_src.name}") - return False - - try: # pylint: disable=too-many-try-statements - with open(segmented_src, "r", encoding="utf-8") as f: - segmented_data = json.load(f) - segmented_data["episode_info"] = episode_info - segmented_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}_segmented.json" - with open(segmented_dst, "w", encoding="utf-8") as f: - json.dump(segmented_data, f, indent=2, ensure_ascii=False) - print(f" Created: {segmented_dst}") - - with open(simple_src, "r", encoding="utf-8") as f: - simple_data = json.load(f) - simple_data["episode_info"] = episode_info - simple_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}_simple.json" - with open(simple_dst, "w", encoding="utf-8") as f: - json.dump(simple_data, f, indent=2, ensure_ascii=False) - print(f" Created: {simple_dst}") - - srt_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}.srt" - shutil.copy2(srt_src, srt_dst) - print(f" Created: {srt_dst}") - - txt_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}.txt" - shutil.copy2(txt_src, txt_dst) - print(f" Created: {txt_dst}") - - return True - except Exception as e: - print(f" ERROR: {e}") - return False - - -def main() -> None: - print(f"Source: {SOURCE_DIR}") - print(f"Output: {OUTPUT_DIR}") - 
- segmented_dir = SOURCE_DIR / "segmented_json" - if not segmented_dir.exists(): - print(f"ERROR: {segmented_dir} does not exist") - return - - for segmented_file in sorted(segmented_dir.glob("*_segmented.json")): - filename_base = segmented_file.stem.replace("_segmented", "") - - parsed = parse_filename(filename_base) - if not parsed: - print(f"Skipping (cannot parse): {filename_base}") - continue - - season, episode = parsed - print(f"{filename_base} -> S{season:02d}E{episode:02d}") - _copy_and_fix_file(SOURCE_DIR, filename_base, season, episode) - - -if __name__ == "__main__": - main() diff --git a/preprocessor/search/elastic_manager.py b/preprocessor/search/elastic_manager.py deleted file mode 100644 index 43d607d4c..000000000 --- a/preprocessor/search/elastic_manager.py +++ /dev/null @@ -1,379 +0,0 @@ -import json - -from elasticsearch import ( - AsyncElasticsearch, - exceptions as es_exceptions, -) -import urllib3 - -from preprocessor.config.config import settings - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -# pylint: disable=duplicate-code -class ElasticSearchManager: - INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_info": { - "type": "object", - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "viewership": {"type": "keyword"}, - "description": {"type": "text"}, - "summary": {"type": "text"}, - "is_special_feature": {"type": "boolean"}, - "special_feature_type": {"type": "keyword"}, - }, - }, - "text": {"type": "text"}, - "start": {"type": "float"}, - "end": {"type": "float"}, - "video_path": {"type": "keyword"}, - "transcription": { - "type": "object", - "properties": { - "format": {"type": "keyword"}, - "source_file": {"type": "keyword"}, - "language_code": {"type": "keyword"}, - 
"language_probability": {"type": "float"}, - "segments": { - "type": "nested", - "properties": { - "id": {"type": "integer"}, - "start": {"type": "float"}, - "end": {"type": "float"}, - "text": {"type": "text"}, - "speaker": {"type": "keyword"}, - "words": {"type": "object", "enabled": False}, - }, - }, - }, - }, - "scene_timestamps": { - "type": "object", - "properties": { - "total_scenes": {"type": "integer"}, - "video_info": { - "type": "object", - "properties": { - "fps": {"type": "float"}, - "duration": {"type": "float"}, - "total_frames": {"type": "integer"}, - }, - }, - "detection_settings": {"type": "object", "enabled": False}, - "scenes": { - "type": "nested", - "properties": { - "scene_number": {"type": "integer"}, - "start": {"type": "object", "enabled": False}, - "end": {"type": "object", "enabled": False}, - "duration": {"type": "float"}, - "frame_count": {"type": "integer"}, - }, - }, - }, - }, - "text_embeddings": { - "type": "nested", - "properties": { - "segment_range": {"type": "integer"}, - "text": {"type": "text"}, - "embedding": {"type": "float", "index": False}, - }, - }, - "video_embeddings": { - "type": "nested", - "properties": { - "frame_number": {"type": "integer"}, - "timestamp": {"type": "float"}, - "type": {"type": "keyword"}, - "embedding": {"type": "float", "index": False}, - }, - }, - "id": {"type": "integer"}, - "seek": {"type": "integer"}, - "author": {"type": "keyword"}, - "comment": {"type": "text"}, - "tags": {"type": "keyword"}, - "location": {"type": "keyword"}, - "actors": {"type": "keyword"}, - }, - }, - } - - SEGMENTS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, - "premiere_date": {"type": "date", "format": 
"dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - "viewership": {"type": "keyword"}, - }, - }, - "segment_id": {"type": "integer"}, - "text": { - "type": "text", - "analyzer": "standard", - "fields": { - "keyword": {"type": "keyword"}, - }, - }, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "speaker": {"type": "keyword"}, - "video_path": {"type": "keyword"}, - "scene_info": { - "properties": { - "scene_number": {"type": "integer"}, - "scene_start_time": {"type": "float"}, - "scene_end_time": {"type": "float"}, - "scene_start_frame": {"type": "integer"}, - "scene_end_frame": {"type": "integer"}, - }, - }, - }, - }, - } - - TEXT_EMBEDDINGS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - }, - }, - "embedding_id": {"type": "integer"}, - "segment_range": {"type": "integer"}, - "text": {"type": "text"}, - "text_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - VIDEO_EMBEDDINGS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - }, - }, - "frame_number": {"type": "integer"}, - "timestamp": {"type": "float"}, - 
"frame_type": {"type": "keyword"}, - "scene_number": {"type": "integer"}, - "video_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "perceptual_hash": {"type": "keyword"}, - "perceptual_hash_int": {"type": "unsigned_long"}, - "video_path": {"type": "keyword"}, - "character_appearances": { - "type": "nested", - "properties": { - "name": {"type": "keyword"}, - "confidence": {"type": "float"}, - "emotion": { - "properties": { - "label": {"type": "keyword"}, - "confidence": {"type": "float"}, - }, - }, - }, - }, - "detected_objects": { - "type": "nested", - "properties": { - "class": {"type": "keyword"}, - "count": {"type": "integer"}, - }, - }, - "scene_info": { - "properties": { - "scene_start_time": {"type": "float"}, - "scene_end_time": {"type": "float"}, - "scene_start_frame": {"type": "integer"}, - "scene_end_frame": {"type": "integer"}, - }, - }, - }, - }, - } - - EPISODE_NAMES_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - "viewership": {"type": "keyword"}, - }, - }, - "title": { - "type": "text", - "analyzer": "standard", - "fields": { - "keyword": {"type": "keyword"}, - }, - }, - "title_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - FULL_EPISODE_EMBEDDINGS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, 
- "episode_number": {"type": "integer"}, - "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - "viewership": {"type": "keyword"}, - }, - }, - "full_transcript": {"type": "text"}, - "transcript_length": {"type": "integer"}, - "full_episode_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - SOUND_EVENTS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - }, - }, - "segment_id": {"type": "integer"}, - "text": {"type": "text", "analyzer": "standard"}, - "sound_type": {"type": "keyword"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "video_path": {"type": "keyword"}, - "scene_info": { - "properties": { - "scene_id": {"type": "integer"}, - "scene_start": {"type": "float"}, - "scene_end": {"type": "float"}, - }, - }, - }, - }, - } - - SOUND_EVENT_EMBEDDINGS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - }, - }, - "embedding_id": {"type": "integer"}, - "segment_range": {"type": "integer_range"}, - "text": {"type": "text"}, - "sound_types": {"type": "keyword"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "sound_event_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - @staticmethod - async 
def connect_to_elasticsearch( - es_host: str, - es_user: str, - es_pass: str, - logger, - ) -> AsyncElasticsearch: - es_config = { - "hosts": [es_host], - "verify_certs": False, - "request_timeout": 30, - "max_retries": 3, - "retry_on_timeout": True, - } - - if es_user and es_pass: - es_config["basic_auth"] = (es_user, es_pass) - - es = AsyncElasticsearch(**es_config) - try: - if not await es.ping(): - raise es_exceptions.ConnectionError("Failed to connect to Elasticsearch") - logger.info(f"Connected to Elasticsearch at {es_host}") - return es - except (es_exceptions.ConnectionError, Exception) as e: - error_msg = f"Cannot connect to Elasticsearch at {es_host}" - if "Connection refused" in str(e) or "Failed to establish" in str(e): - logger.error(f"{error_msg} - is Elasticsearch running?") - else: - logger.error(f"{error_msg}: {str(e)}") - raise es_exceptions.ConnectionError(error_msg) from e -# pylint: enable=duplicate-code diff --git a/preprocessor/series_configs/README.md b/preprocessor/series_configs/README.md new file mode 100644 index 000000000..ce277fb8c --- /dev/null +++ b/preprocessor/series_configs/README.md @@ -0,0 +1,74 @@ +# Series Configs + +Każda seria to `{series_name}.json` zawierający **tylko różnice** względem `defaults.json`. + +## Wymagane pola + +```json +{ + "series_name": "nazwa_serii", + "display_name": "Nazwa Wyświetlana", + "indexing": { "elasticsearch": { "index_name": "nazwa_serii_clips" } }, + "scraping": { + "episodes": { "urls": ["https://..."] }, + "characters": { "urls": ["https://..."] } + } +} +``` + +> `series_name` musi zgadzać się z nazwą pliku i katalogu `input_data/`. +> `urls` są **zawsze wymagane** przez parser — nawet jeśli scraper jest w `skip_steps`. Jeśli dane masz ręcznie, wpisz URL źródłowy skąd pochodzą (dla dokumentacji). 
+ +## Pipeline mode + +| Wartość | Opis | +|---|---| +| `"full"` (domyślny) | Uruchamia wszystkie kroki | +| `"selective"` | Pomija kroki z listy `skip_steps` | + +## skip_steps + +| ID | Co pomija | +|---|---| +| `episode_scraper` | Scrapowanie listy odcinków | +| `character_scraper` | Scrapowanie listy postaci | +| `character_reference` | Pobieranie zdjęć referencyjnych postaci | +| `transcription` | Transkrypcja audio | +| `index_to_elasticsearch` | Wysyłanie do Elasticsearch | +| `generate_archives` | Generowanie archiwów ZIP | + +Jednorazowe pominięcie bez zmiany configa: `run-all --series X --skip index_to_elasticsearch`. + +## Transkrypcja + +```json +"processing": { "transcription": { "mode": "elevenlabs" } } +``` + +| `mode` | Opis | +|---|---| +| `"whisper"` (domyślny) | Lokalny model Whisper (CUDA) | +| `"elevenlabs"` / `"11labs"` | API ElevenLabs (`ELEVENLABS_API_KEY`) | + +Import gotowych transkrypcji (format 11labs): +```json +"processing": { + "transcription_import": { "format_type": "11labs_segmented", "source_dir": "/transcriptions/nazwa_serii" } +} +``` + +## Zdjęcia referencyjne postaci + +```json +"scraping": { "character_references": { "images_per_character": 2, "search_engine": "google" } } +``` + +`images_per_character: 0` pomija pobieranie. Domyślna wyszukiwarka: `"duckduckgo"` (bez API). `"google"` wymaga SerpAPI. + +## Elasticsearch + +```json +"indexing": { "elasticsearch": { "index_name": "nazwa_serii_clips", "host": "localhost:9200", "append": false, "dry_run": false } } +``` + +`dry_run: true` — generuje dokumenty ale nie wysyła. `append: true` — dopisuje do istniejącego indeksu. 
diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json new file mode 100644 index 000000000..126d6aadf --- /dev/null +++ b/preprocessor/series_configs/defaults.json @@ -0,0 +1,51 @@ +{ + "_comment": "Domy\u015blna konfiguracja - wszystkie serie dziedzicz\u0105 te ustawienia", + "_note": "Seria mo\u017ce nadpisa\u0107 dowolne pole w swoim pliku JSON", + "indexing": { + "elasticsearch": { + "append": false, + "dry_run": false, + "host": "localhost:9200" + } + }, + "pipeline_mode": "full", + "processing": { + "frame_export": { + "frames_per_scene": 1 + }, + "scene_detection": { + "min_scene_len": 10, + "threshold": 0.5 + }, + "transcode": { + "bitrate_boost_ratio": 1.1, + "force_deinterlace": false, + "keyframe_interval_seconds": 0.5, + "max_bitrate_duration_seconds": 100.0, + "max_bitrate_file_size_mb": 50.0, + "min_bitrate_mbps": 2.0, + "resolution": "720p" + }, + "transcription": { + "device": "cuda", + "language": "pl", + "mode": "whisper", + "model": "large-v3-turbo" + } + }, + "scraping": { + "character_references": { + "images_per_character": 3, + "search_engine": "duckduckgo", + "search_query_template": "Serial {series_name} {char_name} posta\u0107", + "source": "clusters" + }, + "characters": { + "parser_mode": "normal" + }, + "episodes": { + "parser_mode": "normal" + } + }, + "skip_steps": [] +} diff --git a/preprocessor/series_configs/kapitan_bomba.json b/preprocessor/series_configs/kapitan_bomba.json new file mode 100644 index 000000000..5b81694cb --- /dev/null +++ b/preprocessor/series_configs/kapitan_bomba.json @@ -0,0 +1,37 @@ +{ + "display_name": "Kapitan Bomba", + "indexing": { + "elasticsearch": { + "host": "http://192.168.1.210:19200", + "index_name": "kapitan_bomba" + } + }, + "processing": { + "transcription_import": { + "format_type": "11labs_segmented", + "season_remap": { + "10": 0 + }, + "source_dir": "/transcriptions/kapitan_bomba" + } + }, + "scraping": { + "character_references": { + 
"images_per_character": 0 + }, + "characters": { + "urls": [ + "https://dubbingpedia.pl/wiki/Kapitan_Bomba" + ] + }, + "episodes": { + "urls": [ + "https://pl.wikipedia.org/wiki/Kapitan_Bomba_(serial_animowany)" + ] + } + }, + "series_name": "kapitan_bomba", + "skip_steps": [ + "episode_scraper" + ] +} diff --git a/preprocessor/series_configs/kiepscy.json b/preprocessor/series_configs/kiepscy.json new file mode 100644 index 000000000..5b2921ccf --- /dev/null +++ b/preprocessor/series_configs/kiepscy.json @@ -0,0 +1,35 @@ +{ + "_comment": "Konfiguracja dla Kiepscy - tylko zmiany wzgl\u0119dem defaults.json", + "display_name": "\u015awiat wed\u0142ug Kiepskich", + "indexing": { + "elasticsearch": { + "index_name": "kiepscy" + } + }, + "processing": { + "transcode": { + "force_deinterlace": false + }, + "transcription": { + "mode": "11labs" + } + }, + "scraping": { + "character_references": { + "search_engine": "google" + }, + "characters": { + "parser_mode": "premium", + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_postaci_serialu_%C5%9Awiat_wed%C5%82ug_Kiepskich" + ] + }, + "episodes": { + "parser_mode": "premium", + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_odcink%C3%B3w_serialu_%C5%9Awiat_wed%C5%82ug_Kiepskich" + ] + } + }, + "series_name": "kiepscy" +} diff --git a/preprocessor/series_configs/pingwiny_z_madagaskaru.json b/preprocessor/series_configs/pingwiny_z_madagaskaru.json new file mode 100644 index 000000000..b8fc8a534 --- /dev/null +++ b/preprocessor/series_configs/pingwiny_z_madagaskaru.json @@ -0,0 +1,37 @@ +{ + "_comment": "Konfiguracja dla Pingwin\u00f3w z Madagaskaru - tylko zmiany wzgl\u0119dem defaults.json", + "_note": "Metadane odcink\u00f3w i bohater\u00f3w dostarczone r\u0119cznie. Transkrypcje przez ElevenLabs. 
Indeksowanie do ES r\u0119czne.", + "display_name": "Pingwiny z Madagaskaru", + "indexing": { + "elasticsearch": { + "index_name": "pingwiny_z_madagaskaru_clips" + } + }, + "pipeline_mode": "selective", + "processing": { + "transcription": { + "mode": "elevenlabs" + } + }, + "scraping": { + "character_references": { + "images_per_character": 2 + }, + "characters": { + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_postaci_serialu_Pingwiny_z_Madagaskaru" + ] + }, + "episodes": { + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_odcink%C3%B3w_serialu_Pingwiny_z_Madagaskaru" + ] + } + }, + "series_name": "pingwiny_z_madagaskaru", + "skip_steps": [ + "episode_scraper", + "character_scraper", + "index_to_elasticsearch" + ] +} diff --git a/preprocessor/series_configs/ranczo.json b/preprocessor/series_configs/ranczo.json new file mode 100644 index 000000000..9c1be0ef1 --- /dev/null +++ b/preprocessor/series_configs/ranczo.json @@ -0,0 +1,40 @@ +{ + "_comment": "Konfiguracja dla Ranczo - tylko zmiany wzgl\u0119dem defaults.json", + "_note": "Metadane, bohaterowie i transkrypcje przygotowane r\u0119cznie/z 11labs", + "display_name": "Ranczo", + "indexing": { + "elasticsearch": { + "index_name": "ranczo_clips" + } + }, + "pipeline_mode": "selective", + "processing": { + "transcription": { + "mode": "elevenlabs" + } + }, + "scraping": { + "character_references": { + "search_engine": "google" + }, + "characters": { + "parser_mode": "premium", + "urls": [ + "https://ranczo.fandom.com/pl/wiki/Postacie" + ] + }, + "episodes": { + "parser_mode": "premium", + "urls": [ + "https://ranczo.fandom.com/pl/wiki/Lista_odcink\u00f3w" + ] + } + }, + "series_name": "ranczo", + "skip_steps": [ + "episode_scraper", + "character_scraper", + "character_reference", + "transcription" + ] +} diff --git a/preprocessor/series_configs/sejm_demo.json b/preprocessor/series_configs/sejm_demo.json new file mode 100644 index 000000000..2164929bb --- /dev/null +++ 
b/preprocessor/series_configs/sejm_demo.json @@ -0,0 +1,31 @@ +{ + "display_name": "Sejm RP - Demo", + "indexing": { + "elasticsearch": { + "index_name": "sejm_demo" + } + }, + "pipeline_mode": "selective", + "scraping": { + "character_references": { + "images_per_character": 2, + "search_engine": "normal", + "search_query_template": "{char_name} pose\u0142" + }, + "characters": { + "urls": [ + "https://www.sejm.gov.pl/Sejm10.nsf/poslowie.xsp" + ] + }, + "episodes": { + "urls": [ + "https://www.sejm.gov.pl/Sejm10.nsf/transmisje.xsp" + ] + } + }, + "series_name": "sejm_demo", + "skip_steps": [ + "episode_scraper", + "character_scraper" + ] +} diff --git a/preprocessor/series_configs/template.json b/preprocessor/series_configs/template.json new file mode 100644 index 000000000..1f47d56cf --- /dev/null +++ b/preprocessor/series_configs/template.json @@ -0,0 +1,44 @@ +{ + "_comment": "Template - skopiuj i edytuj dla swojej serii", + "_instruction": "1. Skopiuj jako {nazwa_serii}.json, 2. Wype\u0142nij wymagane pola, 3. 
Dodaj tylko te opcje kt\u00f3re si\u0119 r\u00f3\u017cni\u0105 od defaults", + "_note": "Podaj tylko ZMIANY wzgl\u0119dem defaults.json (nie przepisuj wszystkiego!)", + "_optional_overrides": { + "_comment": "Poni\u017cej opcjonalne nadpisania defaults.json - odkomentuj i dostosuj wed\u0142ug potrzeb", + "pipeline_mode_example": "selective (je\u015bli masz gotowe dane)", + "processing.transcode.codec_example": "h265_nvenc (je\u015bli chcesz HEVC)", + "processing.transcode.resolution_example": "1080p (je\u015bli chcesz wy\u017csz\u0105 jako\u015b\u0107)", + "processing.transcription.mode_example": "elevenlabs (je\u015bli masz 11labs API)", + "scraping.character_references.search_engine_example": "google (je\u015bli masz SerpAPI)", + "scraping.episodes.parser_mode_example": "premium (je\u015bli masz Gemini API)", + "skip_steps_example": [ + "episode_scraper", + "transcription" + ] + }, + "_required_fields": { + "display_name": "WYMAGANE - nazwa wy\u015bwietlana", + "indexing.elasticsearch.index_name": "WYMAGANE - nazwa indeksu ES", + "scraping.characters.urls": "WYMAGANE - lista URLi do stron z postaciami", + "scraping.episodes.urls": "WYMAGANE - lista URLi do stron z odcinkami", + "series_name": "WYMAGANE - nazwa serii (musi zgadza\u0107 si\u0119 z nazw\u0105 pliku i katalogu input_data/)" + }, + "display_name": "Nazwa Wy\u015bwietlana", + "indexing": { + "elasticsearch": { + "index_name": "nazwa_serii_clips" + } + }, + "scraping": { + "characters": { + "urls": [ + "https://example.com/wiki/Characters" + ] + }, + "episodes": { + "urls": [ + "https://example.com/wiki/Episodes" + ] + } + }, + "series_name": "nazwa_serii" +} diff --git a/preprocessor/cli/options/__init__.py b/preprocessor/services/__init__.py similarity index 100% rename from preprocessor/cli/options/__init__.py rename to preprocessor/services/__init__.py diff --git a/preprocessor/services/ai/__init__.py b/preprocessor/services/ai/__init__.py new file mode 100644 index 000000000..6792c58b0 --- 
class BaseLLMClient(ABC):
    """Common interface for chat-style LLM backends.

    Concrete clients receive a list of chat messages (dicts with string keys
    and values) and return the generated text.
    """

    @abstractmethod
    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str:
        """Return the model's reply to ``messages``.

        Args:
            messages: Chat messages to send to the model.
            max_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated text.
        """
        ...
class GeminiClient(BaseLLMClient):
    """LLM client that talks to Gemini through the OpenAI-compatible SDK."""

    def __init__(
        self,
        model_name: str = 'gemini-2.5-flash',
        base_url: str = 'https://generativelanguage.googleapis.com/v1beta/openai/',
        api_key: Optional[str] = None,
    ) -> None:
        """Store the connection settings and create the SDK client.

        Args:
            model_name: Model identifier sent with every request.
            base_url: Base URL handed to the ``OpenAI`` client.
            api_key: API key; falls back to ``settings.gemini.api_key`` when
                not given.

        Raises:
            ValueError: If no API key is available.
        """
        self.__model_name = model_name
        self.__base_url = base_url
        self.__api_key = api_key or settings.gemini.api_key
        self.__client: Optional[OpenAI] = None
        self.__init_client()

    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str:
        """Send ``messages`` to the model and return the stripped reply text.

        Args:
            messages: Chat messages to send.
            max_tokens: Upper bound on generated tokens, forwarded to the API.

        Returns:
            The content of the first completion choice, stripped of
            surrounding whitespace.

        Raises:
            RuntimeError: If the client is not initialized, or the response
                carries no choices or no text content.
        """
        if self.__client is None:
            raise RuntimeError('Gemini client not initialized')

        response = self.__client.chat.completions.create(
            model=self.__model_name,
            messages=messages,  # type: ignore[arg-type]
            # Honour the limit declared in the BaseLLMClient interface.
            max_tokens=max_tokens,
        )
        if not response.choices:
            raise RuntimeError('Gemini returned no completion choices')

        # ``content`` is optional in the SDK response model; fail with a clear
        # message instead of an AttributeError on ``None.strip()``.
        content = response.choices[0].message.content
        if content is None:
            raise RuntimeError('Gemini returned an empty completion')
        return content.strip()

    def __init_client(self) -> None:
        """Create the ``OpenAI`` client, logging progress to the console."""
        console.print(f'[cyan]Initializing {self.__model_name} via OpenAI SDK...[/cyan]')
        try:
            if not self.__api_key:
                raise ValueError('GEMINI_API_KEY not set in environment')

            self.__client = OpenAI(
                base_url=self.__base_url,
                api_key=self.__api_key,
            )
            console.print(f'[green]{self.__model_name} initialized[/green]')
        except Exception as e:
            console.print(f'[red]Failed to initialize Gemini client: {e}[/red]')
            raise
class SeasonMetadata(BaseModel):
    """Episodes of a single season, with support for a legacy episode layout."""

    episodes: List[EpisodeInfo]
    season_number: int

    @model_validator(mode='before')
    @classmethod
    def _convert_old_format(cls, data: Dict) -> Dict:
        """Upgrade legacy episode dicts before field validation runs.

        A legacy episode has ``episode_number`` but no ``episode_in_season``.
        For such an episode ``episode_in_season`` becomes its 1-based position
        in the list and ``overall_episode_number`` takes the old
        ``episode_number`` value.

        The conversion works on copies, so the caller's input is never
        modified. Input that is not a dict, or whose ``episodes`` value is not
        a list/tuple, is returned untouched so regular validation reports it.
        """
        if not isinstance(data, dict):
            return data

        raw_episodes = data.get('episodes')
        if not isinstance(raw_episodes, (list, tuple)):
            return data

        converted = []
        for position, episode in enumerate(raw_episodes, start=1):
            is_legacy = (
                isinstance(episode, dict)
                and 'episode_number' in episode
                and 'episode_in_season' not in episode
            )
            if not is_legacy:
                converted.append(episode)
                continue

            upgraded = {key: value for key, value in episode.items() if key != 'episode_number'}
            upgraded['episode_in_season'] = position
            upgraded['overall_episode_number'] = episode['episode_number']
            converted.append(upgraded)

        return {**data, 'episodes': converted}
class LLMProvider:
    """Turns scraped pages into validated metadata by prompting an LLM.

    The backend is chosen once, at construction time: ``ParserMode.PREMIUM``
    uses ``GeminiClient``; any other mode uses ``VLLMClient``.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        parser_mode: Optional[ParserMode] = None,
    ) -> None:
        """Create the provider and its backend client.

        Args:
            model_name: Model name forwarded to ``VLLMClient`` only.
            parser_mode: Backend selector; defaults to ``ParserMode.NORMAL``.
        """
        self.__parser_mode = parser_mode or ParserMode.NORMAL

        # Annotate once so type checkers do not see an attribute redefinition.
        self.__client: BaseLLMClient
        if self.__parser_mode == ParserMode.PREMIUM:
            self.__client = GeminiClient()
        else:
            self.__client = VLLMClient(model_name=model_name)

    def extract_all_seasons(
        self, scraped_pages: List[Dict[str, Any]],
    ) -> Optional[List[SeasonMetadata]]:
        """Extract season metadata from the scraped pages.

        Args:
            scraped_pages: Page dicts; each must provide ``url`` and
                ``markdown`` keys.

        Returns:
            The parsed seasons, or ``None`` when the request or parsing fails.
        """
        combined_content = self.__build_combined_content(scraped_pages)

        result = self.__process_llm_request(
            system_prompt=extract_all_seasons_system.get(),
            user_prompt=extract_all_seasons_user.get().format(
                num_sources=len(scraped_pages),
                combined_content=combined_content,
            ),
            response_model=AllSeasonsMetadata,
            error_context='extraction failed',
        )
        return result.seasons if result else None

    def extract_characters(
        self,
        scraped_pages: List[Dict[str, Any]],
        series_name: str,
    ) -> Optional[List[CharacterInfo]]:
        """Extract the character list for ``series_name`` from the scraped pages.

        Args:
            scraped_pages: Page dicts; each must provide ``url`` and
                ``markdown`` keys.
            series_name: Series name substituted into the user prompt.

        Returns:
            The parsed characters, or ``None`` when the request or parsing
            fails.
        """
        combined_content = self.__build_combined_content(scraped_pages)

        result = self.__process_llm_request(
            system_prompt=extract_characters_system.get(),
            user_prompt=extract_characters_user.get().format(
                num_sources=len(scraped_pages),
                series_name=series_name,
                combined_content=combined_content,
            ),
            response_model=CharactersList,
            error_context='character extraction failed',
        )
        return result.characters if result else None

    @staticmethod
    def __build_combined_content(scraped_pages: List[Dict[str, Any]]) -> str:
        """Join all pages into one string, each under a numbered source header."""
        combined_parts: List[str] = []
        for i, page in enumerate(scraped_pages, 1):
            url: str = page['url']
            markdown: str = page['markdown']
            combined_parts.append(
                f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n',
            )
        return ''.join(combined_parts)

    @staticmethod
    def __extract_json(content: str) -> Dict[str, Any]:
        """Parse the JSON payload out of an LLM reply.

        A ```` ```json ```` fence is preferred, then a plain ```` ``` ````
        fence; without any fence the whole reply is parsed. A fence that is
        never closed is read to the end of the reply.

        Raises:
            json.JSONDecodeError: If the selected text is not valid JSON.
        """
        json_fence = '```json'
        plain_fence = '```'

        fence_at = content.find(json_fence)
        if fence_at != -1:
            body_start = fence_at + len(json_fence)
        else:
            fence_at = content.find(plain_fence)
            body_start = fence_at + len(plain_fence) if fence_at != -1 else -1

        if body_start == -1:
            json_str = content.strip()
        else:
            body_end = content.find(plain_fence, body_start)
            if body_end == -1:
                # Unterminated fence: slicing with -1 would silently drop the
                # last character of the payload, so read to the end instead.
                body_end = len(content)
            json_str = content[body_start:body_end].strip()

        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            console.print(f'[red]JSON parse error: {e}[/red]')
            console.print(f'[yellow]Raw content:\n{content}[/yellow]')
            raise

    def __process_llm_request(
        self,
        system_prompt: str,
        user_prompt: str,
        response_model: Type[BaseModel],
        error_context: str,
    ) -> Optional[BaseModel]:
        """Run one system+user prompt and validate the reply.

        Args:
            system_prompt: Text of the system message.
            user_prompt: Text of the user message.
            response_model: Pydantic model instantiated from the parsed JSON.
            error_context: Short label included in the failure log line.

        Returns:
            The validated model instance, or ``None`` on any failure. The
            error is printed to the console rather than raised.
        """
        try:
            messages = [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt},
            ]
            content = self.__client.generate(messages)
            data = self.__extract_json(content)
            return response_model(**data)
        except Exception as e:
            # Deliberate best-effort boundary: callers treat None as "no data".
            console.print(f'[red]LLM {error_context}: {e}[/red]')
            return None
instead of _process()") + + def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioArtifact: + output_path = self.__resolve_output_path(input_data, context) + + if self.__is_cached(input_data, output_path, context): + context.logger.info(f'Skipping {input_data.episode_id} (cached audio)') + return self.__create_artifact(input_data, output_path) + + context.logger.info(f'Extracting audio for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + self.__extract_audio(input_data.path, output_path, context) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__create_artifact(input_data, output_path) + + def __resolve_output_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: + episode_code = input_data.episode_info.episode_code() + output_filename = f'{context.series_name}_{episode_code}.{self.config.format}' + + return context.get_output_path( + input_data.episode_info, + 'extracted_audio', + output_filename, + ) + + def __is_cached( + self, input_data: SourceVideo, output_path: Path, context: ExecutionContext, + ) -> bool: + if not output_path.exists() or context.force_rerun: + return False + + return context.is_step_completed(self.name, input_data.episode_id) + + def __extract_audio( + self, input_path: Path, output_path: Path, context: ExecutionContext, + ) -> None: + try: + FFmpegWrapper.extract_audio( + input_path, + output_path, + codec='pcm_s16le', + sample_rate=self.config.sample_rate, + channels=self.config.channels, + ) + except Exception as e: + context.logger.error(f'FFmpeg audio extraction failed: {e}') + if output_path.exists(): + output_path.unlink() + raise + + def __create_artifact(self, input_data: SourceVideo, output_path: Path) -> AudioArtifact: + return AudioArtifact( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + format=self.config.format, + ) diff --git 
class ClusterFolderManager:
    """Writes face clusters to disk as folders and reads labeled folders back.

    Each face entry is a dict with the keys ``vector`` (embedding),
    ``frame_path`` (``Path`` of the source frame) and ``bbox``
    (``(x1, y1, x2, y2)`` pixel box).
    """

    # Frame files are copied with their original suffix, so reading them back
    # must not be limited to a single extension.
    _IMAGE_SUFFIXES: Tuple[str, ...] = ('.jpg', '.jpeg', '.png')

    @staticmethod
    def create_cluster_folders(
        face_data: List[Dict[str, Any]],
        labels: np.ndarray,
        output_dir: Path,
        logger: Optional[ErrorHandlingLogger] = None,
    ) -> int:
        """Create one folder per cluster, largest cluster first.

        Clusters are written to ``output_dir/<rank>`` where rank 0 is the
        cluster with the most faces. Faces labeled ``-1`` go to
        ``output_dir/_noise``.

        Args:
            face_data: Face entries, paired positionally with ``labels``.
            labels: Cluster label per face; ``-1`` marks noise.
            output_dir: Directory that receives the cluster folders.
            logger: Optional logger for the summary message.

        Returns:
            The number of cluster folders created (noise excluded).
        """
        groups: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
        noise: List[Dict[str, Any]] = []
        for face_info, label in zip(face_data, labels):
            if int(label) == -1:
                noise.append(face_info)
            else:
                groups[int(label)].append(face_info)

        sorted_clusters = sorted(groups.items(), key=lambda x: len(x[1]), reverse=True)
        output_dir.mkdir(parents=True, exist_ok=True)

        for rank, (_, faces) in enumerate(sorted_clusters):
            ClusterFolderManager.__populate_cluster_dir(output_dir / str(rank), faces)

        if noise:
            ClusterFolderManager.__populate_cluster_dir(output_dir / '_noise', noise)

        cluster_count = len(sorted_clusters)
        if logger:
            logger.info(f"Created {cluster_count} cluster folders in {output_dir}")
        return cluster_count

    @staticmethod
    def __populate_cluster_dir(
        cluster_dir: Path,
        faces: List[Dict[str, Any]],
    ) -> None:
        """Fill ``cluster_dir`` with ``frames/`` copies and ``faces/`` crops.

        Files are named ``<rank>_<stem>_<hash8><suffix>``; the hash is derived
        from the full source path. Existing files are left untouched.
        """
        frames_dir = cluster_dir / 'frames'
        faces_dir = cluster_dir / 'faces'
        frames_dir.mkdir(parents=True, exist_ok=True)
        faces_dir.mkdir(parents=True, exist_ok=True)

        for frame_rank, (frame_path, _, bbox) in enumerate(
            ClusterFolderManager._rank_frames_by_centrality(faces),
        ):
            hash8 = hashlib.sha256(str(frame_path).encode()).hexdigest()[:8]
            dest_name = f"{frame_rank:04d}_{frame_path.stem}_{hash8}{frame_path.suffix}"

            frame_dest = frames_dir / dest_name
            if not frame_dest.exists():
                shutil.copy2(frame_path, frame_dest)

            face_dest = faces_dir / dest_name
            if not face_dest.exists():
                ClusterFolderManager._save_face_crop(frame_path, bbox, face_dest)

    @staticmethod
    def _save_face_crop(
        frame_path: Path,
        bbox: Tuple[int, int, int, int],
        dest_path: Path,
    ) -> None:
        """Write the ``bbox`` region of the frame to ``dest_path``.

        Nothing is written when the frame cannot be read or the crop is empty.
        """
        img = cv2.imread(str(frame_path))
        if img is None:
            return
        x1, y1, x2, y2 = bbox
        crop = img[y1:y2, x1:x2]
        if crop.size > 0:
            cv2.imwrite(str(dest_path), crop)

    @staticmethod
    def _rank_frames_by_centrality(
        faces: List[Dict[str, Any]],
    ) -> List[Tuple[Path, float, Tuple[int, int, int, int]]]:
        """Order frames by how close their best face is to the cluster centroid.

        Similarity is the dot product of a face vector with the normalized
        mean vector. Each frame keeps only its highest-scoring face.

        Returns:
            ``(frame_path, similarity, bbox)`` tuples, most central first;
            an empty list for empty input.
        """
        if not faces:
            # Avoid taking the mean of an empty array.
            return []

        vectors = np.array([f['vector'] for f in faces])
        centroid = np.mean(vectors, axis=0)
        norm = np.linalg.norm(centroid)
        if norm > 1e-6:
            centroid /= norm

        frame_best: Dict[Path, Tuple[float, Tuple[int, int, int, int]]] = {}
        for face_info in faces:
            frame_path: Path = face_info['frame_path']
            sim = float(np.dot(face_info['vector'], centroid))
            bbox: Tuple[int, int, int, int] = face_info['bbox']
            if frame_path not in frame_best or sim > frame_best[frame_path][0]:
                frame_best[frame_path] = (sim, bbox)

        return sorted(
            [(path, sim, bbox) for path, (sim, bbox) in frame_best.items()],
            key=lambda x: x[1],
            reverse=True,
        )

    @staticmethod
    def get_labeled_folders(cluster_dir: Path) -> Dict[str, Path]:
        """Return sub-folders that carry a name rather than a rank.

        Folders whose name is all digits or starts with ``_`` are skipped.
        A missing ``cluster_dir`` yields an empty dict.
        """
        if not cluster_dir.exists():
            return {}
        return {
            d.name: d
            for d in sorted(cluster_dir.iterdir())
            if d.is_dir() and not d.name.isdigit() and not d.name.startswith('_')
        }

    @staticmethod
    def is_complete(
        cluster_dir: Path,
        character_names: List[str],
    ) -> Tuple[bool, List[str]]:
        """Check that every character has a labeled folder.

        Names are compared after ``_normalize_name``.

        Returns:
            ``(all_present, missing_names)``.
        """
        labeled = ClusterFolderManager.get_labeled_folders(cluster_dir)
        normalized_labels = {ClusterFolderManager._normalize_name(n) for n in labeled}
        missing = [
            name for name in character_names
            if ClusterFolderManager._normalize_name(name) not in normalized_labels
        ]
        return len(missing) == 0, missing

    @staticmethod
    def extract_face_vector(
        cluster_folder: Path,
        face_app: FaceAnalysis,
        logger: Optional[ErrorHandlingLogger] = None,
    ) -> Optional[np.ndarray]:
        """Compute one unit-length embedding that represents ``cluster_folder``.

        Images are read from ``cluster_folder/frames`` when it exists,
        otherwise from ``cluster_folder`` itself. Every face found in every
        image contributes an embedding; the dominant one is returned.

        Returns:
            The normalized embedding, or ``None`` when there are no images,
            no detected faces, or the result has near-zero norm.
        """
        frames_dir = cluster_folder / 'frames'
        search_dir = frames_dir if frames_dir.exists() else cluster_folder
        frame_files = sorted(
            candidate
            for candidate in search_dir.iterdir()
            if candidate.is_file()
            and candidate.suffix.lower() in ClusterFolderManager._IMAGE_SUFFIXES
        )
        if not frame_files:
            if logger:
                logger.warning(f"No frames in {cluster_folder}")
            return None

        all_embeddings: List[np.ndarray] = []
        for frame_path in frame_files:
            img = cv2.imread(str(frame_path))
            if img is None:
                continue
            for face in face_app.get(img):
                all_embeddings.append(face.normed_embedding)

        if not all_embeddings:
            if logger:
                logger.warning(f"No faces detected in {cluster_folder}")
            return None

        vectors = np.array(all_embeddings)
        dominant = ClusterFolderManager._find_dominant_embedding(vectors)
        if dominant is None:
            return None

        norm = np.linalg.norm(dominant)
        if norm < 1e-6:
            return None
        return dominant / norm

    @staticmethod
    def _find_dominant_embedding(vectors: np.ndarray) -> Optional[np.ndarray]:
        """Estimate the centroid of the main group of ``vectors``.

        Starts from the normalized mean and refines it up to three times,
        each time keeping only vectors at or above the 30th percentile of
        similarity to the current centroid. A single vector is returned as a
        copy. Returns ``None`` when the initial mean has near-zero norm.
        """
        if len(vectors) == 1:
            return vectors[0].copy()

        centroid = np.mean(vectors, axis=0)
        norm = np.linalg.norm(centroid)
        if norm < 1e-6:
            return None
        centroid = centroid / norm

        for _ in range(3):
            sims = vectors @ centroid
            threshold = float(np.percentile(sims, 30))
            mask = sims >= threshold
            if mask.sum() < 1:
                break
            centroid = np.mean(vectors[mask], axis=0)
            norm = np.linalg.norm(centroid)
            if norm < 1e-6:
                break
            centroid = centroid / norm

        return centroid

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Lower-case ``name`` and replace spaces and hyphens with underscores."""
        return name.lower().replace(' ', '_').replace('-', '_')
min(img.shape[1], bbox[2]) + y2 = min(img.shape[0], bbox[3]) + + if x2 <= x1 or y2 <= y1: + continue + if (x2 - x1) < min_face_px or (y2 - y1) < min_face_px: + continue + + face_data.append({ + 'vector': face.normed_embedding, + 'frame_path': frame_path, + 'face_idx': face_idx, + 'bbox': (x1, y1, x2, y2), + }) + + return face_data + + @staticmethod + def _prefetch_images( + frame_files: List[Path], + workers: int, + prefetch_size: int, + ) -> Generator[Tuple[Path, Optional[np.ndarray]], None, None]: + with ThreadPoolExecutor(max_workers=workers) as pool: + it = iter(frame_files) + pending: deque = deque( + (path, pool.submit(cv2.imread, str(path))) + for path in islice(it, prefetch_size) + ) + while pending: + path, future = pending.popleft() + try: + next_path = next(it) + pending.append((next_path, pool.submit(cv2.imread, str(next_path)))) + except StopIteration: + pass + yield path, future.result() + + @staticmethod + def cluster_embeddings( + face_data: List[Dict[str, Any]], + min_cluster_size: int, + min_samples: int, + ) -> np.ndarray: + n_samples = len(face_data) + if n_samples < 2: + return np.zeros(n_samples, dtype=np.intp) + + vectors = np.array([fd['vector'] for fd in face_data]) + vectors_gpu = cp.asarray(vectors) + + effective_min_samples = min(min_samples, n_samples) + effective_min_cluster_size = min(min_cluster_size, n_samples) + + clusterer = cuHDBSCAN( + min_cluster_size=effective_min_cluster_size, + min_samples=effective_min_samples, + metric='euclidean', + cluster_selection_method='eom', + ) + labels = clusterer.fit_predict(vectors_gpu) + return cp.asnumpy(labels) + + @staticmethod + def build_cluster_output( + face_data: List[Dict[str, Any]], + labels: np.ndarray, + save_noise: bool, + episode_id: str, + series_name: str, + min_cluster_size: int, + min_samples: int, + model_name: str, + total_frames: int, + ) -> Dict[str, Any]: + groups: Dict[int, List[Dict[str, Any]]] = defaultdict(list) + for face_info, label in zip(face_data, labels): + 
groups[int(label)].append(face_info) + + clusters, noise_info = FaceClusterer.__build_cluster_entries(groups, save_noise) + n_noise = len(groups.get(-1, [])) + frames_with_faces = len({fd['frame_path'] for fd in face_data}) + + return { + 'episode_id': episode_id, + 'series_name': series_name, + 'processing_params': { + 'min_cluster_size': min_cluster_size, + 'min_samples': min_samples, + 'metric': 'euclidean', + 'algorithm': 'hdbscan', + 'cluster_selection_method': 'eom', + 'model': model_name, + }, + 'statistics': { + 'total_faces_detected': len(face_data), + 'total_clusters': len(clusters), + 'noise_faces': n_noise, + 'frames_processed': total_frames, + 'frames_with_faces': frames_with_faces, + }, + 'clusters': clusters, + 'noise': noise_info if save_noise else {}, + } + + @staticmethod + def cleanup_gpu_memory() -> None: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + @staticmethod + def __build_cluster_entries( + groups: Dict[int, List[Dict[str, Any]]], + save_noise: bool, + ) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]: + clusters: Dict[str, Dict[str, Any]] = {} + noise_info: Dict[str, Any] = {} + + for cluster_id, faces in sorted(groups.items()): + frames_seen = sorted({fd['frame_path'].name for fd in faces}) + entry: Dict[str, Any] = { + 'face_count': len(faces), + 'frame_count': len(frames_seen), + 'frames': frames_seen, + 'character_name': None, + } + if cluster_id == -1: + if save_noise: + noise_info = entry + else: + clusters[f'cluster_{cluster_id}'] = entry + + return clusters, noise_info diff --git a/preprocessor/services/characters/face_detection.py b/preprocessor/services/characters/face_detection.py new file mode 100644 index 000000000..639866106 --- /dev/null +++ b/preprocessor/services/characters/face_detection.py @@ -0,0 +1,242 @@ +import os +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) +import warnings + +import cv2 +from insightface.app import FaceAnalysis 
+import numpy as np +from numpy.linalg import norm +import onnxruntime as ort + +from preprocessor.config.settings_instance import settings +from preprocessor.services.ui.console import console + +warnings.filterwarnings( + 'ignore', + message='.*estimate.*is deprecated.*', + category=FutureWarning, + module='insightface', +) + + +class FaceDetector: + @staticmethod + def detect_characters_in_frame( + frame_path: Path, + face_app: FaceAnalysis, + character_vectors: Dict[str, np.ndarray], + threshold: float, + ) -> List[Dict[str, Any]]: + img = cv2.imread(str(frame_path)) + if img is None: + return [] + + faces = face_app.get(img) + if not faces: + return [] + + detected = [] + for face in faces: + match = FaceDetector.__find_best_match( + face.normed_embedding, character_vectors, threshold, + ) + if match: + char_name, confidence = match + detected.append( + FaceDetector.__format_detection_result(char_name, confidence, face.bbox), + ) + + detected.sort(key=lambda x: x['confidence'], reverse=True) + return detected + + @staticmethod + def init(det_thresh: Optional[float] = None) -> FaceAnalysis: + model_root = os.getenv('INSIGHTFACE_HOME', os.path.expanduser('~/.insightface')) + FaceDetector.__check_cuda_availability() + + providers = FaceDetector.__build_providers_config() + + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=UserWarning, module='onnxruntime') + warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') + + face_app = FaceDetector.__init_face_app(model_root, providers, det_thresh) + FaceDetector.__verify_active_providers(face_app) + + FaceDetector.__print_init_success(model_root, det_thresh) + return face_app + + @staticmethod + def load_character_references( + characters_dir: Path, + face_app: FaceAnalysis, + ) -> Dict[str, np.ndarray]: + console.print('[blue]Loading character references...[/blue]') + character_vectors: Dict[str, np.ndarray] = {} + + for char_dir in characters_dir.iterdir(): + if not 
char_dir.is_dir(): + continue + + char_name = char_dir.name.replace('_', ' ').title() + vector = FaceDetector.__load_or_compute_vector(char_dir, char_name, face_app) + + if vector is not None: + character_vectors[char_name] = vector + + console.print(f'[green]Loaded {len(character_vectors)} characters[/green]') + return character_vectors + + @staticmethod + def __find_best_match( + face_embedding: np.ndarray, + character_vectors: Dict[str, np.ndarray], + threshold: float, + ) -> Optional[Tuple[str, float]]: + best_match = None + best_similarity = threshold + + for char_name, char_vector in character_vectors.items(): + similarity = float(np.dot(face_embedding, char_vector)) + if similarity > best_similarity: + best_similarity = similarity + best_match = char_name + + return (best_match, best_similarity) if best_match else None + + @staticmethod + def __format_detection_result( + char_name: str, + confidence: float, + bbox: np.ndarray, + ) -> Dict[str, Any]: + bbox_int = bbox.astype(int) + return { + 'name': char_name, + 'confidence': confidence, + 'bbox': { + 'x1': int(bbox_int[0]), + 'y1': int(bbox_int[1]), + 'x2': int(bbox_int[2]), + 'y2': int(bbox_int[3]), + }, + } + + @staticmethod + def __check_cuda_availability() -> None: + available_providers = ort.get_available_providers() + console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") + + if 'CUDAExecutionProvider' not in available_providers: + console.print('[red]CUDAExecutionProvider not available in onnxruntime[/red]') + console.print('[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]') + raise RuntimeError('CUDA provider not available in onnxruntime') + + @staticmethod + def __build_providers_config() -> List[Tuple[str, Dict[str, Any]]]: + return [( + 'CUDAExecutionProvider', + { + 'device_id': 0, + 'arena_extend_strategy': 'kNextPowerOfTwo', + 'gpu_mem_limit': 8 * 1024 * 1024 * 1024, + 'cudnn_conv_algo_search': 'EXHAUSTIVE', + 
'do_copy_in_default_stream': True, + }, + )] + + @staticmethod + def __init_face_app( + model_root: str, + providers: List[Tuple[str, Dict[str, Any]]], + det_thresh_override: Optional[float], + ) -> FaceAnalysis: + model_name = settings.face_recognition.model_name + det_thresh = det_thresh_override if det_thresh_override is not None else settings.character.face_detection_threshold + console.print(f'[cyan]Loading {model_name} face detection model (GPU-only)...[/cyan]') + + try: + face_app = FaceAnalysis(name=model_name, root=model_root, providers=providers) + face_app.prepare( + ctx_id=0, + det_size=settings.face_recognition.detection_size, + det_thresh=det_thresh, + ) + return face_app + except Exception as e: + console.print('[red]Failed to initialize face detection on GPU[/red]') + console.print(f'[red] Error: {e}[/red]') + console.print('[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]') + raise RuntimeError('GPU required but face detection initialization failed') from e + + @staticmethod + def __verify_active_providers(face_app: FaceAnalysis) -> None: + actual_providers = face_app.models['detection'].session.get_providers() + if 'CUDAExecutionProvider' not in actual_providers: + console.print('[red]CUDA provider not active after initialization[/red]') + console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") + raise RuntimeError('CUDA required but not available for face detection') + + @staticmethod + def __print_init_success(model_root: str, det_thresh_override: Optional[float]) -> None: + model_name = settings.face_recognition.model_name + det_size = settings.face_recognition.detection_size + det_thresh = det_thresh_override if det_thresh_override is not None else settings.character.face_detection_threshold + + console.print(f'[green]Face detection initialized ({model_name})[/green]') + console.print('[dim] Device: GPU (CUDA)[/dim]') + console.print(f'[dim] Detection size: {det_size}[/dim]') + console.print(f'[dim] 
Face detection threshold: {det_thresh}[/dim]') + console.print(f'[dim] Model cache: {model_root}[/dim]') + + @staticmethod + def __load_or_compute_vector( + char_dir: Path, + char_name: str, + face_app: FaceAnalysis, + ) -> Optional[np.ndarray]: + vector_file = char_dir / 'face_vector.npy' + if vector_file.exists(): + console.print(f'[dim]{char_name}: loaded from face_vector.npy[/dim]') + return np.load(vector_file) + + images = list(char_dir.glob('*.jpg')) + if not images: + return None + + embeddings = [] + for img_path in images: + emb = FaceDetector.__get_face_embedding(str(img_path), face_app) + if emb is not None: + embeddings.append(emb) + + if embeddings: + mean_emb = np.mean(embeddings, axis=0) + centroid = mean_emb / norm(mean_emb) + console.print(f'[green]{char_name}: {len(embeddings)} reference images[/green]') + return centroid + + return None + + @staticmethod + def __get_face_embedding(img_path: str, face_app: FaceAnalysis) -> Optional[np.ndarray]: + img = cv2.imread(img_path) + if img is None: + return None + + faces = face_app.get(img) + if not faces: + return None + + faces.sort( + key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]), + reverse=True, + ) + return faces[0].normed_embedding diff --git a/preprocessor/services/characters/image_search/__init__.py b/preprocessor/services/characters/image_search/__init__.py new file mode 100644 index 000000000..241b34030 --- /dev/null +++ b/preprocessor/services/characters/image_search/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.services.characters.image_search.bing_image_search import BrowserBingImageSearch +from preprocessor.services.characters.image_search.duckduckgo_browser_image_search import BrowserDuckDuckGoImageSearch +from preprocessor.services.characters.image_search.google_image_search import GoogleImageSearch +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + +__all__ = ['BaseImageSearch', 'BrowserBingImageSearch', 
'BrowserDuckDuckGoImageSearch', 'GoogleImageSearch'] diff --git a/preprocessor/services/characters/image_search/bing_image_search.py b/preprocessor/services/characters/image_search/bing_image_search.py new file mode 100644 index 000000000..fdfd5b1d9 --- /dev/null +++ b/preprocessor/services/characters/image_search/bing_image_search.py @@ -0,0 +1,91 @@ +import signal +import time +from typing import ( + Any, + Dict, + Iterator, + List, +) +from urllib.parse import quote + +from patchright.sync_api import BrowserContext + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + +_SEARCH_URL = 'https://www.bing.com/images/search' +_PAGE_TIMEOUT_MS = 12000 +_LOAD_WAIT_S = 2.0 +_SCROLL_STEPS = 3 +_SCROLL_PAUSE_S = 1.0 +_HARD_TIMEOUT_S = 25 + + +class _SearchTimeout(Exception): + pass + + +class BrowserBingImageSearch(BaseImageSearch): + def __init__(self, browser_context: BrowserContext, max_results: int = 100) -> None: + super().__init__(max_results) + self.__browser_context = browser_context + + @property + def name(self) -> str: + return 'Bing Images (Browser)' + + def search(self, query: str) -> Iterator[Dict[str, str]]: + yield from self.__fetch_with_timeout(query) + + def __fetch_with_timeout(self, query: str) -> List[Dict[str, str]]: + page = self.__browser_context.new_page() + page.set_default_timeout(_PAGE_TIMEOUT_MS) + + old_handler = signal.signal(signal.SIGALRM, self.__raise_timeout) + signal.alarm(_HARD_TIMEOUT_S) + try: + url = f'{_SEARCH_URL}?q={quote(query)}&count={self._max_results}&form=HDRSC2' + page.goto(url, wait_until='commit', timeout=_PAGE_TIMEOUT_MS) + time.sleep(_LOAD_WAIT_S) + self.__scroll_for_more(page) + return self.__extract_results(page) + except Exception: + return [] + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, old_handler) + try: + page.close() + except Exception: + pass + + @staticmethod + def __raise_timeout(signum: int, frame: Any) -> None: + raise _SearchTimeout() + + @staticmethod + 
def __scroll_for_more(page: Any) -> None: + for _ in range(_SCROLL_STEPS): + try: + page.evaluate('window.scrollBy(0, window.innerHeight * 3)') + time.sleep(_SCROLL_PAUSE_S) + except Exception: + break + + def __extract_results(self, page: Any) -> List[Dict[str, str]]: + raw: List[Dict[str, str]] = page.evaluate("""() => { + const out = []; + for (const el of document.querySelectorAll('.iusc')) { + try { + const m = JSON.parse(el.getAttribute('m') || '{}'); + if (m.murl) out.push({image: m.murl, thumbnail: m.turl || ''}); + } catch(e) {} + } + if (out.length === 0) { + for (const img of document.querySelectorAll('img.mimg, img[data-src]')) { + const src = img.getAttribute('data-src') || img.src || ''; + if (src && src.startsWith('http')) out.push({image: src, thumbnail: src}); + } + } + return out; + }""") + return raw[:self._max_results] diff --git a/preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py new file mode 100644 index 000000000..21f10ded7 --- /dev/null +++ b/preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py @@ -0,0 +1,58 @@ +from typing import ( + Any, + Dict, + Iterator, + List, +) +from urllib.parse import quote + +from patchright.sync_api import BrowserContext + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + +_SEARCH_URL = 'https://duckduckgo.com/' +_NETWORK_IDLE_TIMEOUT = 15000 +_SCROLL_STEPS = 3 +_SCROLL_PAUSE_MS = 1500 + + +class BrowserDuckDuckGoImageSearch(BaseImageSearch): + def __init__(self, browser_context: BrowserContext, max_results: int = 100) -> None: + super().__init__(max_results) + self.__browser_context = browser_context + + @property + def name(self) -> str: + return 'DuckDuckGo Images (Browser)' + + def search(self, query: str) -> Iterator[Dict[str, str]]: + page = self.__browser_context.new_page() + collected: List[Dict[str, str]] = [] + + def 
_on_response(response: Any) -> None: + if 'duckduckgo.com/i.js' not in response.url: + return + try: + body = response.json() + for item in body.get('results', []): + url = item.get('image') or item.get('thumbnail', '') + if url: + collected.append({'image': url, 'thumbnail': item.get('thumbnail', '')}) + except Exception: + pass + + page.on('response', _on_response) + try: + url = f'{_SEARCH_URL}?q={quote(query)}&iax=images&ia=images' + page.goto(url, wait_until='networkidle', timeout=_NETWORK_IDLE_TIMEOUT) + self.__scroll_for_more(page, collected) + yield from collected[:self._max_results] + finally: + page.close() + + def __scroll_for_more(self, page: Any, collected: List[Dict[str, str]]) -> None: + for _ in range(_SCROLL_STEPS): + if len(collected) >= self._max_results: + break + page.evaluate('window.scrollBy(0, window.innerHeight * 3)') + page.wait_for_timeout(_SCROLL_PAUSE_MS) diff --git a/preprocessor/services/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py new file mode 100644 index 000000000..59fb15726 --- /dev/null +++ b/preprocessor/services/characters/image_search/google_image_search.py @@ -0,0 +1,43 @@ +from typing import ( + Any, + Dict, + Iterator, +) + +from serpapi import GoogleSearch + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + + +class GoogleImageSearch(BaseImageSearch): + def __init__(self, api_key: str, max_results: int = 50) -> None: + super().__init__(max_results) + + if not api_key: + raise ValueError('SerpAPI key is required for Google Image Search') + + self.__api_key = api_key + + @property + def name(self) -> str: + return 'Google Images (SerpAPI)' + + def search(self, query: str) -> Iterator[Dict[str, str]]: + params = self.__build_search_params(query) + raw_results = GoogleSearch(params).get_dict() + yield from self.__iter_image_data(raw_results) + + def __build_search_params(self, query: str) -> Dict[str, str]: + 
return { + 'engine': 'google_images', + 'q': query, + 'hl': 'pl', + 'gl': 'pl', + 'api_key': self.__api_key, + } + + def __iter_image_data(self, raw_results: Dict[str, Any]) -> Iterator[Dict[str, str]]: + for img in raw_results.get('images_results', [])[:self._max_results]: + url = img.get('original') or img.get('thumbnail', '') + if url: + yield {'image': url, 'thumbnail': img.get('thumbnail', '')} diff --git a/preprocessor/characters/base_image_search.py b/preprocessor/services/characters/image_search/image_search.py similarity index 56% rename from preprocessor/characters/base_image_search.py rename to preprocessor/services/characters/image_search/image_search.py index 9bdd4642e..3107cdde3 100644 --- a/preprocessor/characters/base_image_search.py +++ b/preprocessor/services/characters/image_search/image_search.py @@ -4,19 +4,19 @@ ) from typing import ( Dict, - List, + Iterator, ) class BaseImageSearch(ABC): - def __init__(self, max_results: int = 50): - self.max_results = max_results + def __init__(self, max_results: int = 50) -> None: + self._max_results = max_results + @property @abstractmethod - def search(self, query: str) -> List[Dict[str, str]]: + def name(self) -> str: pass - @property @abstractmethod - def name(self) -> str: + def search(self, query: str) -> Iterator[Dict[str, str]]: pass diff --git a/preprocessor/services/characters/image_search/serpapi_image_search.py.delete b/preprocessor/services/characters/image_search/serpapi_image_search.py.delete new file mode 100644 index 000000000..244569e41 --- /dev/null +++ b/preprocessor/services/characters/image_search/serpapi_image_search.py.delete @@ -0,0 +1,51 @@ +from typing import ( + Any, + Dict, + List, +) + +from serpapi import GoogleSearch + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + + +class SerpApiImageSearch(BaseImageSearch): + def __init__(self, api_key: str, max_results: int = 50) -> None: + super().__init__(max_results) + + if not api_key: + 
raise ValueError('SerpAPI key is required for Google Image Search') + + self.__api_key = api_key + + @property + def name(self) -> str: + return 'Google Images API (SerpAPI)' + + def search(self, query: str) -> List[Dict[str, str]]: + params = self.__build_search_params(query) + search_client = GoogleSearch(params) + raw_results = search_client.get_dict() + + return self.__extract_image_data(raw_results) + + def __build_search_params(self, query: str) -> Dict[str, str]: + return { + 'engine': 'google_images', + 'q': query, + 'hl': 'pl', + 'gl': 'pl', + 'api_key': self.__api_key, + } + + def __extract_image_data(self, raw_results: Dict[str, Any]) -> List[Dict[str, str]]: + images: List[Dict[str, str]] = [] + image_results = raw_results.get('images_results', [])[:self._max_results] + + for img_result in image_results: + images.append({ + 'image': img_result.get('original', ''), + 'thumbnail': img_result.get('thumbnail', ''), + }) + + return images diff --git a/preprocessor/services/characters/models.py b/preprocessor/services/characters/models.py new file mode 100644 index 000000000..013b4ebcf --- /dev/null +++ b/preprocessor/services/characters/models.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import List + +import numpy as np + + +@dataclass(frozen=True) +class FaceData: + bbox: np.ndarray + face_img: np.ndarray + face_vector: np.ndarray + source_image_idx: int + source_image_path: Path + + +@dataclass(frozen=True) +class CandidateFace: + avg_similarity: float + faces: List[FaceData] diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py new file mode 100644 index 000000000..a5d42e4eb --- /dev/null +++ b/preprocessor/services/characters/reference_downloader.py @@ -0,0 +1,357 @@ +from __future__ import annotations + +import io +import json +import logging +from pathlib import Path +import random +import time +from typing import ( + Any, + 
Dict, + Iterator, + List, + Optional, + Tuple, +) +import warnings + +from PIL import Image +import cv2 +from insightface.app import FaceAnalysis +import numpy as np +from patchright.sync_api import ( + BrowserContext, + Page, + Playwright, + sync_playwright, +) + +from preprocessor.config.settings_instance import settings +from preprocessor.services.characters.face_detection import FaceDetector +from preprocessor.services.characters.image_search import ( + BaseImageSearch, + BrowserBingImageSearch, + GoogleImageSearch, +) +from preprocessor.services.core.base_processor import ( + BaseProcessor, + OutputSpec, + ProcessingItem, +) +from preprocessor.services.ui.console import console + + +class CharacterReferenceDownloader(BaseProcessor): + def __init__(self, args: Dict[str, Any]) -> None: + super().__init__( + args=args, + class_name=self.__class__.__name__, + error_exit_code=8, + loglevel=logging.DEBUG, + ) + self.__characters_json: Path = self._args['characters_json'] + self.__series_name: str = self._args['series_name'] + self.__output_dir: Path = self._args.get( + 'output_dir', settings.character.get_output_dir(self.__series_name), + ) + self.__images_per_character: int = self._args.get( + 'images_per_character', settings.character.reference_images_per_character, + ) + + self.__max_results: int = settings.image_scraper.max_results_to_scrape + self.__min_width: int = settings.image_scraper.min_image_width + self.__min_height: int = settings.image_scraper.min_image_height + self.__search_engine_name: str = self._args.get('search_engine', 'normal') + self.__force_rerun: bool = self._args.get('force_rerun', False) + self.__search_query_template: str = self._args.get( + 'search_query_template', 'Serial {series_name} {char_name} postać', + ) + + self.__search_engine: Optional[BaseImageSearch] = None + self.__face_app: Optional[FaceAnalysis] = None + self.__playwright: Optional[Playwright] = None + self.__browser_context: Optional[BrowserContext] = None + + def 
get_output_subdir(self) -> str: + return 'character_references' + + def cleanup(self) -> None: + if self.__browser_context: + self.__browser_context.close() + if self.__playwright: + self.__playwright.stop() + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'characters_json' not in args: + raise ValueError("Argument 'characters_json' is required.") + + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + char_name = item.metadata['char_name'] + output_folder = self.__output_dir / char_name.replace(' ', '_').lower() + + exhausted_marker = output_folder / '.exhausted' + if not self.__force_rerun and exhausted_marker.exists(): + return [OutputSpec(path=exhausted_marker, required=True)] + + return [ + OutputSpec(path=output_folder / f'{i:02d}.jpg', required=True) + for i in range(self.__images_per_character) + ] + + def _get_processing_items(self) -> List[ProcessingItem]: + if not self.__characters_json.exists(): + console.print(f'[red]Characters JSON not found: {self.__characters_json}[/red]') + return [] + + with open(self.__characters_json, encoding='utf-8') as f: + data = json.load(f) + + return [ + ProcessingItem( + episode_id=f"char_{char['name']}", + input_path=self.__characters_json, + metadata={'char_name': char['name']}, + ) + for char in data.get('characters', []) + ] + + def _load_resources(self) -> bool: + self.__face_app = FaceDetector.init() + self.__playwright = sync_playwright().start() + self.__browser_context = self.__playwright.chromium.launch_persistent_context( + user_data_dir='/tmp/patchright_profile', + headless=True, + args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], + ignore_default_args=['--enable-automation'], + ) + self.__search_engine = self.__create_search_engine() + return True + + def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: + char_name = item.metadata['char_name'] + output_folder = self.__prepare_output_folder(char_name) + + 
saved_count = len(list(output_folder.glob('*.jpg'))) + if saved_count >= self.__images_per_character: + return + + assert self.__search_engine is not None + + search_query = self.__search_query_template.format( + series_name=self.__series_name, char_name=char_name, + ) + self.logger.info(f'Searching: {search_query}') + + saved_count = self.__execute_search_with_retries(search_query, char_name, output_folder, saved_count) + self.__log_final_results(char_name, saved_count) + self.__apply_random_delay() + + if saved_count == 0: + self.__mark_exhausted(output_folder, char_name) + + def __create_search_engine(self) -> BaseImageSearch: + if self.__search_engine_name == 'premium': + return GoogleImageSearch( + api_key=settings.image_scraper.serpapi_key, + max_results=self.__max_results, + ) + return BrowserBingImageSearch( + browser_context=self.__browser_context, + max_results=self.__max_results, + ) + + def __prepare_output_folder(self, char_name: str) -> Path: + output_folder = self.__output_dir / char_name.replace(' ', '_').lower() + output_folder.mkdir(parents=True, exist_ok=True) + return output_folder + + def __execute_search_with_retries( + self, query: str, char_name: str, output_folder: Path, saved_count: int, + ) -> int: + for attempt in range(settings.image_scraper.retry_attempts): + try: + results = self.__search_engine.search(query) + return self.__download_and_process_images(results, output_folder, saved_count) + except Exception as e: + if isinstance(e, KeyboardInterrupt): + raise + self.__handle_retry_logic(e, attempt, char_name) + return saved_count + + def __handle_retry_logic(self, error: Exception, attempt: int, char_name: str) -> None: + if attempt < settings.image_scraper.retry_attempts - 1: + delay = settings.image_scraper.retry_delay * (2 ** attempt) + self.logger.warning(f'Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {error}') + time.sleep(delay) + else: + self.logger.warning(f'All retry attempts failed for {char_name}: 
{error}') + + def __download_and_process_images( + self, results: Iterator[Dict[str, Any]], output_folder: Path, saved_count: int, + ) -> int: + needed = self.__images_per_character - saved_count + raw, consensus = self.__collect_raw_candidates(results, needed) + if consensus is None: + return saved_count + scored = self.__score_by_consensus(raw, consensus) + return self.__save_best_candidates(scored, output_folder, saved_count) + + def __collect_raw_candidates( + self, results: Iterator[Dict[str, Any]], needed: int, + ) -> Tuple[List[Tuple[np.ndarray, List[Any]]], Optional[np.ndarray]]: + raw: List[Tuple[np.ndarray, List[Any]]] = [] + processed = 0 + + page = self.__browser_context.new_page() + try: + for res in results: + if processed >= self.__max_results: + break + processed += 1 + img_url = res.get('image', '') + try: + img = self.__download_image_via_browser(img_url, page) + if img is None: + continue + h, w = img.shape[:2] + if w < self.__min_width or h < self.__min_height: + continue + faces = self.__face_app.get(img) + if faces: + raw.append((img, list(faces))) + except Exception as e: + self.logger.debug(f'Error processing image {img_url}: {e}') + + consensus = self.__find_confident_consensus(raw, needed) + if consensus is not None and len(self.__score_by_consensus(raw, consensus)) >= needed: + return raw, consensus + finally: + page.close() + + return raw, self.__find_confident_consensus(raw, needed) + + def __find_confident_consensus( + self, candidates: List[Tuple[np.ndarray, List[Any]]], needed: int, + ) -> Optional[np.ndarray]: + if len(candidates) < needed: + return None + + threshold = settings.character.reference_matching_threshold + clusters: List[Tuple[np.ndarray, List[int]]] = [] + + for img_idx, (_, faces) in enumerate(candidates): + for face in faces: + matched = False + for cluster_emb, img_indices in clusters: + if float(np.dot(cluster_emb, face.normed_embedding)) >= threshold: + if img_idx not in img_indices: + img_indices.append(img_idx) 
+ matched = True + break + if not matched: + clusters.append((face.normed_embedding, [img_idx])) + + if not clusters: + return None + + clusters.sort(key=lambda x: len(x[1]), reverse=True) + best_count = len(clusters[0][1]) + second_count = len(clusters[1][1]) if len(clusters) > 1 else 0 + + if best_count > second_count and best_count >= needed: + return clusters[0][0] + return None + + def __score_by_consensus( + self, candidates: List[Tuple[np.ndarray, List[Any]]], consensus: np.ndarray, + ) -> List[Tuple[np.ndarray, float]]: + threshold = settings.character.reference_matching_threshold + scored: List[Tuple[np.ndarray, float]] = [] + for img, faces in candidates: + best_det = max( + ( + f.det_score for f in faces + if float(np.dot(consensus, f.normed_embedding)) >= threshold + ), + default=None, + ) + if best_det is not None: + scored.append((img, float(best_det))) + return scored + + def __save_best_candidates( + self, candidates: List[Tuple[np.ndarray, float]], output_folder: Path, saved_count: int, + ) -> int: + needed = self.__images_per_character - saved_count + best = sorted(candidates, key=lambda x: x[1], reverse=True)[:needed] + for img, _ in best: + cv2.imwrite(str(output_folder / f'{saved_count:02d}.jpg'), img) + saved_count += 1 + return saved_count + + def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np.ndarray]: + try: + response = page.goto( + img_url, + timeout=settings.image_scraper.image_download_timeout, + wait_until='commit', + ) + + if not response or response.status != 200: + return None + + if 'image' not in response.headers.get('content-type', ''): + return None + + return self.__decode_image_bytes(response.body(), img_url) + + except TimeoutError: + self.logger.debug(f'Timeout downloading image {img_url}') + except Exception as e: + msg = str(e) + if 'net::ERR_CONNECTION_CLOSED' in msg or 'Navigation' in msg: + self.logger.debug(f'Connection/navigation error for {img_url}: {msg}') + else: + 
self.logger.debug(f'Failed to download image {img_url}: {msg}') + return None + + def __decode_image_bytes(self, img_bytes: bytes, img_url: str) -> Optional[np.ndarray]: + if not img_bytes: + return None + + try: + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + pil_img = Image.open(io.BytesIO(img_bytes)).convert('RGB') + img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) + except Exception: + self.logger.debug(f'Failed to decode image from {img_url}') + return None + + if len(img.shape) != 3 or img.shape[2] != 3: + self.logger.debug(f'Image has unexpected shape {img.shape} from {img_url}') + return None + + return img + + def __mark_exhausted(self, output_folder: Path, char_name: str) -> None: + exhausted_marker = output_folder / '.exhausted' + exhausted_marker.touch() + self.logger.info(f'{char_name}: marked as exhausted (no images found after search)') + + def __log_final_results(self, char_name: str, saved_count: int) -> None: + if saved_count >= self.__images_per_character: + self.logger.info(f'{char_name}: {saved_count}/{self.__images_per_character} images') + elif saved_count > 0: + self.logger.warning(f'{char_name}: {saved_count}/{self.__images_per_character} images (incomplete)') + else: + self.logger.warning(f'{char_name}: No suitable images found') + + @staticmethod + def __apply_random_delay() -> None: + delay = random.uniform( + settings.image_scraper.request_delay_min, + settings.image_scraper.request_delay_max, + ) + time.sleep(delay) diff --git a/preprocessor/services/core/__init__.py b/preprocessor/services/core/__init__.py new file mode 100644 index 000000000..7370a784f --- /dev/null +++ b/preprocessor/services/core/__init__.py @@ -0,0 +1,7 @@ +from preprocessor.services.core.logging import ( + ErrorHandlingLogger, + LoggerNotFinalizedException, +) +from preprocessor.services.core.time import TimeFormatter + +__all__ = ['ErrorHandlingLogger', 'LoggerNotFinalizedException', 'TimeFormatter'] diff --git 
def __init__(
    self,
    args: Dict[str, Any],
    class_name: str,
    error_exit_code: int,
    loglevel: int = 10,
) -> None:
    """Validate ``args`` and wire up the collaborators shared by all processors.

    Args:
        args: Processor configuration. The keys read here are ``state_manager``,
            ``series_name``, ``path_manager`` and ``progress_tracker``; all of
            them are optional.
        class_name: Name handed to the logger.
        error_exit_code: Exit code the logger reports when errors were logged.
        loglevel: Numeric logging level passed to the logger.
    """
    # Validation runs first so that a subclass can reject bad input before any state is set.
    self._validate_args(args)
    self._args = args

    self.logger = ErrorHandlingLogger(
        class_name=class_name,
        loglevel=loglevel,
        error_exit_code=error_exit_code,
    )

    self.state_manager: Optional[StateManager] = args.get('state_manager')
    self.series_name: str = args.get('series_name', 'unknown')

    # The fallbacks are built unconditionally and only used when the key is absent.
    fallback_paths = PathService(self.series_name)
    self.path_manager: PathService = args.get('path_manager', fallback_paths)

    fallback_tracker = OperationTracker('default', 0, 0.0)
    self.progress = args.get('progress_tracker', fallback_tracker)
def _execute(self) -> None:
    """Run the full pipeline: collect items, drop finished ones, process the rest.

    Prints a notice and returns early when there is nothing to collect or when
    every collected item was filtered out as already processed. ``_finalize`` is
    only reached when at least one item was handed to processing.
    """
    candidates = self._get_processing_items()
    if not candidates:
        console.print('[yellow]No items to process[/yellow]')
        return

    outcome = self.__filter_skipped_items(candidates)
    pending = outcome.items_to_process

    if pending:
        self.__display_processing_summary(outcome)
        self.__execute_processing(pending)
        self._finalize()
        return

    console.print(
        f'[yellow]All items already processed '
        f'({outcome.total_items} total, {outcome.skipped_count} skipped)[/yellow]',
    )
def __should_skip_item(
    self, item: ProcessingItem,
) -> Tuple[bool, List[OutputSpec], str]:
    """Decide whether ``item`` can be skipped by comparing output files with saved state.

    Returns:
        ``(skip, missing_outputs, message)``. ``skip`` is true only when every
        expected output is present; ``missing_outputs`` is filled only when the
        item must be processed; ``message`` is a console line for skipped items
        and empty otherwise.
    """
    expected = self._get_expected_outputs(item)
    if not expected:
        # Nothing is declared, so there is nothing to check against: always process.
        return False, [], ''

    missing = self.__get_missing_outputs(expected)
    step = self.__get_step_name()
    episode = item.episode_id
    recorded_done = self.__is_step_completed_in_state(step, episode)

    if missing:
        if recorded_done:
            # State and disk disagree; the files win and the item is processed again.
            console.print(
                f'[yellow]Warning: State marked complete but outputs missing '
                f'for {item.episode_id}[/yellow]',
            )
        return False, missing, ''

    if not recorded_done:
        # Files are complete but state lags behind: record completion, then skip.
        self.__sync_state_completed(step, episode)
        return True, [], f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]'

    return True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]'
class Environment:
    """Answers whether the process runs inside Docker, remembering the first answer."""

    # None means "not evaluated yet"; afterwards it holds the remembered result.
    __is_docker_cached: Optional[bool] = None

    @staticmethod
    def is_docker() -> bool:
        """Return True when ``DOCKER_CONTAINER`` equals ``true`` (case-insensitive).

        The environment variable is read only on the first call after start-up
        or after ``reset_cache``; later calls return the remembered value.
        """
        remembered = Environment.__is_docker_cached
        if remembered is not None:
            return remembered

        raw_flag = os.environ.get('DOCKER_CONTAINER', 'false')
        detected = raw_flag.lower() == 'true'
        Environment.__is_docker_cached = detected
        return detected

    @staticmethod
    def reset_cache() -> None:
        """Forget the remembered answer so the next ``is_docker`` call re-reads the variable."""
        Environment.__is_docker_cached = None
class TimeFormatter:
    """Formats durations given in seconds as short display strings."""

    @staticmethod
    def format_hms(seconds: float) -> str:
        """Return ``seconds`` as ``H:MM:SS``; the fractional part is truncated."""
        hours = int(seconds // 3600)
        minutes = int(seconds % 3600 // 60)
        secs = int(seconds % 60)
        return ':'.join((str(hours), f'{minutes:02d}', f'{secs:02d}'))

    @staticmethod
    def format_human(seconds: float) -> str:
        """Return a compact label: ``12.3s``, ``4m 5s`` or ``1h 2m 3s``.

        Values below one minute keep one decimal place; longer durations are
        truncated to whole seconds.
        """
        if seconds < 60:
            return f'{seconds:.1f}s'

        total_minutes = int(seconds // 60)
        secs = int(seconds % 60)
        hours, minutes = divmod(total_minutes, 60)

        # total_minutes is at least 1 here, so "no full hour" means under 60 minutes.
        if hours == 0:
            return f'{total_minutes}m {secs}s'
        return f'{hours}h {minutes}m {secs}s'
def get_all_episodes(self) -> List[EpisodeInfo]:
    """List every episode found in the loaded episodes data.

    Within each season the entries are ordered by their episode-number field
    (missing numbers sort as 0) and then numbered 1, 2, 3, ... by position, so
    ``relative_episode`` reflects the position in that ordering.

    Returns:
        One ``EpisodeInfo`` per entry, seasons in file order; an empty list when
        no episodes data is loaded.
    """
    data = self.__episodes_data
    if not data:
        return []

    collected: List[EpisodeInfo] = []
    for season_entry in data.get(EpisodesDataKeys.SEASONS, []):
        season_number = season_entry.get(EpisodesDataKeys.SEASON_NUMBER, 0)

        ordered = list(season_entry.get(EpisodesDataKeys.EPISODES, []))
        ordered.sort(key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0))

        collected.extend(
            self.__create_episode_info(
                season=season_number,
                relative_episode=position,
                title=entry.get(EpisodeMetadataKeys.TITLE),
                premiere_date=entry.get(EpisodeMetadataKeys.PREMIERE_DATE),
                viewership=entry.get(EpisodeMetadataKeys.VIEWERSHIP),
            )
            for position, entry in enumerate(ordered, start=1)
        )
    return collected
@staticmethod
def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]:
    """Flatten an episode description into a plain metadata dictionary.

    Returns:
        A dict with the keys ``season``, ``episode_number`` (taken from
        ``relative_episode``), ``title``, ``premiere_date`` and ``viewership``,
        in that order.
    """
    # (output key, attribute read from episode_info)
    exported = (
        ('season', 'season'),
        ('episode_number', 'relative_episode'),
        ('title', 'title'),
        ('premiere_date', 'premiere_date'),
        ('viewership', 'viewership'),
    )
    return {key: getattr(episode_info, attribute) for key, attribute in exported}
@dataclass
class EpisodeInfo:
    """Identity and descriptive metadata of a single episode."""

    absolute_episode: int
    relative_episode: int  # episode number within its season
    season: int
    title: str
    premiere_date: Optional[str] = None
    series_name: Optional[str] = None
    viewership: Optional[str] = None

    def episode_code(self) -> str:
        """Return the combined code, e.g. ``S01E05`` (both parts zero-padded to two digits)."""
        return 'S{:02d}E{:02d}'.format(self.season, self.relative_episode)

    def episode_num(self) -> str:
        """Return only the episode part, e.g. ``E05``."""
        return 'E{:02d}'.format(self.relative_episode)

    def season_code(self) -> str:
        """Return only the season part, e.g. ``S01``."""
        return 'S{:02d}'.format(self.season)

    def is_special(self) -> bool:
        """Return True when the episode is filed under season 0."""
        return self.season == 0
class FileOperations:
    """JSON read/write helpers."""

    @staticmethod
    def atomic_write_json(path: Path, data: Dict[str, Any], indent: int = 2) -> None:
        """Serialize ``data`` as UTF-8 JSON to ``path`` by way of ``StepTempFile``.

        The JSON is written to the temporary path yielded by ``StepTempFile``;
        non-ASCII characters are written as-is rather than escaped.
        NOTE(review): presumably ``StepTempFile`` moves the temp file onto ``path``
        on successful exit — confirm in ``preprocessor.core.temp_files``.
        """
        with StepTempFile(path) as staging_path, open(staging_path, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, ensure_ascii=False, indent=indent)

    @staticmethod
    def load_json(path: Path) -> Dict[str, Any]:
        """Read ``path`` as UTF-8 and return the parsed JSON document."""
        with open(path, 'r', encoding='utf-8') as handle:
            document = json.load(handle)
        return document
class PathService:
    """Builds file names and output locations for the episodes of one series."""

    def __init__(self, series_name: str) -> None:
        # The series name is stored lower-cased and used in that form everywhere below.
        self.__series_name = series_name.lower()

    def build_filename(
        self,
        episode_info: EpisodeInfo,
        extension: str = 'json',
        suffix: Optional[str] = None,
    ) -> str:
        """Return ``<series>_<episode code>[_<suffix>].<extension>``.

        An empty or ``None`` suffix is left out together with its underscore.
        """
        stem = f'{self.__series_name}_{episode_info.episode_code()}'
        tail = '' if not suffix else f'_{suffix}'
        return f'{stem}{tail}.{extension}'

    def get_episode_dir(self, episode_info: EpisodeInfo, subdir: str) -> Path:
        """Return ``<base>/<subdir>/<season code>/<episode number>``."""
        return self.__season_dir(episode_info, subdir) / episode_info.episode_num()

    def get_episode_dir_by_code(self, episode_info: EpisodeInfo, subdir: str) -> Path:
        """Return ``<base>/<subdir>/<season code>/<episode code>``."""
        return self.__season_dir(episode_info, subdir) / episode_info.episode_code()

    def get_episode_file_path(self, episode_info: EpisodeInfo, subdir: str, extension: str = 'json') -> Path:
        """Return ``<base>/<subdir>/<season code>/<episode code>.<extension>``."""
        file_name = f'{episode_info.episode_code()}.{extension}'
        return self.__season_dir(episode_info, subdir) / file_name

    def __season_dir(self, episode_info: EpisodeInfo, subdir: str) -> Path:
        # Shared prefix of all episode locations: series base dir, step subdir, season code.
        series_root = get_base_output_dir(self.__series_name)
        return series_root / subdir / episode_info.season_code()

    @staticmethod
    def get_input_base() -> Path:
        """Return the input root: ``/input_data`` in Docker, a repo-relative path otherwise."""
        return Path('/input_data') if Environment.is_docker() else Path('preprocessor/input_data')

    @staticmethod
    def get_output_base() -> Path:
        """Return the output root: ``/app/output_data`` in Docker, a repo-relative path otherwise."""
        return Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data')
@staticmethod
def get_framerate(probe_data: Dict[str, Any]) -> float:
    """Return the frame rate of the first video stream in ``probe_data``.

    Args:
        probe_data: Parsed ffprobe JSON as returned by ``probe_video``.

    Returns:
        The ``r_frame_rate`` value converted to frames per second.

    Raises:
        ValueError: If there is no video stream, the stream carries no
            ``r_frame_rate``, or the value is not a usable ``num/den`` rational
            (including a zero denominator such as ``0/0``).
    """
    stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video')
    if not stream:
        raise ValueError('No video streams found')

    r_frame_rate = stream.get('r_frame_rate')
    if not r_frame_rate:
        raise ValueError('Frame rate not found')

    # The value is a "num/den" rational; a bare integer is accepted as "num/1".
    num_text, _, denom_text = str(r_frame_rate).partition('/')
    num = int(num_text)
    denom = int(denom_text) if denom_text else 1

    # Report a zero denominator with the same exception type as the other failures
    # instead of letting the division raise ZeroDivisionError.
    if denom == 0:
        raise ValueError(f'Invalid frame rate: {r_frame_rate}')

    return num / denom
@staticmethod
def probe_video(video_path: Path) -> Dict[str, Any]:
    """Run ffprobe on ``video_path`` and return its stream and format data.

    Args:
        video_path: File to inspect.

    Returns:
        The parsed JSON document printed by ffprobe
        (``-show_streams -show_format -of json``).

    Raises:
        subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
        json.JSONDecodeError: If the output is not valid JSON.
    """
    cmd = [
        'ffprobe', '-v', 'error', '-show_streams', '-show_format',
        '-of', 'json', str(video_path),
    ]
    # Decode explicitly as UTF-8 (as detect_interlacing does) instead of relying on the
    # locale encoding, so non-ASCII tag values survive on hosts with another default.
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        encoding='utf-8',
        errors='replace',
        check=True,
    )
    return json.loads(result.stdout)
@staticmethod
def extract_audio(
    video_path: Path,
    output_path: Path,
    audio_stream_index: Optional[int] = None,
    codec: str = 'pcm_s16le',
    sample_rate: int = 48000,
    channels: int = 1,
) -> None:
    """Extract audio from ``video_path`` into ``output_path`` with ffmpeg.

    Args:
        video_path: Source media file.
        output_path: Destination file; an existing file is overwritten (``-y``).
        audio_stream_index: Stream to select via ``-map 0:<index>``; when ``None``
            no ``-map`` option is passed.
        codec: Value for ``-acodec``.
        sample_rate: Value for ``-ar``.
        channels: Value for ``-ac``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    stream_selection = (
        [] if audio_stream_index is None else ['-map', f'0:{audio_stream_index}']
    )
    command = [
        'ffmpeg', '-y', '-i', str(video_path),
        *stream_selection,
        '-acodec', codec,
        '-ar', str(sample_rate),
        '-ac', str(channels),
        str(output_path),
    ]

    # ffmpeg's own output is discarded; failure surfaces only through the exit status.
    subprocess.run(
        command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
    )
@staticmethod
def __log_ffmpeg_command(command: List[str]) -> str:
    """Render ``command`` as a single shell-safe line for logging.

    Arguments containing spaces, quotes or other shell metacharacters are quoted,
    so the returned line can be pasted into a POSIX shell and reproduce the same
    argument list.
    """
    import shlex  # local import: only this helper needs it

    return shlex.join(command)
'-aq-strength', FFmpegWrapper.__ADAPTIVE_QUANTIZATION_STRENGTH, + ]) + + params.extend([ + '-strict_gop', '1', + '-forced-idr', '1', + '-no-scenecut', '1', + ]) + + return params + + @staticmethod + def __build_video_filter( + width: int, height: int, deinterlace: bool = False, is_upscaling: bool = False, + ) -> str: + filters = [] + + if deinterlace: + filters.append('bwdif=mode=0:parity=-1:deint=1') + filters.append('setfield=prog') + + scaler_flags = 'lanczos' if is_upscaling else 'bicubic' + + filters.append( + f"scale='iw*sar:ih',scale={width}:{height}:" + f"force_original_aspect_ratio=decrease:flags={scaler_flags}," + f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,setsar=1", + ) + + return ','.join(filters) + + @staticmethod + def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]: + streams = [s for s in probe_data.get('streams', []) if s.get('codec_type') == codec_type] + return streams[0] if streams else None + + @staticmethod + def __parse_idet_output(stderr: str) -> Optional[Dict[str, Union[int, float]]]: + matches = re.findall( + r'Multi frame detection:\s+TFF:\s*(\d+)\s+BFF:\s*(\d+)\s+Progressive:\s*(\d+)', + stderr, + ) + + if not matches: + return None + + tff, bff, progressive = matches[-1] + + return { + 'tff': int(tff), + 'bff': int(bff), + 'progressive': int(progressive), + } diff --git a/preprocessor/services/media/resolution.py b/preprocessor/services/media/resolution.py new file mode 100644 index 000000000..2f92eb0f9 --- /dev/null +++ b/preprocessor/services/media/resolution.py @@ -0,0 +1,38 @@ +from enum import Enum +from typing import ( + List, + Type, + TypeVar, +) + +T = TypeVar('T', bound='Resolution') + + +class Resolution(Enum): + R144P = (256, 144) + R240P = (426, 240) + R360P = (640, 360) + R480P = (854, 480) + R720P = (1280, 720) + R1080P = (1920, 1080) + R1440P = (2560, 1440) + R2160P = (3840, 2160) + R4320P = (7680, 4320) + + def __init__(self, width: int, height: int) -> None: + 
self.width = width + self.height = height + + def __str__(self) -> str: + return f'{self.height}p' + + @classmethod + def from_string(cls: Type[T], init: str) -> T: + clean_init = init.strip().upper() + if not clean_init[0].isalpha(): + clean_init = f'R{clean_init}' + return cls[clean_init] + + @classmethod + def get_all_choices(cls) -> List[str]: + return [str(r) for r in cls] diff --git a/preprocessor/services/media/scene_detection.py b/preprocessor/services/media/scene_detection.py new file mode 100644 index 000000000..91f17b669 --- /dev/null +++ b/preprocessor/services/media/scene_detection.py @@ -0,0 +1,128 @@ +import gc +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +import decord +import numpy as np +import torch +from transnetv2_pytorch import TransNetV2 + + +class TransNetWrapper: + def __init__(self) -> None: + self.__model: Optional[TransNetV2] = None + + def load_model(self) -> None: + if not torch.cuda.is_available(): + raise RuntimeError('CUDA not available for TransNetV2.') + self.__model = TransNetV2().cuda() + + def cleanup(self) -> None: + if self.__model is not None: + del self.__model + self.__model = None + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def detect_scenes( + self, + video_path: Path, + threshold: float = 0.5, + min_scene_len: int = 10, + ) -> List[Dict[str, Any]]: + if self.__model is None: + raise RuntimeError('Model not loaded. 
Call load_model() first.') + + video_info = self.get_video_info(video_path) + if not video_info: + raise RuntimeError(f'Failed to get video info for {video_path}') + + try: + _, single_frame_predictions, _ = self.__model.predict_video(str(video_path)) + scene_changes = np.where(single_frame_predictions > threshold)[0] + + return self.__build_scenes_from_predictions( + scene_changes, + video_info, + min_scene_len, + ) + except Exception as e: + raise RuntimeError(f'TransNetV2 detection failed: {e}') from e + + def __build_scenes_from_predictions( + self, + scene_changes: np.ndarray, + video_info: Dict[str, Any], + min_scene_len: int, + ) -> List[Dict[str, Any]]: + scenes: List[Dict[str, Any]] = [] + fps = video_info['fps'] + prev_frame = 0 + + for frame_num in scene_changes: + if frame_num - prev_frame < min_scene_len: + continue + + scenes.append(self.__create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps)) + prev_frame = frame_num + + total_frames = video_info['total_frames'] + if total_frames - prev_frame > min_scene_len: + scenes.append(self.__create_scene_dict(len(scenes) + 1, prev_frame, total_frames, fps)) + + return scenes + + def __create_scene_dict( + self, + scene_number: int, + start_frame: int, + end_frame: int, + fps: float, + ) -> Dict[str, Any]: + return { + 'scene_number': scene_number, + 'start': { + 'frame': int(start_frame), + 'seconds': float(start_frame / fps), + 'timecode': self.__frame_to_timecode(start_frame, fps), + }, + 'end': { + 'frame': int(end_frame), + 'seconds': float(end_frame / fps), + 'timecode': self.__frame_to_timecode(end_frame, fps), + }, + 'duration': float((end_frame - start_frame) / fps), + 'frame_count': int(end_frame - start_frame), + } + + @staticmethod + def __frame_to_timecode(frame: int, fps: float) -> str: + seconds = frame / fps + hours = int(seconds // 3600) + minutes = int(seconds % 3600 // 60) + secs = int(seconds % 60) + frames = int((seconds % 1) * fps) + return 
f'{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}' + + @staticmethod + def get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: + try: + vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) + fps = vr.get_avg_fps() + total_frames = len(vr) + duration = total_frames / fps if fps > 0 else 0 + + return { + 'fps': fps, + 'duration': duration, + 'total_frames': total_frames, + } + except Exception: + return None diff --git a/preprocessor/services/media/transcode_params.py b/preprocessor/services/media/transcode_params.py new file mode 100644 index 000000000..3c4d1b412 --- /dev/null +++ b/preprocessor/services/media/transcode_params.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import ( + Optional, + Tuple, +) + + +@dataclass(frozen=True) +class TranscodeParams: + input_path: Path + output_path: Path + codec: str + preset: str + resolution: str + video_bitrate: str + minrate: str + maxrate: str + bufsize: str + audio_bitrate: str + gop_size: int + target_fps: Optional[float] = None + deinterlace: bool = False + is_upscaling: bool = False + log_command: bool = False + + def get_resolution_tuple(self) -> Tuple[int, int]: + try: + width, height = [int(x) for x in self.resolution.split(':')] + return width, height + except (ValueError, AttributeError) as e: + raise ValueError( + f"Invalid resolution format: '{self.resolution}'. 
Expected format 'WIDTH:HEIGHT'.", + ) from e diff --git a/preprocessor/services/scraping/__init__.py b/preprocessor/services/scraping/__init__.py new file mode 100644 index 000000000..2abe08a3a --- /dev/null +++ b/preprocessor/services/scraping/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.services.scraping.base_scraper import BaseScraper +from preprocessor.services.scraping.character_scraper import CharacterScraper +from preprocessor.services.scraping.episode_scraper import EpisodeScraper +from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor + +__all__ = ['BaseScraper', 'CharacterReferenceProcessor', 'CharacterScraper', 'EpisodeScraper'] diff --git a/preprocessor/services/scraping/base_scraper.py b/preprocessor/services/scraping/base_scraper.py new file mode 100644 index 000000000..2d25e0524 --- /dev/null +++ b/preprocessor/services/scraping/base_scraper.py @@ -0,0 +1,124 @@ +from abc import abstractmethod +import json +import logging +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.enums import ( + ParserMode, + ScraperMethod, +) +from preprocessor.config.settings_instance import settings +from preprocessor.services.ai import LLMProvider +from preprocessor.services.core.base_processor import ( + BaseProcessor, + OutputSpec, + ProcessingItem, +) +from preprocessor.services.scraping.clipboard import ScraperClipboard +from preprocessor.services.scraping.crawl4ai import ScraperCrawl4AI +from preprocessor.services.ui.console import console + + +class BaseScraper(BaseProcessor): + def __init__(self, args: Dict[str, Any], error_exit_code: int = 7) -> None: + super().__init__( + args=args, + class_name=self.__class__.__name__, + error_exit_code=error_exit_code, + loglevel=logging.DEBUG, + ) + self.__urls: List[str] = self._args['urls'] + self.__output_file: Path = self._args['output_file'] + self.__headless: bool = self._args.get('headless', True) + 
self.__scraper_method = ScraperMethod(self._args.get('scraper_method', 'crawl4ai')) + self.__parser_mode = ParserMode(self._args.get('parser_mode', 'normal')) + self.__llm: Optional[LLMProvider] = None + + @property + def output_file(self) -> Path: + return self.__output_file + + @property + def llm(self) -> LLMProvider: + if self.__llm is None: + raise RuntimeError("LLMProvider not initialized. Call _execute first.") + return self.__llm + + def _execute(self) -> None: + self.__llm = LLMProvider(parser_mode=self.__parser_mode) + console.print(f'[blue]Scraping {len(self.__urls)} URLs...[/blue]') + + scraped_pages = self.__scrape_all_urls() + if not scraped_pages: + console.print('[yellow]No pages scraped[/yellow]') + return + + console.print(f'[blue]Scraped {len(scraped_pages)} pages, processing with LLM...[/blue]') + try: + self._process_scraped_pages(scraped_pages) + except Exception as e: + self.logger.error(f'LLM processing failed: {e}') + + @abstractmethod + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: + pass + + def _save_result(self, result: Dict[str, Any]) -> None: + self.__output_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.__output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2, ensure_ascii=False) + + def __scrape_all_urls(self) -> List[Dict[str, Any]]: + results = [] + for i, url in enumerate(self.__urls, 1): + console.print(f'[cyan]Fetching page {i}/{len(self.__urls)}[/cyan]') + try: + content = self.__run_scraper(url) + if content: + results.append({'url': url, 'markdown': content}) + console.print(f'[green]Success[/green] {url}: {len(content)} chars') + else: + self.logger.error(f'Failed to scrape {url}') + except Exception as e: + self.logger.error(f'Error scraping {url}: {e}') + return results + + def __run_scraper(self, url: str) -> Optional[str]: + if self.__scraper_method == ScraperMethod.CLIPBOARD: + return ScraperClipboard.scrape(url, headless=self.__headless, 
logger=self.logger) + + if self.__scraper_method == ScraperMethod.CRAWL4AI: + return ScraperCrawl4AI.scrape( + url, + save_markdown=True, + output_dir=settings.scraper.get_output_dir(self.series_name), + logger=self.logger, + ) + + return None + + def get_output_subdir(self) -> str: + return 'scraper' + + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + return [] + + def _get_processing_items(self) -> List[ProcessingItem]: + return [] + + def _process_item( + self, item: ProcessingItem, missing_outputs: List[OutputSpec], + ) -> None: + pass + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'urls' not in args: + raise ValueError("Missing required argument: 'urls'") + if 'output_file' not in args: + raise ValueError("Missing required argument: 'output_file'") diff --git a/preprocessor/services/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py new file mode 100644 index 000000000..01f9ffbd0 --- /dev/null +++ b/preprocessor/services/scraping/base_scraper_step.py @@ -0,0 +1,71 @@ +from abc import ( + ABC, + abstractmethod, +) +from pathlib import Path +from typing import ( + Any, + Dict, + Optional, + Type, + TypeVar, +) + +from pydantic import BaseModel + +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext + +ConfigT = TypeVar("ConfigT", bound=BaseModel) + + +class BaseScraperStep(PipelineStep[SourceVideo, SourceVideo, ConfigT], ABC): + @property + def is_global(self) -> bool: + return True + + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> SourceVideo: + raise NotImplementedError("BaseScraperStep uses execute() instead of _process()") + + def execute(self, input_data: SourceVideo, context: ExecutionContext) -> Optional[SourceVideo]: + output_path = self.__resolve_output_path(context) + + if 
output_path.exists() and not context.force_rerun: + context.logger.info(f"{self._get_metadata_type_name()} metadata already exists.") + return input_data + + context.logger.info(f"Scraping {self._get_metadata_type_name().lower()} from {len(self.config.urls)} URLs") + + scraper = self._get_scraper_class()(self._build_scraper_args(output_path, context)) + exit_code = scraper.work() + + if exit_code != 0: + raise RuntimeError(f"{self._get_metadata_type_name()} scraper failed with code {exit_code}") + + context.logger.info(f"{self._get_metadata_type_name()} metadata saved to: {output_path}") + return input_data + + def __resolve_output_path(self, context: ExecutionContext) -> Path: + metadata_type = self._get_metadata_type_name().lower() + output_dir = get_base_output_dir(context.series_name) + return output_dir / f"{context.series_name}_{metadata_type}.json" + + @abstractmethod + def _get_scraper_class(self) -> Type: + pass + + @abstractmethod + def _get_metadata_type_name(self) -> str: + pass + + def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> Dict[str, Any]: + return { + "urls": self.config.urls, + "output_file": output_path, + "headless": self.config.headless, + "scraper_method": self.config.scraper_method, + "parser_mode": self.config.parser_mode, + "series_name": context.series_name, + } diff --git a/preprocessor/services/scraping/character_scraper.py b/preprocessor/services/scraping/character_scraper.py new file mode 100644 index 000000000..6d89ed38c --- /dev/null +++ b/preprocessor/services/scraping/character_scraper.py @@ -0,0 +1,29 @@ +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.services.scraping.base_scraper import BaseScraper +from preprocessor.services.ui.console import console + + +class CharacterScraper(BaseScraper): + def __init__(self, args: Dict[str, Any]) -> None: + super().__init__(args) + self.__series_name: str = self._args.get('series_name', '') + + def _process_scraped_pages(self, 
scraped_pages: List[Dict[str, Any]]) -> None: + characters = self.llm.extract_characters(scraped_pages, self.__series_name) + + if not characters: + self.logger.error('LLM failed to extract any character data') + return + + payload = { + 'sources': [p['url'] for p in scraped_pages], + 'characters': [c.model_dump() for c in characters], + } + + self._save_result(payload) + console.print(f'[green]Extracted {len(characters)} characters. Saved to: {self.output_file}[/green]') diff --git a/preprocessor/services/scraping/clipboard.py b/preprocessor/services/scraping/clipboard.py new file mode 100644 index 000000000..ab0c4ae15 --- /dev/null +++ b/preprocessor/services/scraping/clipboard.py @@ -0,0 +1,30 @@ +from typing import Optional + +from patchright.sync_api import sync_playwright + +from preprocessor.services.core.logging import ErrorHandlingLogger + + +class ScraperClipboard: + __BROWSER_ARGS = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] + + @staticmethod + def scrape(url: str, headless: bool = True, logger: Optional[ErrorHandlingLogger] = None) -> Optional[str]: + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=headless, args=ScraperClipboard.__BROWSER_ARGS) + context = browser.new_context() + page = context.new_page() + + page.goto(url, wait_until='networkidle', timeout=30000) + + page.keyboard.press('Control+A') + page.keyboard.press('Control+C') + + content = page.evaluate('navigator.clipboard.readText()') + browser.close() + return content + except Exception as e: + if logger: + logger.error(f'Clipboard scraping failed for {url}: {e}') + return None diff --git a/preprocessor/services/scraping/crawl4ai.py b/preprocessor/services/scraping/crawl4ai.py new file mode 100644 index 000000000..4f1162e3b --- /dev/null +++ b/preprocessor/services/scraping/crawl4ai.py @@ -0,0 +1,65 @@ +import asyncio +from pathlib import Path +from typing import Optional + +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import 
( + BrowserConfig, + CrawlerRunConfig, +) +from pathvalidate import sanitize_filename +import ua_generator + +from preprocessor.services.core.logging import ErrorHandlingLogger + + +class ScraperCrawl4AI: + @staticmethod + def scrape( + url: str, + save_markdown: bool = False, + output_dir: Optional[Path] = None, + logger: Optional[ErrorHandlingLogger] = None, + ) -> Optional[str]: + return asyncio.run(ScraperCrawl4AI.__scrape_async(url, save_markdown, output_dir, logger)) + + @staticmethod + async def __scrape_async( + url: str, + save_markdown: bool, + output_dir: Optional[Path], + logger: Optional[ErrorHandlingLogger], + ) -> Optional[str]: + try: + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + viewport_width=1920, + viewport_height=1080, + user_agent=str(ua_generator.generate()), + ) + run_config = CrawlerRunConfig(wait_until='networkidle', page_timeout=60000, delay_before_return_html=2.0) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=url, config=run_config) + if result.success: + if save_markdown and output_dir: + ScraperCrawl4AI.__persist_markdown(result.markdown, url, output_dir, logger) + return result.markdown + + if logger: + logger.error(f'Crawl4AI failed for {url}: {result.error_message}') + except Exception as e: + if logger: + logger.error(f'Crawl4AI exception: {e}') + return None + + @staticmethod + def __persist_markdown(content: str, url: str, output_dir: Path, logger: Optional[ErrorHandlingLogger]) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + safe_name = sanitize_filename(url.replace('://', '_').replace('/', '_')) + path = output_dir / f'{safe_name}.md' + + path.write_text(content, encoding='utf-8') + if logger: + logger.info(f'Saved markdown: {path}') diff --git a/preprocessor/services/scraping/episode_scraper.py b/preprocessor/services/scraping/episode_scraper.py new file mode 100644 index 000000000..c5ea22c1b --- /dev/null +++ 
b/preprocessor/services/scraping/episode_scraper.py @@ -0,0 +1,90 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from preprocessor.services.scraping.base_scraper import BaseScraper +from preprocessor.services.ui.console import console + + +class EpisodeScraper(BaseScraper): + def __init__(self, args: Dict[str, Any]) -> None: + super().__init__(args) + self.__expected_episodes_count: Optional[int] = self._args.get('expected_episodes_count') + self.__videos_dir: Optional[Path] = self._args.get('videos_dir') + + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: + all_seasons = self.llm.extract_all_seasons(scraped_pages) + if not all_seasons: + self.logger.error('LLM failed to extract any season data') + return + + result = { + 'sources': [item['url'] for item in scraped_pages], + 'seasons': [season.model_dump() for season in all_seasons], + } + self._save_result(result) + + total_episodes = sum(len(season.episodes) for season in all_seasons) + console.print(f'[green]Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]') + console.print(f'[green]Saved to: {self.output_file}[/green]') + + self.__validate_and_report_coverage(total_episodes) + + def __validate_and_report_coverage(self, scraped_count: int) -> None: + expected_count = self.__get_expected_episodes_count() + if expected_count is None: + self.__print_no_validation_warning(scraped_count) + return + + status, message = self.__evaluate_coverage_status(scraped_count, expected_count) + self.__print_coverage_report(scraped_count, expected_count, status, message) + + def __get_expected_episodes_count(self) -> Optional[int]: + if self.__expected_episodes_count is not None: + return self.__expected_episodes_count + if self.__videos_dir and self.__videos_dir.exists(): + return self.__count_video_files(self.__videos_dir) + return None + + def __count_video_files(self, directory: Path) -> int: + count = 0 + for ext in 
self.SUPPORTED_VIDEO_EXTENSIONS: + count += len(list(directory.rglob(f'*{ext}'))) + return count + + @staticmethod + def __evaluate_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: + if scraped < expected: + return 'missing', f'Missing {expected - scraped} episodes' + if scraped > expected: + return 'extra', f'Scraped {scraped - expected} more episodes than video files' + return 'perfect', 'Perfect coverage' + + @staticmethod + def __print_coverage_report(scraped: int, expected: int, status: str, message: str) -> None: + coverage_pct = (scraped / expected * 100) if expected > 0 else 0 + console.print('\n[yellow]Episode coverage validation:[/yellow]') + console.print(f' [cyan]Scraped episodes: {scraped}[/cyan]') + console.print(f' [cyan]Video files found: {expected}[/cyan]') + console.print(f' [cyan]Coverage: {coverage_pct:.1f}%[/cyan]') + + if status == 'missing': + console.print(f'\n[red]WARNING: {message}![/red]') + console.print(' [yellow]Consider adding more URLs to --scrape-urls[/yellow]') + elif status == 'extra': + console.print(f'\n[yellow]Note: {message}[/yellow]') + console.print(' [dim]This is OK if you plan to add more videos later[/dim]\n') + else: + console.print('\n[green]Perfect coverage - all video files have metadata![/green]\n') + + @staticmethod + def __print_no_validation_warning(scraped_count: int) -> None: + console.print('\n[yellow]Coverage validation:[/yellow]') + console.print(f' [cyan]Scraped episodes: {scraped_count}[/cyan]') + console.print(' [yellow]No video directory provided - unable to validate coverage[/yellow]') + console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]\n') diff --git a/preprocessor/services/scraping/grid_visualizer.py b/preprocessor/services/scraping/grid_visualizer.py new file mode 100644 index 000000000..a927bd44a --- /dev/null +++ b/preprocessor/services/scraping/grid_visualizer.py @@ -0,0 +1,336 @@ +from dataclasses import dataclass +from datetime import datetime 
+import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) +import unicodedata + +import cv2 +import numpy as np + +from preprocessor.config.settings_instance import settings + + +@dataclass(frozen=True) +class GridDimensions: + face_size: int = 280 + faces_per_char: int = 3 + footer_height: int = 80 + header_height: int = 180 + header_row_height: int = 40 + label_col_width: int = 350 + padding: int = 15 + stats_col_width: int = 200 + + @property + def face_col_width(self) -> int: + return self.face_size + self.padding + + @property + def row_height(self) -> int: + return self.face_size + self.padding * 2 + + def total_height(self, num_chars: int) -> int: + return self.header_height + (num_chars * self.row_height) + self.footer_height + + def total_width(self) -> int: + return ( + self.label_col_width + + self.stats_col_width + + (self.faces_per_char * self.face_col_width) + + (self.padding * 2) + ) + + +class CharacterGridVisualizer: + def __init__( + self, + dimensions: Optional[GridDimensions] = None, + similarity_threshold: float = 0.5, + ) -> None: + self.__dims = dimensions or GridDimensions() + self.__similarity_threshold = similarity_threshold + + def generate_grid( + self, + processed_chars_dir: Path, + output_path: Path, + ) -> Dict[str, Any]: + processed_chars = self.__get_processed_characters(processed_chars_dir) + + if not processed_chars: + return self.__empty_result() + + metadata_all = self.__load_all_metadata(processed_chars) + avg_similarity = self.__calculate_avg_similarity(metadata_all) + + canvas = self.__create_canvas(len(processed_chars)) + self.__render_header(canvas, len(processed_chars), avg_similarity) + self.__render_table_headers(canvas) + self.__render_character_rows(canvas, processed_chars) + self.__render_footer(canvas) + + self.__save_grid_image(canvas, output_path) + + return { + 'width': self.__dims.total_width(), + 'height': self.__dims.total_height(len(processed_chars)), + 
'num_chars': len(processed_chars), + 'avg_similarity': avg_similarity, + } + + @staticmethod + def __empty_result() -> Dict[str, Any]: + return { + 'width': 0, + 'height': 0, + 'num_chars': 0, + 'avg_similarity': 0.0, + } + + @staticmethod + def __get_processed_characters(dir_path: Path) -> List[Path]: + return sorted([d for d in dir_path.iterdir() if d.is_dir()]) + + def __create_canvas(self, num_chars: int) -> np.ndarray: + grid_width = self.__dims.total_width() + grid_height = self.__dims.total_height(num_chars) + bg_color = (250, 252, 255) + return np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) + + def __render_header(self, canvas: np.ndarray, total_chars: int, avg_similarity: float) -> None: + header_bg_color = (45, 55, 72) + cv2.rectangle( + canvas, + (0, 0), + (self.__dims.total_width(), self.__dims.header_height), + header_bg_color, + -1, + ) + + title_pos = (self.__dims.padding * 3, 50) + cv2.putText( + canvas, 'FACIAL REFERENCE VALIDATION REPORT', title_pos, + cv2.FONT_HERSHEY_DUPLEX, 1.1, (255, 255, 255), 2, cv2.LINE_AA, + ) + + subtitle_pos = (self.__dims.padding * 3, 85) + cv2.putText( + canvas, 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis', + subtitle_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.55, (200, 210, 220), 1, cv2.LINE_AA, + ) + + stats_y = 115 + stats_items = [ + f'Total Subjects: {total_chars}', + f'Avg Similarity: {avg_similarity:.4f}', + f'Threshold: {self.__similarity_threshold:.2f}', + ] + + for idx, stat in enumerate(stats_items): + x_pos = self.__dims.padding * 3 + idx * 280 + cv2.putText( + canvas, stat, (x_pos, stats_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (180, 200, 220), 1, cv2.LINE_AA, + ) + + def __render_table_headers(self, canvas: np.ndarray) -> None: + table_header_y = self.__dims.header_height + 1 + cv2.line( + canvas, (0, table_header_y), (self.__dims.total_width(), table_header_y), + (180, 190, 200), 2, + ) + + base_stats_x = self.__dims.label_col_width + base_face_x = base_stats_x + 
self.__dims.stats_col_width + half_face_col = self.__dims.face_col_width // 2 + + col_headers = [ + ('CHARACTER NAME', self.__dims.label_col_width // 2), + ('STATISTICS', base_stats_x + self.__dims.stats_col_width // 2), + ('REFERENCE IMAGE 1', base_face_x + half_face_col), + ('REFERENCE IMAGE 2', base_face_x + self.__dims.face_col_width + half_face_col), + ('REFERENCE IMAGE 3', base_face_x + (self.__dims.face_col_width * 2) + half_face_col), + ] + + for text, x_center in col_headers: + text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] + text_x = x_center - text_size[0] // 2 + cv2.putText( + canvas, text, (text_x, table_header_y + 25), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (60, 70, 85), 1, cv2.LINE_AA, + ) + + line_y = table_header_y + self.__dims.header_row_height + cv2.line(canvas, (0, line_y), (self.__dims.total_width(), line_y), (200, 210, 220), 1) + + def __render_character_rows(self, canvas: np.ndarray, processed_chars: List[Path]) -> None: + y_offset = self.__dims.header_height + self.__dims.header_row_height + self.__dims.padding + bg_color = (250, 252, 255) + + for idx, char_dir in enumerate(processed_chars): + self.__render_single_row(canvas, char_dir, idx, y_offset, bg_color) + y_offset += self.__dims.row_height + + def __render_single_row( + self, canvas: np.ndarray, char_dir: Path, row_idx: int, y_offset: int, bg_color: Tuple[int, int, int], + ) -> None: + row_bg = (245, 248, 252) if row_idx % 2 == 0 else bg_color + + cv2.rectangle( + canvas, + (0, y_offset - self.__dims.padding), + (self.__dims.total_width(), y_offset + self.__dims.face_size + self.__dims.padding), + row_bg, -1, + ) + + char_name = char_dir.name.replace('_', ' ').title() + cv2.putText( + canvas, self.__ascii_safe(char_name), + (self.__dims.padding * 2, y_offset + self.__dims.face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, (30, 40, 50), 1, cv2.LINE_AA, + ) + + self.__render_character_stats(canvas, char_dir, y_offset) + self.__render_character_faces(canvas, 
char_dir, y_offset) + + def __render_character_stats(self, canvas: np.ndarray, char_dir: Path, y_offset: int) -> None: + metadata_file = char_dir / 'metadata.json' + if not metadata_file.exists(): + return + + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) + + similarity = metadata.get('average_similarity', 0.0) + method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') + faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) + + stats_x = self.__dims.label_col_width + self.__dims.padding + stats_y_base = y_offset + self.__dims.face_size // 2 - 30 + + sim_color = (0, 150, 0) if similarity >= self.__similarity_threshold else (180, 100, 0) + cv2.putText( + canvas, f'Similarity: {similarity:.4f}', (stats_x, stats_y_base), + cv2.FONT_HERSHEY_SIMPLEX, 0.45, sim_color, 1, cv2.LINE_AA, + ) + + method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) + cv2.putText( + canvas, f'Method: {method}', (stats_x, stats_y_base + 25), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, method_color, 1, cv2.LINE_AA, + ) + + faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' + cv2.putText( + canvas, f'Detected: {faces_str}', (stats_x, stats_y_base + 50), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, cv2.LINE_AA, + ) + + def __render_character_faces(self, canvas: np.ndarray, char_dir: Path, y_offset: int) -> None: + face_files = sorted(char_dir.glob('face_*.jpg')) + + for face_idx, face_file in enumerate(face_files[:self.__dims.faces_per_char]): + face_img = cv2.imread(str(face_file)) + if face_img is None: + continue + + face_resized = CharacterGridVisualizer.safe_resize( + face_img, + (self.__dims.face_size, self.__dims.face_size), + ) + if face_resized is None: + continue + + x = ( + self.__dims.label_col_width + + self.__dims.stats_col_width + + face_idx * self.__dims.face_col_width + + self.__dims.padding + ) + + canvas[y_offset:y_offset 
+ self.__dims.face_size, x:x + self.__dims.face_size] = face_resized + + cv2.rectangle( + canvas, + (x - 1, y_offset - 1), + (x + self.__dims.face_size + 1, y_offset + self.__dims.face_size + 1), + (180, 190, 200), 1, + ) + + def __render_footer(self, canvas: np.ndarray) -> None: + grid_height = canvas.shape[0] + footer_y = grid_height - self.__dims.footer_height + 20 + + cv2.line( + canvas, (0, footer_y - 20), (self.__dims.total_width(), footer_y - 20), + (200, 210, 220), 1, + ) + + norm_size = settings.character.normalized_face_size + footer_text = ( + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " + f"Model: {settings.face_recognition.model_name} | " + f"Normalized Size: {norm_size[0]}x{norm_size[1]}px" + ) + cv2.putText( + canvas, footer_text, (self.__dims.padding * 3, footer_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (120, 130, 140), 1, cv2.LINE_AA, + ) + + # Legend + legend_y = footer_y + 30 + legend_items = [ + ('Automatic: Face found on all references', (50, 120, 200)), + ('Manual: User-selected reference', (180, 100, 50)), + ] + + for idx, (text, color) in enumerate(legend_items): + x_pos = self.__dims.padding * 3 + idx * 380 + cv2.circle(canvas, (x_pos, legend_y - 3), 5, color, -1) + cv2.putText( + canvas, text, (x_pos + 15, legend_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, cv2.LINE_AA, + ) + + @staticmethod + def __save_grid_image(canvas: np.ndarray, output_path: Path) -> None: + cv2.imwrite(str(output_path), canvas, [cv2.IMWRITE_PNG_COMPRESSION, 6]) + + @staticmethod + def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: + metadata_all = [] + for char_dir in processed_chars: + metadata_file = char_dir / 'metadata.json' + if metadata_file.exists(): + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata_all.append(json.load(f)) + return metadata_all + + @staticmethod + def __calculate_avg_similarity(metadata_all: List[Dict[str, Any]]) -> float: + if not metadata_all: + return 0.0 + return 
float(np.mean([m.get('average_similarity', 0) for m in metadata_all])) + + @staticmethod + def __ascii_safe(text: str) -> str: + text = text.translate(str.maketrans('łŁ', 'lL')) + return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('ascii') + + @staticmethod + def safe_resize(img: np.ndarray, target_size: Tuple[int, int]) -> Optional[np.ndarray]: + if img is None or img.size == 0: + return None + if img.shape[0] == 0 or img.shape[1] == 0: + return None + try: + return cv2.resize(img, target_size) + except cv2.error: + return None diff --git a/preprocessor/services/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py new file mode 100644 index 000000000..137e6d0be --- /dev/null +++ b/preprocessor/services/scraping/reference_processor.py @@ -0,0 +1,258 @@ +from datetime import datetime +import json +import logging +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) +import warnings + +import cv2 +from insightface.app import FaceAnalysis +import numpy as np + +from preprocessor.config.settings_instance import settings +from preprocessor.services.characters.face_detection import FaceDetector +from preprocessor.services.characters.models import ( + CandidateFace, + FaceData, +) +from preprocessor.services.core.base_processor import ( + BaseProcessor, + OutputSpec, + ProcessingItem, +) +from preprocessor.services.scraping.grid_visualizer import CharacterGridVisualizer +from preprocessor.services.ui.console import console + +warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') + + +class CharacterReferenceProcessor(BaseProcessor): + def __init__(self, args: Dict[str, Any]) -> None: + super().__init__( + args=args, + class_name='CharacterReferenceProcessor', + error_exit_code=20, + loglevel=logging.INFO, + ) + self.__characters_dir: Path = args['characters_dir'] + self.__output_dir: Path = args['output_dir'] + self.__similarity_threshold: float = 
args['similarity_threshold'] + self.__interactive: bool = args['interactive'] + + self.__face_app: Optional[FaceAnalysis] = None + self.__visualizer = CharacterGridVisualizer(similarity_threshold=self.__similarity_threshold) + + def get_output_subdir(self) -> str: + return 'character_references' + + def _execute(self) -> None: + super()._execute() + self.__generate_validation_grid() + + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + char_output_dir = self.__output_dir / item.episode_id + return [ + OutputSpec(path=char_output_dir / 'metadata.json', required=True), + OutputSpec(path=char_output_dir / 'face_vector.npy', required=True), + ] + + def _get_processing_items(self) -> List[ProcessingItem]: + if not self.__characters_dir.exists(): + console.print(f'[red]Characters directory not found: {self.__characters_dir}[/red]') + return [] + + return [ + ProcessingItem( + episode_id=char_dir.name, + input_path=char_dir, + metadata={'char_name': char_dir.name}, + ) + for char_dir in sorted(self.__characters_dir.iterdir()) if char_dir.is_dir() + ] + + def _get_progress_description(self) -> str: + return 'Processing character references' + + def _load_resources(self) -> bool: + self.__face_app = FaceDetector.init() + return True + + def _validate_args(self, args: Dict[str, Any]) -> None: + required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] + for key in required: + if key not in args: + raise ValueError(f'Missing required argument: {key}') + + def _process_item(self, item: ProcessingItem, _missing_outputs: List[OutputSpec]) -> None: + char_dir = item.input_path + char_name = item.metadata['char_name'] + console.print(f'[blue]Processing character: {char_name}[/blue]') + + ref_images = sorted(char_dir.glob('*.jpg')) + if len(ref_images) < 2: + console.print(f'[yellow]Skipping {char_name}: need >=2 images, found {len(ref_images)}[/yellow]') + return + + all_faces = self.__detect_faces_in_references(ref_images) + if 
def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[FaceData]]:
    """Detect faces in every reference image.

    Returns one list per input path, in the same order. An image that
    ``cv2.imread`` cannot read (it returns ``None``) contributes an empty
    list, so list positions always line up with ``image_paths``.

    Each ``FaceData`` keeps the detector's bounding box as reported, while
    the stored crop is taken from the box clamped to the image area.
    """
    all_faces: List[List[FaceData]] = []
    for idx, img_path in enumerate(image_paths):
        img = cv2.imread(str(img_path))
        if img is None:
            all_faces.append([])
            continue

        height, width = img.shape[:2]
        faces_data: List[FaceData] = []
        for face in self.__face_app.get(img):
            bbox = face.bbox.astype(int)
            # Clamp before slicing: a negative coordinate (a box reaching past
            # the top/left border) would be read by NumPy as an offset from
            # the end of the axis and produce an empty or wrong crop.
            x1 = min(max(int(bbox[0]), 0), width)
            y1 = min(max(int(bbox[1]), 0), height)
            x2 = min(max(int(bbox[2]), 0), width)
            y2 = min(max(int(bbox[3]), 0), height)
            faces_data.append(
                FaceData(
                    bbox=bbox,
                    face_vector=face.normed_embedding,
                    source_image_path=img_path,
                    source_image_idx=idx,
                    face_img=img[y1:y2, x1:x2],
                ),
            )
        all_faces.append(faces_data)
    return all_faces
def __save_processed_references(
    self,
    char_name: str,
    selected_faces: List[FaceData],
    ref_images: List[Path],
    avg_similarity: float,
    faces_per_image: List[int],
) -> None:
    """Write the normalized face crops, mean face vector and metadata for a character.

    Output goes to ``<output_dir>/<char_name>/``: one ``face_NN.jpg`` per
    face whose crop could be normalized, ``face_vector.npy`` holding the
    mean of those faces' vectors, and ``metadata.json``.

    When no crop can be normalized nothing is written at all. Averaging an
    empty list would otherwise store a NaN vector and then fail while the
    metadata is built.
    """
    # Normalize first so that a character with no usable crop leaves no
    # directory or partial output behind.
    usable = []
    for idx, face_data in enumerate(selected_faces):
        norm_face = CharacterGridVisualizer.safe_resize(
            face_data.face_img,
            settings.character.normalized_face_size,
        )
        if norm_face is not None:
            usable.append((idx, norm_face, face_data.face_vector))

    if not usable:
        console.print(f'[yellow]Skipping {char_name}: no usable face crops to save[/yellow]')
        return

    char_out = self.__output_dir / char_name
    char_out.mkdir(parents=True, exist_ok=True)

    for idx, norm_face, _ in usable:
        cv2.imwrite(str(char_out / f'face_{idx:02d}.jpg'), norm_face)

    mean_vector = np.mean([vector for _, _, vector in usable], axis=0)
    np.save(char_out / 'face_vector.npy', mean_vector)

    metadata = self.__create_metadata(
        char_name, selected_faces, ref_images, mean_vector, avg_similarity, faces_per_image,
    )
    with open(char_out / 'metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
class ElasticsearchQueries:
    """Search and aggregation queries over the indices derived from ``index_base``.

    Index names are built as ``<index_base>_text_segments``,
    ``<index_base>_text_embeddings``, ``<index_base>_video_frames`` and
    ``<index_base>_episode_names``. Every query method takes the
    ``AsyncElasticsearch`` client to use as its first argument; the class
    itself holds no connection.
    """

    def __init__(self, embedding_service: EmbeddingService, index_base: str) -> None:
        self.__embedding_service = embedding_service
        self.__index_base = index_base

    async def get_stats(self, es_client: AsyncElasticsearch) -> Dict[str, int]:
        """Return the document count of each of the four indices."""
        return {
            'segments': (await es_client.count(index=self.__segments_index))['count'],
            'text_embeddings': (await es_client.count(index=self.__text_embeddings_index))['count'],
            'video_embeddings': (await es_client.count(index=self.__video_frames_index))['count'],
            'episode_names': (await es_client.count(index=self.__episode_names_index))['count'],
        }

    async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]:
        """Return ``(name, doc_count)`` pairs for ``character_appearances.name`` in the video-frames index."""
        return await self.__list_nested_terms(es_client, self.__video_frames_index, 'character_appearances', 'name')

    async def list_objects(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]:
        """Return ``(class, doc_count)`` pairs for ``detected_objects.class`` in the video-frames index."""
        return await self.__list_nested_terms(es_client, self.__video_frames_index, 'detected_objects', 'class')

    async def search_by_emotion(
        self,
        es_client: AsyncElasticsearch,
        emotion: str,
        season: Optional[int] = None,
        episode: Optional[int] = None,
        character: Optional[str] = None,
        limit: int = 20,
    ) -> Dict[str, Any]:
        """Find video frames with a character appearance showing ``emotion``.

        Results are sorted by descending emotion confidence of the matching
        nested appearance. ``season``, ``episode`` and ``character`` narrow
        the search when given.
        """
        nested_must = [{'term': {'character_appearances.emotion.label': emotion}}]
        if character:
            nested_must.append({'term': {'character_appearances.name': character}})

        must_clauses = [{'nested': {'path': 'character_appearances', 'query': {'bool': {'must': nested_must}}}}]
        must_clauses.extend(self.__build_episode_filters(season, episode))

        # The sort needs its own nested filter so that the confidence is taken
        # from the appearance that matched, not from any appearance in the frame.
        nested_filter = self.__build_nested_filter(emotion, character)

        return await es_client.search(
            index=self.__video_frames_index,
            query={'bool': {'must': must_clauses}},
            sort=[{
                'character_appearances.emotion.confidence': {
                    'order': 'desc',
                    'nested': {'path': 'character_appearances', 'filter': nested_filter},
                },
            }],
            track_scores=True,
            size=limit,
            _source=[
                'episode_id', 'frame_number', 'timestamp', 'video_path',
                'episode_metadata', 'character_appearances', 'scene_info',
            ],
        )

    async def search_video_semantic(
        self,
        es_client: AsyncElasticsearch,
        image_path: str,
        season: Optional[int] = None,
        episode: Optional[int] = None,
        character: Optional[str] = None,
        limit: int = 10,
    ) -> Dict[str, Any]:
        """Run a kNN search on ``video_embedding`` using the embedding of ``image_path``."""
        # EmbeddingService only exposes the batch API, so embed a one-item
        # batch and take its single result.
        embedding = self.__embedding_service.get_image_embeddings_batch([image_path])[0]
        return await self.__execute_knn_query(
            es_client, self.__video_frames_index, 'video_embedding', embedding,
            limit, season, episode, character,
        )

    async def __execute_knn_query(
        self, es_client: AsyncElasticsearch, index: str, field: str, vector: List[float],
        limit: int, season: Optional[int], episode: Optional[int], character: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Execute a kNN search with ``k = limit`` and ``num_candidates = limit * 10``."""
        filters = self.__build_episode_filters(season, episode)
        if character:
            filters.append({
                'nested': {
                    'path': 'character_appearances',
                    'query': {'term': {'character_appearances.name': character}},
                },
            })

        knn: Dict[str, Any] = {
            'field': field,
            'query_vector': vector,
            'k': limit,
            'num_candidates': limit * 10,
        }
        # Only send a filter when there is one; an explicit null is not a
        # valid filter clause.
        if filters:
            knn['filter'] = filters
        return await es_client.search(index=index, knn=knn, size=limit)

    @staticmethod
    def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]:
        """Build term filters on ``episode_metadata`` for the values that are not ``None``."""
        filters = []
        if season is not None:
            filters.append({'term': {'episode_metadata.season': season}})
        if episode is not None:
            filters.append({'term': {'episode_metadata.episode_number': episode}})
        return filters

    @staticmethod
    def __build_nested_filter(emotion: str, character: Optional[str]) -> Dict[str, Any]:
        """Build the nested sort filter: emotion label, plus character name when given."""
        if not character:
            return {'term': {'character_appearances.emotion.label': emotion}}
        return {
            'bool': {
                'must': [
                    {'term': {'character_appearances.emotion.label': emotion}},
                    {'term': {'character_appearances.name': character}},
                ],
            },
        }

    @staticmethod
    async def __list_nested_terms(es_client: AsyncElasticsearch, index: str, path: str, field: str) -> List[
        Tuple[str, int]
    ]:
        """Aggregate up to 1000 distinct values of ``<path>.<field>`` with their document counts."""
        result = await es_client.search(
            index=index,
            size=0,
            aggs={
                'nested_path': {
                    'nested': {'path': path},
                    'aggs': {
                        'terms_agg': {'terms': {'field': f'{path}.{field}', 'size': 1000}},
                    },
                },
            },
        )
        buckets = result['aggregations']['nested_path']['terms_agg']['buckets']
        return [(b['key'], b['doc_count']) for b in buckets]

    @property
    def __episode_names_index(self) -> str:
        return f'{self.__index_base}_episode_names'

    @property
    def __segments_index(self) -> str:
        return f'{self.__index_base}_text_segments'

    @property
    def __text_embeddings_index(self) -> str:
        return f'{self.__index_base}_text_embeddings'

    @property
    def __video_frames_index(self) -> str:
        return f'{self.__index_base}_video_frames'
def get_image_embeddings_batch(self, image_paths: List[Union[str, Path]]) -> List[List[float]]:
    """Embed a batch of images and return one embedding per path, in order.

    Each image is opened, converted to RGB and paired with the configured
    image placeholder prompt before being passed to the embedding model.
    """
    placeholder = settings.embedding_model.image_placeholder
    inputs: List[Dict[str, Any]] = []
    for path in image_paths:
        # Close the source file as soon as the RGB copy exists; without the
        # context manager every opened image keeps its file handle until it
        # is garbage-collected.
        with Image.open(str(path)) as source:
            rgb_image = source.convert('RGB')
        inputs.append({
            'prompt': f'{placeholder}\nDescribe this image.',
            'multi_modal_data': {'image': rgb_image},
        })
    return self.__embed(inputs)
mode 100644 index 000000000..8f3c9b888 --- /dev/null +++ b/preprocessor/services/search/clients/hash_service.py @@ -0,0 +1,35 @@ +from pathlib import Path +from typing import ( + Optional, + Union, +) + +from PIL import Image +import click +import torch + +from preprocessor.services.video.image_hasher import PerceptualHasher + + +class HashService: + def __init__(self) -> None: + self.__hasher: Optional[PerceptualHasher] = None + + def get_perceptual_hash(self, image_path: Union[str, Path]) -> Optional[str]: + hasher = self.__get_hasher() + with Image.open(image_path) as img: + rgb_img = img.convert('RGB') + hashes = hasher.compute_phash_batch([rgb_img]) + return hashes[0] if hashes else None + + def __get_hasher(self) -> PerceptualHasher: + if self.__hasher is None: + click.echo('Loading perceptual hasher...', err=True) + self.__hasher = PerceptualHasher(device='cuda', hash_size=8) + return self.__hasher + + def cleanup(self) -> None: + if self.__hasher: + self.__hasher = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/preprocessor/services/search/clients/result_formatters.py b/preprocessor/services/search/clients/result_formatters.py new file mode 100644 index 000000000..384340ac0 --- /dev/null +++ b/preprocessor/services/search/clients/result_formatters.py @@ -0,0 +1,65 @@ +from typing import ( + Any, + Dict, + Optional, +) + +import click + +from preprocessor.config.types import ( + ElasticsearchAggregationKeys, + ElasticsearchKeys, + EpisodeMetadataKeys, +) + + +class ResultFormatter: + @staticmethod + def format_timestamp(seconds: float) -> str: + return f'{int(seconds // 60)}m {seconds % 60:.1f}s' + + @staticmethod + def print_results(result: Dict[str, Any], result_type: str = 'text') -> None: + hits_data = result[ElasticsearchKeys.HITS] + total = hits_data[ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] + hits = hits_data[ElasticsearchKeys.HITS] + + click.echo(f'\nResults found: {total}') + click.echo('=' * 80) + + 
for i, hit in enumerate(hits, 1): + source = hit[ElasticsearchKeys.SOURCE] + meta = source[EpisodeMetadataKeys.EPISODE_METADATA] + + click.echo(f'\n[{i}] Score: {hit[ElasticsearchKeys.SCORE]:.2f}') + click.echo(f"Episode: S{meta['season']:02d}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") + + ResultFormatter.__print_specific_content(source, result_type) + click.echo(f"Video: {source['video_path']}") + + @staticmethod + def __print_specific_content(source: Dict[str, Any], r_type: str) -> None: + scene_ctx = ResultFormatter.__get_scene_ctx(source.get('scene_info')) + + if r_type == 'text': + click.echo( + f"Time: {ResultFormatter.format_timestamp(source['start_time'])} - {ResultFormatter.format_timestamp(source['end_time'])}{scene_ctx}", + ) + click.echo(f"Speaker: {source.get('speaker', 'N/A')}\nText: {source['text']}") + elif r_type == 'text_semantic': + click.echo(f"Range: {source['segment_range']}{scene_ctx}\nText: {source['text']}") + else: + ts = ResultFormatter.format_timestamp(source['timestamp']) + click.echo(f"Frame: {source['frame_number']} @ {ts}{scene_ctx}") + if source.get('character_appearances'): + click.echo(f"Characters: {ResultFormatter.__fmt_chars(source['character_appearances'])}") + + @staticmethod + def __get_scene_ctx(info: Optional[Dict[str, Any]]) -> str: + if not info: + return '' + return f" [Scene {info.get('scene_number')}: {ResultFormatter.format_timestamp(info.get('scene_start_time', 0))}]" + + @staticmethod + def __fmt_chars(appearances: list) -> str: + return ', '.join([f"{c['name']} ({c['emotion']['label']})" for c in appearances if 'emotion' in c]) diff --git a/preprocessor/services/search/elasticsearch.py b/preprocessor/services/search/elasticsearch.py new file mode 100644 index 000000000..6390d4a82 --- /dev/null +++ b/preprocessor/services/search/elasticsearch.py @@ -0,0 +1,84 @@ +from typing import ( + Any, + Dict, + List, + Optional, +) + +from elasticsearch import AsyncElasticsearch +import urllib3 + 
class ElasticsearchWrapper:
    """Async helper bound to a single Elasticsearch index.

    The client is created lazily on first use. With ``dry_run=True`` no
    client is created and no request is sent: write operations return
    immediately and ``index_exists`` reports ``False``.
    """

    def __init__(
        self,
        index_name: str,
        host: str = 'localhost:9200',
        dry_run: bool = False,
    ) -> None:
        self.__index_name = index_name
        self.__host = host
        self.__dry_run = dry_run
        self.__client: Optional[AsyncElasticsearch] = None

    @property
    def index_name(self) -> str:
        """Name of the index this wrapper operates on."""
        return self.__index_name

    async def bulk_index(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Index ``documents`` with one bulk request.

        Returns the client's bulk response. In dry-run mode returns
        ``{'indexed': len(documents), 'errors': []}``. If the request raises,
        the exception is not propagated; ``{'errors': [<message>]}`` is
        returned instead.
        """
        if self.__dry_run:
            return {'indexed': len(documents), 'errors': []}

        client = await self.__ensure_client()
        actions = self.__build_bulk_actions(documents)

        try:
            response = await client.bulk(operations=actions)
            return response
        except Exception as e:
            # Best-effort by design: failures are reported to the caller as data.
            return {'errors': [str(e)]}

    async def create_index(self, mapping: Dict[str, Any]) -> None:
        """Create the index with ``mapping`` as the request body (no-op in dry-run mode)."""
        if self.__dry_run:
            return

        client = await self.__ensure_client()
        await client.indices.create(index=self.__index_name, body=mapping)

    async def delete_index(self) -> None:
        """Delete the index if it exists (no-op in dry-run mode)."""
        if self.__dry_run:
            return

        client = await self.__ensure_client()
        if await client.indices.exists(index=self.__index_name):
            await client.indices.delete(index=self.__index_name)

    async def index_exists(self) -> bool:
        """Return whether the index exists; always ``False`` in dry-run mode."""
        if self.__dry_run:
            return False

        client = await self.__ensure_client()
        # Coerce to a plain bool so the return value matches the annotation
        # whatever response object the client hands back.
        return bool(await client.indices.exists(index=self.__index_name))

    async def close(self) -> None:
        """Close the client if one was created and forget it."""
        if self.__client is not None:
            await self.__client.close()
            self.__client = None

    async def __ensure_client(self) -> AsyncElasticsearch:
        """Return the client, creating it on first use."""
        if self.__client is None:
            # NOTE(review): certificate verification is disabled here, and the
            # module also silences the matching urllib3 warning.
            self.__client = AsyncElasticsearch(
                [self.__host],
                verify_certs=False,
                ssl_show_warn=False,
            )
        return self.__client

    def __build_bulk_actions(self, documents: List[Dict[str, Any]]) -> List[Any]:
        """Interleave an ``index`` action line with each document, as the bulk API expects."""
        return [
            entry
            for doc in documents
            for entry in ({'index': {'_index': self.__index_name}}, doc)
        ]
class EmbeddingModelWrapper:
    """Facade over ``EmbeddingService`` for text and image embeddings.

    ``_device`` and ``_batch_size`` are accepted by the constructor but not
    used by it.
    """

    def __init__(
        self,
        model_name: str,
        _device: str = 'cuda',
        _batch_size: int = 8,
    ) -> None:
        self.__service = EmbeddingService(model_name=model_name)

    def load_model(self) -> None:
        """Make sure the underlying embedding service has its model loaded."""
        self.__service.ensure_loaded()

    def cleanup(self) -> None:
        """Release the underlying embedding service's resources."""
        self.__service.cleanup()

    def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]:
        """Embed one string (returns a single vector) or a list of strings (returns a list of vectors)."""
        is_single = isinstance(text, str)
        batch = [text] if is_single else text
        vectors = self.__service.get_text_embeddings_batch(batch)
        if is_single:
            return vectors[0]
        return vectors

    def encode_images(self, image_paths: List[str]) -> List[np.ndarray]:
        """Embed the images at ``image_paths`` and return one NumPy array per image."""
        return list(map(np.array, self.__service.get_image_embeddings_batch(image_paths)))
def _process(self, input_data: SourceVideo, context: ExecutionContext) -> TranscriptionData:
    """Import the external transcription for one episode and store the converted JSON.

    The source file is located from the episode info, converted according to
    the configured format, enriched with the episode metadata and written to
    the cache path for this input.

    Raises:
        FileNotFoundError: when no transcription file matches the episode.
    """
    episode_info = input_data.episode_info

    source_file = self.__find_transcription_file(episode_info)
    if source_file is None:
        raise FileNotFoundError(
            f'No transcription file found for {input_data.episode_id} in {self.config.source_dir}',
        )

    output_path = self._get_cache_path(input_data, context)

    # Load -> convert -> attach episode metadata -> persist.
    converted = self.__convert_data(self.__load_json(source_file), source_file)
    converted['episode_info'] = EpisodeManager.get_metadata(episode_info)
    self.__save_converted_data(output_path, converted)

    context.logger.info(f'Imported {input_data.episode_id} from {source_file.name}')

    transcription_meta = converted.get('transcription', {})
    language = transcription_meta.get('language_code', 'pl')
    model = transcription_meta.get('format', '11labs')
    return TranscriptionData(
        episode_id=input_data.episode_id,
        episode_info=episode_info,
        path=output_path,
        language=language,
        model=model,
        format='json',
    )
episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + language='pl', + model='11labs', + format='json', + ) + + def __find_transcription_file(self, episode_info: EpisodeInfo) -> Optional[Path]: + file_season = self.__resolve_file_season(episode_info.season) + ep = episode_info.relative_episode + pattern = ( + f'*S{file_season:02d}E{ep:02d}*_segmented.json' + if self.config.format_type == '11labs_segmented' + else f'*S{file_season:02d}E{ep:02d}*.json' + ) + files = sorted(self.config.source_dir.rglob(pattern)) + return files[0] if files else None + + def __resolve_file_season(self, target_season: int) -> int: + for file_season_str, mapped_season in self.config.season_remap.items(): + if mapped_season == target_season: + return int(file_season_str) + return target_season + + def __convert_data(self, data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + if self.config.format_type == '11labs_segmented': + return self.__convert_11labs_segmented(data, source_file) + if self.config.format_type == '11labs': + return self.__convert_11labs_full(data, source_file) + raise ValueError(f'Unknown format type: {self.config.format_type}') + + @staticmethod + def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + segments: List[Dict[str, Any]] = [] + words = data.get('words', []) + current_seg: Dict[str, Any] = { + 'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown', + } + + for word in words: + if current_seg['start'] is None: + current_seg['start'] = word.get('start') + + current_seg['words'].append(word) + current_seg['end'] = word.get('end') + + if word.get('text', '').endswith(('.', '!', '?')) or len(current_seg['words']) >= 20: + current_seg['text'] = ' '.join(w.get('text', '') for w in current_seg['words']) + segments.append(dict(current_seg)) + current_seg = { + 'words': [], 'start': None, 'end': None, 'text': '', + 'speaker': word.get('speaker_id', 'unknown'), + } + + if 
current_seg['words']: + current_seg['text'] = ' '.join(w.get('text', '') for w in current_seg['words']) + segments.append(current_seg) + + for i, seg in enumerate(segments): + seg['id'] = i + + return { + 'transcription': { + 'format': '11labs', + 'source_file': source_file.name, + 'language_code': data.get('language_code', 'pol'), + 'language_probability': data.get('language_probability', 1.0), + }, + 'segments': segments, + } + + @staticmethod + def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + segments = [] + for i, segment in enumerate(data.get('segments', [])): + segments.append({ + 'id': i, + 'start': segment.get('start'), + 'end': segment.get('end'), + 'text': segment.get('text', ''), + 'speaker': segment.get('speaker', 'unknown'), + 'words': segment.get('words', []), + }) + return { + 'transcription': { + 'format': '11labs_segmented', + 'source_file': source_file.name, + 'language_code': 'pol', + }, + 'segments': segments, + } + + @staticmethod + def __load_json(file_path: Path) -> Dict[str, Any]: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + + @staticmethod + def __save_converted_data(output_path: Path, data: Dict[str, Any]) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/services/text/language_config.py b/preprocessor/services/text/language_config.py new file mode 100644 index 000000000..83343d61b --- /dev/null +++ b/preprocessor/services/text/language_config.py @@ -0,0 +1,32 @@ +from dataclasses import dataclass +from typing import Set + + +@dataclass(frozen=True) +class LanguageConfig: + consonants: Set[str] + punctuation: Set[str] + special_chars: Set[str] + vowels: Set[str] + + +_PUNCTUATION = set('.,;:!?…-—–()[]{}"\'«»„\'\'') +_SPECIAL_CHARS = set('@#$%^&*+=<>|\\/_~`') +_ENGLISH_VOWELS = set('aeiouAEIOU') +_ENGLISH_CONSONANTS = 
@dataclass
class TextStatistics:  # pylint: disable=too-many-instance-attributes  # Data structure for comprehensive text statistics - all attributes necessary
    """Character-, word- and n-gram-level statistics for a piece of text.

    Build instances through :meth:`from_text` or :meth:`from_file`.
    Constructing the dataclass directly only stores ``text``/``language`` and
    leaves every statistic at its zero/empty default.
    """

    text: str
    language: str = 'pl'

    avg_sentence_length: float = 0.0
    avg_word_length: float = 0.0
    bigrams: List[Dict[str, Any]] = field(default_factory=list)
    chars_without_spaces: int = 0
    consonants: int = 0
    digits: int = 0
    empty_lines: int = 0
    letter_frequency: Dict[str, int] = field(default_factory=dict)
    letters: int = 0
    lines: int = 0
    paragraphs: int = 0
    punctuation_marks: int = 0
    sentences: int = 0
    spaces: int = 0
    special_characters: int = 0
    symbols: int = 0
    total_chars: int = 0
    trigrams: List[Dict[str, Any]] = field(default_factory=list)
    type_token_ratio: float = 0.0
    unique_words: int = 0
    vowels: int = 0
    word_frequency: List[Dict[str, Any]] = field(default_factory=list)
    words: int = 0

    @classmethod
    def from_file(cls, file_path: Path, language: str = 'pl') -> 'TextStatistics':
        """Read ``file_path`` as UTF-8 and return fully computed statistics."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return cls.from_text(f.read(), language=language)

    @classmethod
    def from_text(cls, text: str, language: str = 'pl') -> 'TextStatistics':
        """Return fully computed statistics for an in-memory string."""
        stats = cls(text=text, language=language)
        stats.__process_calculations()
        return stats

    def to_dict(self) -> Dict[str, Any]:
        """Return the statistics grouped into a JSON-serialisable dictionary."""
        return {
            'basic_statistics': {
                'sentences': self.sentences,
                'lines': self.lines,
                'paragraphs': self.paragraphs,
                'empty_lines': self.empty_lines,
                'words': self.words,
                'letters': self.letters,
                'digits': self.digits,
                'symbols': self.symbols,
                'punctuation_marks': self.punctuation_marks,
                'special_characters': self.special_characters,
                'chars_without_spaces': self.chars_without_spaces,
                'spaces': self.spaces,
                'total_chars': self.total_chars,
                'vowels': self.vowels,
                'consonants': self.consonants,
            },
            'advanced_statistics': {
                'unique_words': self.unique_words,
                'avg_word_length': self.avg_word_length,
                'avg_sentence_length': self.avg_sentence_length,
                'type_token_ratio': self.type_token_ratio,
            },
            'letter_frequency': self.letter_frequency,
            'word_frequency': self.word_frequency,
            'bigrams': self.bigrams,
            'trigrams': self.trigrams,
        }

    def __process_calculations(self) -> None:  # pylint: disable=unused-private-member  # Called in from_file and from_text via name mangling - false positive
        """Run every statistics pass; the text is tokenized exactly once."""
        self.__calculate_structural_stats()
        self.__calculate_character_distribution()
        # Tokenize once and share the result: both the lexical pass and the
        # n-gram pass need the same word list.
        words = self.__extract_words()
        self.__calculate_lexical_stats(words)
        self.__generate_n_grams(words)

    def __calculate_structural_stats(self) -> None:
        """Count lines, paragraphs, sentences and whitespace."""
        lines = self.text.split('\n')
        self.lines = len(lines)
        self.empty_lines = sum(1 for line in lines if not line.strip())

        # Paragraphs are blocks separated by a blank line; blank blocks are ignored.
        self.paragraphs = sum(1 for block in self.text.split('\n\n') if block.strip())

        # A sentence ends with one or more terminators followed by whitespace or end of text.
        self.sentences = len(re.findall(r'[.!?…]+(?:\s|$)', self.text))
        self.total_chars = len(self.text)
        self.spaces = self.text.count(' ') + self.text.count('\t') + self.text.count('\n')
        self.chars_without_spaces = self.total_chars - self.spaces

    def __calculate_character_distribution(self) -> None:
        """Classify every character and build the letter frequency table."""
        config = POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG
        letter_counter: Counter = Counter()

        # Count into locals and assign at the end so that running this pass
        # again overwrites the totals instead of accumulating onto them.
        letters = vowels = consonants = digits = 0
        punctuation_marks = special_characters = symbols = 0

        for char in self.text:
            if char.isalpha():
                letters += 1
                letter_counter[char.lower()] += 1
                if char in config.vowels:
                    vowels += 1
                elif char in config.consonants:
                    consonants += 1
            elif char.isdigit():
                digits += 1
            elif char in config.punctuation:
                punctuation_marks += 1
            elif char in config.special_chars:
                special_characters += 1
            elif not char.isspace():
                symbols += 1

        self.letters = letters
        self.vowels = vowels
        self.consonants = consonants
        self.digits = digits
        self.punctuation_marks = punctuation_marks
        self.special_characters = special_characters
        self.symbols = symbols
        self.letter_frequency = dict(letter_counter.most_common())

    def __calculate_lexical_stats(self, words: List[str]) -> None:
        """Derive word counts, averages and the 50 most frequent words.

        Relies on ``self.sentences`` having been set by the structural pass.
        """
        self.words = len(words)

        if self.words > 0:
            word_counter = Counter(words)
            self.unique_words = len(word_counter)
            self.type_token_ratio = round(self.unique_words / self.words, 4)

            self.avg_word_length = round(sum(len(w) for w in words) / self.words, 2)
            self.word_frequency = [{'word': w, 'count': c} for w, c in word_counter.most_common(50)]

        if self.sentences > 0:
            self.avg_sentence_length = round(self.words / self.sentences, 2)

    def __generate_n_grams(self, words: List[str]) -> None:
        """Store the 25 most frequent bigrams and trigrams of ``words``."""
        if len(words) >= 2:
            bigrams = Counter(zip(words[:-1], words[1:]))
            self.bigrams = [{'bigram': f'{w1} {w2}', 'count': c} for (w1, w2), c in bigrams.most_common(25)]

        if len(words) >= 3:
            trigrams = Counter(zip(words[:-2], words[1:-1], words[2:]))
            self.trigrams = [{'trigram': f'{w1} {w2} {w3}', 'count': c} for (w1, w2, w3), c in trigrams.most_common(25)]

    def __extract_words(self) -> List[str]:
        """Return the lower-cased ``\\w+`` tokens of the text, in order."""
        return re.findall(r'\b\w+\b', self.text.lower())
b/preprocessor/services/transcription/__init__.py new file mode 100644 index 000000000..21b0091d6 --- /dev/null +++ b/preprocessor/services/transcription/__init__.py @@ -0,0 +1,23 @@ +from preprocessor.services.transcription.generators.json_generator import JsonGenerator +from preprocessor.services.transcription.processors.audio_normalizer import AudioNormalizer +from preprocessor.services.transcription.processors.episode_info_processor import EpisodeInfoProcessor +from preprocessor.services.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor +from preprocessor.services.transcription.sound_classification import ( + classify_segment, + is_sound_event, +) +from preprocessor.services.transcription.utils import ( + TranscriptionUtils, + WhisperUtils, +) + +__all__ = [ + 'JsonGenerator', + 'AudioNormalizer', + 'EpisodeInfoProcessor', + 'NormalizedAudioProcessor', + 'classify_segment', + 'is_sound_event', + 'TranscriptionUtils', + 'WhisperUtils', +] diff --git a/preprocessor/cli_utils/__init__.py b/preprocessor/services/transcription/engines/__init__.py similarity index 100% rename from preprocessor/cli_utils/__init__.py rename to preprocessor/services/transcription/engines/__init__.py diff --git a/preprocessor/transcription/engines/base_engine.py b/preprocessor/services/transcription/engines/base_engine.py similarity index 87% rename from preprocessor/transcription/engines/base_engine.py rename to preprocessor/services/transcription/engines/base_engine.py index 7ef4474fc..0ac724ca6 100644 --- a/preprocessor/transcription/engines/base_engine.py +++ b/preprocessor/services/transcription/engines/base_engine.py @@ -10,10 +10,13 @@ class TranscriptionEngine(ABC): - @abstractmethod - def transcribe(self, audio_path: Path) -> Dict[str, Any]: + def cleanup(self) -> None: pass @abstractmethod def get_name(self) -> str: pass + + @abstractmethod + def transcribe(self, audio_path: Path) -> Dict[str, Any]: + pass diff --git 
a/preprocessor/services/transcription/engines/elevenlabs_engine.py b/preprocessor/services/transcription/engines/elevenlabs_engine.py new file mode 100644 index 000000000..15d18f368 --- /dev/null +++ b/preprocessor/services/transcription/engines/elevenlabs_engine.py @@ -0,0 +1,149 @@ +import json +from pathlib import Path +import time +from typing import ( + Any, + Dict, + List, + Optional, +) + +from elevenlabs.client import ElevenLabs +from elevenlabs.core import ApiError + +from preprocessor.config.settings_instance import settings +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine +from preprocessor.services.ui.console import console + + +class ElevenLabsEngine(TranscriptionEngine): + def __init__( + self, + logger: ErrorHandlingLogger, + model_id: Optional[str] = None, + language_code: Optional[str] = None, + diarize: Optional[bool] = None, + polling_interval: Optional[int] = None, + ) -> None: + self.__validate_api_key() + + self.__client = ElevenLabs(api_key=settings.elevenlabs.api_key) + self.__logger = logger + self.__model_id = model_id or settings.elevenlabs.model_id + self.__language_code = language_code or settings.elevenlabs.language_code + self.__diarize = diarize if diarize is not None else settings.elevenlabs.diarize + self.__polling_interval = polling_interval or settings.elevenlabs.polling_interval + + self.__additional_formats: List[Dict[str, Any]] = [ + {'format': 'srt'}, + { + 'format': 'segmented_json', + 'include_speakers': True, + 'include_timestamps': True, + 'segment_on_silence_longer_than_s': 0.5, + 'max_segment_duration_s': 10.0, + 'max_segment_chars': 200, + }, + ] + + def get_name(self) -> str: + return 'ElevenLabs' + + def transcribe(self, audio_path: Path) -> Dict[str, Any]: + console.print(f'[cyan]Transcribing with ElevenLabs: {audio_path.name}[/cyan]') + + if not audio_path.exists(): + raise FileNotFoundError(f'Audio file not 
found: {audio_path}') + + job_id = self.__submit_job(audio_path) + raw_result = self.__poll_for_results(job_id) + + console.print(f'[green]Transcription completed: {audio_path.name}[/green]') + return self.__convert_to_unified_format(raw_result) + + def __submit_job(self, audio_path: Path) -> str: + try: + with open(audio_path, 'rb') as audio_file: + audio_data = audio_file.read() + + response = self.__client.speech_to_text.convert( + file=audio_data, + model_id=self.__model_id, + language_code=self.__language_code, + tag_audio_events=True, + timestamps_granularity='character', + diarize=self.__diarize, + num_speakers=32, + use_multi_channel=False, + additional_formats=self.__additional_formats, + webhook=True, + ) + self.__logger.info(f'Job submitted. ID: {response.transcription_id}') + return response.transcription_id + except ApiError as e: + self.__logger.error(f'API error during job submission: {e.body}') + raise + + def __poll_for_results(self, transcription_id: str) -> Any: + self.__logger.info(f'Polling for results (ID: {transcription_id})...') + max_attempts = settings.elevenlabs.max_attempts + + for _attempt in range(max_attempts): + try: + result = self.__client.speech_to_text.transcripts.get(transcription_id=transcription_id) + self.__logger.info('Transcription ready!') + return result + except ApiError as e: + if e.status_code == 404: + time.sleep(self.__polling_interval) + else: + self.__logger.error(f'API error during polling: {e.body}') + raise + + raise TimeoutError(f'Transcription timeout after {max_attempts} attempts') + + @staticmethod + def __convert_to_unified_format(result: Any) -> Dict[str, Any]: + unified_data = { + 'text': result.text, + 'language_code': result.language_code, + 'segments': [], + } + + if not result.additional_formats: + return unified_data + + for fmt in result.additional_formats: + if fmt.requested_format == 'segmented_json': + segmented_data = json.loads(fmt.content) + for seg in segmented_data.get('segments', []): + 
class WhisperEngine(TranscriptionEngine):
    """Transcription engine backed by a CUDA-hosted faster-whisper model.

    Audio longer than ``max_chunk_duration_seconds`` is cut into consecutive
    chunks with ffmpeg, each chunk is transcribed separately, and segment/word
    timestamps are shifted back onto the timeline of the original file.
    """

    def __init__(
        self,
        model_name: str = 'large-v3-turbo',
        language: str = 'Polish',
        device: str = 'cuda',
        beam_size: int = 10,
        temperature: float = 0.0,
        max_chunk_duration_seconds: int = 1800,
    ) -> None:
        """Validate the configuration and load the model.

        Raises:
            ValueError: if ``device`` is not ``'cuda'`` or the chunk length is
                not a positive number of seconds.
        """
        # Fail before the expensive model load when the configuration is unusable.
        if device != 'cuda':
            raise ValueError(f'Whisper acceleration requires CUDA, got: {device}')
        if max_chunk_duration_seconds <= 0:
            raise ValueError(
                f'max_chunk_duration_seconds must be positive, got: {max_chunk_duration_seconds}',
            )

        self.__model_name = model_name
        self.__language = language
        self.__device = device
        self.__beam_size = beam_size
        self.__temperature = temperature
        self.__max_chunk_duration_seconds = max_chunk_duration_seconds

        self.__model: Optional[WhisperModel] = self.__load_model()

    def cleanup(self) -> None:
        """Drop the model reference and release cached GPU memory."""
        console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]')
        if self.__model:
            del self.__model
            self.__model = None

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        console.print('[green]Whisper model unloaded, GPU memory cleared[/green]')

    def get_name(self) -> str:
        """Return the engine name including the model identifier."""
        return f'Whisper-{self.__model_name}'

    def transcribe(self, audio_path: Path) -> Dict[str, Any]:
        """Transcribe ``audio_path``, chunking it first when it is too long.

        Raises:
            FileNotFoundError: if ``audio_path`` does not exist.
            RuntimeError: if the model was already unloaded via ``cleanup``.
        """
        console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]')

        if not audio_path.exists():
            raise FileNotFoundError(f'Audio file not found: {audio_path}')
        if not self.__model:
            raise RuntimeError('Whisper model not loaded.')

        duration = self.__get_duration(audio_path)
        if duration > self.__max_chunk_duration_seconds:
            # Report the number of chunks that will really be processed.
            n_chunks = len(self.__compute_chunk_starts(duration, self.__max_chunk_duration_seconds))
            console.print(
                f'[yellow]Long audio ({duration/3600:.1f}h), splitting into {n_chunks} chunks '
                f'of {self.__max_chunk_duration_seconds//60}min each[/yellow]',
            )
            result = self.__transcribe_chunked(audio_path, duration)
        else:
            result = self.__transcribe_single(audio_path)

        console.print(f'[green]Transcription completed: {audio_path.name}[/green]')
        return result

    @staticmethod
    def __compute_chunk_starts(total_duration: float, chunk_seconds: int) -> List[int]:
        """Return the start offset (seconds) of every chunk covering the audio.

        The loop compares against the un-truncated duration, so a fractional
        tail (e.g. the last 0.5 s of a 1800.5 s file) still gets its own chunk.
        """
        starts: List[int] = []
        start = 0
        while start < total_duration:
            starts.append(start)
            start += chunk_seconds
        return starts

    def __transcribe_chunked(self, audio_path: Path, total_duration: float) -> Dict[str, Any]:
        """Transcribe the file chunk by chunk and merge the partial results."""
        chunk_starts = self.__compute_chunk_starts(total_duration, self.__max_chunk_duration_seconds)
        all_segments: List[Dict[str, Any]] = []
        text_parts: List[str] = []
        language: Optional[str] = None

        id_offset = 0
        with tempfile.TemporaryDirectory() as tmpdir:
            for i, start in enumerate(chunk_starts):
                end = min(start + self.__max_chunk_duration_seconds, total_duration)
                chunk_path = Path(tmpdir) / f'chunk_{i:04d}.wav'

                console.print(
                    f'[cyan]Chunk {i+1}/{len(chunk_starts)}: '
                    f'{start/3600:.2f}h - {end/3600:.2f}h[/cyan]',
                )
                self.__extract_audio_chunk(audio_path, chunk_path, start, end)

                chunk_result = self.__transcribe_single(chunk_path)

                # The language reported for the first chunk is used for the whole file.
                if language is None:
                    language = chunk_result.get('language')

                # Chunk timestamps are relative to the chunk; shift them back
                # onto the original timeline and keep segment ids unique.
                offset = float(start)
                chunk_segments = chunk_result.get('segments', [])
                for seg in chunk_segments:
                    adjusted_seg = {
                        **seg,
                        'id': seg['id'] + id_offset,
                        'start': seg['start'] + offset,
                        'end': seg['end'] + offset,
                    }
                    if adjusted_seg.get('words'):
                        adjusted_seg['words'] = [
                            {**w, 'start': w['start'] + offset, 'end': w['end'] + offset}
                            for w in adjusted_seg['words']
                        ]
                    all_segments.append(adjusted_seg)

                id_offset += len(chunk_segments)
                text_parts.append(chunk_result.get('text', ''))

        result: Dict[str, Any] = {'text': ''.join(text_parts), 'segments': all_segments}
        if language:
            result['language'] = language
        return result

    def __transcribe_single(self, audio_path: Path) -> Dict[str, Any]:
        """Run the model once over ``audio_path`` and build the result dict."""
        if not self.__model:
            raise RuntimeError('Whisper model not loaded.')

        language_code = WhisperUtils.get_language_code(self.__language)
        segments, info = self.__model.transcribe(
            str(audio_path),
            language=language_code,
            beam_size=self.__beam_size,
            word_timestamps=True,
            condition_on_previous_text=False,
            temperature=self.__temperature,
            vad_filter=True,
        )
        return WhisperUtils.build_transcription_result(segments, language=info.language)

    def __load_model(self) -> WhisperModel:
        """Instantiate the faster-whisper model with float16 compute."""
        compute_type = 'float16'
        console.print(f'[cyan]Loading Whisper: {self.__model_name} on {self.__device} ({compute_type})[/cyan]')

        model = WhisperModel(self.__model_name, device=self.__device, compute_type=compute_type)
        console.print('[green]Whisper model loaded[/green]')
        return model

    @staticmethod
    def __get_duration(path: Path) -> float:
        """Return the container duration in seconds as reported by ffprobe."""
        result = subprocess.run(
            ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', str(path)],
            capture_output=True, text=True, check=True,
        )
        return float(json.loads(result.stdout)['format']['duration'])

    @staticmethod
    def __extract_audio_chunk(video_path: Path, output_path: Path, start: float, end: float) -> None:
        """Write the ``[start, end]`` range of the input as 16 kHz mono float WAV."""
        subprocess.run(
            [
                'ffmpeg', '-y',
                '-ss', str(start), '-to', str(end),
                '-i', str(video_path),
                '-vn', '-acodec', 'pcm_f32le', '-ar', '16000', '-ac', '1',
                str(output_path),
            ],
            capture_output=True, check=True,
        )
class JsonGenerator(BaseTranscriptionGenerator):
    """Convert unified transcription dictionaries into one of three JSON layouts.

    ``format_type`` selects the layout: ``'full'`` (flat text plus word list),
    ``'segmented'`` (text and words per segment) or ``'simple'`` (speaker and
    text per segment).
    """

    def __init__(self, format_type: Literal['full', 'simple', 'segmented'], *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.__format_type = format_type

    def convert(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Convert ``data`` using the layout chosen at construction time.

        Raises:
            ValueError: if the configured format type is unknown.
        """
        converters = {
            'full': self.convert_to_full_format,
            'simple': self.convert_to_simple_format,
            'segmented': self.convert_to_segmented_format,
        }
        if self.__format_type not in converters:
            raise ValueError(f'Unknown format type: {self.__format_type}')
        return converters[self.__format_type](data)

    @staticmethod
    def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the flat layout: language code, joined text and all words."""
        segments = data.get('segments', [])
        full_text = ' '.join((seg.get('text') or '').strip() for seg in segments)

        # Engines disagree on the key: the Whisper result carries 'language',
        # the ElevenLabs unified result carries 'language_code'. Accept both
        # and fall back to Polish when neither is present (or is None).
        raw_language = data.get('language') or data.get('language_code') or 'pol'
        language = str(raw_language).lower()
        # Normalise every spelling of Polish, including the two-letter code,
        # to the three-letter code used in the emitted files.
        language_code = 'pol' if language in {'polish', 'pol', 'pl'} else language

        words = []
        for seg in segments:
            words.extend(TranscriptionUtils.convert_words_list(seg.get('words', [])))

        return {
            'language_code': language_code,
            'language_probability': 1.0,
            'text': full_text,
            'words': words,
        }

    @staticmethod
    def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the per-segment layout with stripped text and converted words."""
        return {
            'segments': [
                {
                    'text': (seg.get('text') or '').strip(),
                    'words': TranscriptionUtils.convert_words_list(seg.get('words', [])),
                }
                for seg in data.get('segments', [])
            ],
        }

    @staticmethod
    def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the per-segment layout with only speaker and text.

        The speaker is taken from the first word of the segment; segments
        without words are attributed to ``'speaker_unknown'``.
        """
        result = []
        for seg in data.get('segments', []):
            words = seg.get('words', [])
            speaker = words[0].get('speaker_id', 'speaker_unknown') if words else 'speaker_unknown'
            result.append({
                'speaker': speaker,
                'text': (seg.get('text') or '').strip(),
            })
        return {'segments': result}

    def _get_output_filename(self, json_file: Path) -> str:
        """Return the output name; non-'full' layouts get their format suffix."""
        if self.__format_type == 'full':
            return json_file.name
        suffix = FILE_SUFFIXES[self.__format_type]
        return json_file.name.replace(FILE_EXTENSIONS['json'], f"{suffix}{FILE_EXTENSIONS['json']}")

    def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None:
        """Convert ``data`` and write it as UTF-8 JSON into the output directory."""
        converted_data = self.convert(data)
        output_path = self._output_dir / self._get_output_filename(json_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(converted_data, f, indent=2, ensure_ascii=False)
preprocessor.services.transcription.generators.srt_generator import SrtGenerator +from preprocessor.services.transcription.generators.txt_generator import TxtGenerator + + +class MultiFormatGenerator: + def __init__( + self, + jsons_dir: Path, + episodes_info_json: Path, + _output_base_path: Path, + logger: ErrorHandlingLogger, + series_name: str = '', + ) -> None: + self.__jsons_dir = jsons_dir + self.__logger = logger + self.__series_name = series_name.lower() if series_name else 'unknown' + self.__episode_manager = EpisodeManager(episodes_info_json, self.__series_name, logger) + + def __call__(self) -> None: + for transcription_file in self.__jsons_dir.rglob('*.json'): + self.__process_transcription_file(transcription_file) + + def __process_transcription_file(self, file_path: Path) -> None: + try: + transcription = self.__load_json(file_path) + if not transcription: + return + + episode_info = self.__episode_manager.parse_filename(file_path) + if not episode_info: + self.__logger.error(f'Cannot extract episode info from {file_path.name}') + return + + if self.__is_already_processed(episode_info): + return + + self.__generate_all_formats(transcription, episode_info) + except Exception as e: + self.__logger.error(f'Error processing {file_path.name}: {e}') + + def __generate_all_formats(self, transcription: Dict[str, Any], episode_info: Any) -> None: + base_dir = self.__get_raw_output_dir(episode_info) + base_dir.mkdir(parents=True, exist_ok=True) + + metadata = EpisodeManager.get_metadata(episode_info) + full_data = {'episode_info': metadata, **transcription} + + self.__save_json(full_data, episode_info, base_dir, 'full') + self.__save_json(transcription, episode_info, base_dir, 'segmented') + self.__save_json(transcription, episode_info, base_dir, 'simple') + self.__save_srt(transcription, episode_info, base_dir) + self.__save_txt(transcription, episode_info, base_dir) + + def __save_json( + self, data: Dict[str, Any], ep_info: Any, out_dir: Path, + fmt: 
Literal['full', 'simple', 'segmented'], + ) -> None: + gen = JsonGenerator(fmt, Path(''), out_dir, self.__logger) + filename = self.__episode_manager.path_manager.build_filename( + ep_info, extension='json', suffix=fmt if fmt != 'full' else None, + ) + + converted = gen.convert(data) + if fmt != 'full': + converted['episode_info'] = {'season': ep_info.season, 'episode_number': ep_info.relative_episode} + else: + converted['episode_info'] = data.get('episode_info', {}) + + with open(out_dir / filename, 'w', encoding='utf-8') as f: + json.dump(converted, f, indent=2, ensure_ascii=False) + + def __save_srt(self, data: Dict[str, Any], ep_info: Any, out_dir: Path) -> None: + gen = SrtGenerator(Path(''), out_dir, self.__logger) + filename = self.__episode_manager.path_manager.build_filename(ep_info, extension='srt') + (out_dir / filename).write_text(gen.convert_to_srt_format(data), encoding='utf-8') + + def __save_txt(self, data: Dict[str, Any], ep_info: Any, out_dir: Path) -> None: + gen = TxtGenerator(Path(''), out_dir, self.__logger) + filename = self.__episode_manager.path_manager.build_filename(ep_info, extension='txt') + (out_dir / filename).write_text(gen.convert_to_txt_format(data), encoding='utf-8') + + def __is_already_processed(self, ep_info: Any) -> bool: + filename = self.__episode_manager.path_manager.build_filename(ep_info, extension='json') + target = self.__get_raw_output_dir(ep_info) / filename + if target.exists(): + self.__logger.info(f'Skipping existing: {ep_info.episode_code()}') + return True + return False + + def __get_raw_output_dir(self, ep_info: Any) -> Path: + return ( + get_base_output_dir(self.__series_name) / + settings.output_subdirs.transcriptions / + ep_info.season_code() / + ep_info.episode_code() / 'raw' + ) + + def __load_json(self, path: Path) -> Optional[Dict[str, Any]]: + try: + with open(path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + self.__logger.error(f'Load error {path.name}: {e}') + return 
class SrtGenerator(BaseTranscriptionGenerator):
    """Render transcription segments as a SubRip (``.srt``) subtitle file."""

    @staticmethod
    def convert_to_srt_format(data: Dict[str, Any]) -> str:
        """Return the SRT document for ``data['segments']``.

        Segments whose text is empty after stripping are skipped, and cue
        numbers are assigned only to the segments that are emitted.
        """
        srt_lines = []
        index = 1

        for seg in data.get('segments', []):
            text = (seg.get('text') or '').strip()
            if not text:
                continue

            # A missing or None timestamp is rendered as 00:00:00,000.
            start_time = SrtGenerator.__format_timestamp(seg.get('start'))
            end_time = SrtGenerator.__format_timestamp(seg.get('end'))

            srt_lines.extend([str(index), f'{start_time} --> {end_time}', text, ''])
            index += 1

        return '\n'.join(srt_lines)

    def _get_output_filename(self, json_file: Path) -> str:
        """Return the input file name with the JSON extension swapped for SRT."""
        return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['srt'])

    def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None:
        """Convert ``data`` and write it as UTF-8 text into the output directory."""
        content = self.convert_to_srt_format(data)
        output_path = self._output_dir / self._get_output_filename(json_file)
        output_path.write_text(content, encoding='utf-8')

    @staticmethod
    def __format_timestamp(seconds: float) -> str:
        """Format ``seconds`` as ``HH:MM:SS,mmm``.

        The value is converted to whole milliseconds with ``round`` first:
        truncating ``seconds % 1 * 1000`` loses a millisecond whenever the
        float is stored slightly below the intended value (1.001 -> ``,000``).
        ``None`` and negative inputs are clamped to zero.
        """
        total_ms = max(0, round((seconds or 0.0) * 1000))
        hours, remainder = divmod(total_ms, 3600 * 1000)
        minutes, remainder = divmod(remainder, 60 * 1000)
        secs, millis = divmod(remainder, 1000)
        return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}'
class TxtGenerator(BaseTranscriptionGenerator):
    """Render transcription segments as a single line of plain text."""

    @staticmethod
    def convert_to_txt_format(data: Dict[str, Any]) -> str:
        """Return all non-empty segment texts joined by single spaces.

        Text is stripped *before* the emptiness check, so whitespace-only
        segments no longer leave empty items (and doubled spaces) in the output.
        """
        stripped = ((seg.get('text') or '').strip() for seg in data.get('segments', []))
        return ' '.join(text for text in stripped if text)

    def _get_output_filename(self, json_file: Path) -> str:
        """Return the input file name with the JSON extension swapped for TXT."""
        return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['txt'])

    def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None:
        """Convert ``data`` and write it as UTF-8 text into the output directory."""
        content = self.convert_to_txt_format(data)
        output_path = self._output_dir / self._get_output_filename(json_file)
        output_path.write_text(content, encoding='utf-8')
class AudioNormalizer:
    """Extract the best audio stream of each video and store it normalized as WAV.

    Calling the instance processes either the explicit ``video_files`` list or,
    when that is ``None``, every supported video found under ``input_videos``.
    Videos whose output WAV already exists are skipped.
    """

    SUPPORTED_VIDEO_EXTENSIONS = BaseProcessor.SUPPORTED_VIDEO_EXTENSIONS

    def __init__(
        self,
        input_videos: Path,
        output_dir: Path,
        logger: ErrorHandlingLogger,
        video_files: Optional[List[Path]] = None,
    ) -> None:
        self.__input_videos = input_videos
        self.__output_dir = output_dir
        self.__logger = logger
        self.__video_files = video_files
        self.__output_dir.mkdir(parents=True, exist_ok=True)

    def __call__(self) -> None:
        """Normalize every target video; per-video failures are logged, not raised."""
        targets = self.__video_files if self.__video_files is not None else self.__discover_videos()
        for video in targets:
            self.__process_video(video)

    def __discover_videos(self) -> List[Path]:
        """Return all paths under the input directory with a supported extension."""
        return [
            v for v in self.__input_videos.rglob('*')
            if v.suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS
        ]

    def __process_video(self, video: Path) -> None:
        """Normalize one video unless its output exists or it has no audio."""
        try:
            output_path = self.__output_dir / video.with_suffix('.wav').name
            if output_path.exists():
                return

            audio_idx = self.__get_best_audio_stream(video)
            if audio_idx is None:
                return

            self.__execute_normalization_pipeline(video, audio_idx, output_path)
        except Exception as e:
            self.__logger.error(f'Error processing video {video}: {e}')

    def __get_best_audio_stream(self, video: Path) -> Optional[int]:
        """Return the index of the audio stream with the highest bit rate.

        Streams without a ``bit_rate`` value are treated as bit rate 0.
        Returns ``None`` (after logging) when the file has no audio stream.
        """
        streams = FFmpegWrapper.get_audio_streams(video)

        if not streams:
            self.__logger.error(f'No audio streams found in file: {video}')
            return None

        best_stream = max(streams, key=lambda s: int(s.get('bit_rate', 0) or 0))
        return best_stream['index']

    def __execute_normalization_pipeline(self, video: Path, audio_idx: int, output: Path) -> None:
        """Extract and normalize the audio, publishing ``output`` only on success.

        Both intermediate files live next to ``output`` under temporary names.
        The final path is created by a single rename after normalization has
        finished, so a failure can never leave an un-normalized file there for
        the ``exists()`` shortcut in ``__process_video`` to skip on later runs.
        """
        raw_output = output.with_name(output.stem + '_raw.wav')
        tmp_output = output.with_name(output.stem + '_temp.wav')
        try:
            FFmpegWrapper.extract_audio(
                video, raw_output, audio_stream_index=audio_idx,
                codec='pcm_s16le', sample_rate=48000, channels=1,
            )
            FFmpegWrapper.normalize_audio(raw_output, tmp_output)
            tmp_output.replace(output)
        finally:
            # Remove whatever intermediates are left, on success and on failure.
            for leftover in (raw_output, tmp_output):
                leftover.unlink(missing_ok=True)

        self.__logger.info(f'Normalization complete: {output.name}')
self.__logger.error(f'Error processing {transcription_file.name}: {e}') + + def __write_structured_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: + new_name = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') + target_path = self.__output_path / episode_info.season_code() / new_name + target_path.parent.mkdir(parents=True, exist_ok=True) + + payload = { + 'episode_info': EpisodeManager.get_metadata(episode_info), + 'segments': transcription.get('segments', []), + } + + with target_path.open('w', encoding='utf-8') as f: + json.dump(payload, f, ensure_ascii=False, indent=4) + + return target_path, new_name + + def __sync_original_filename(self, original_path: Path, new_name: str) -> None: + target_path = original_path.parent / new_name + if original_path.name == new_name: + return + + if target_path.exists(): + self.__logger.error(f'Rename conflict: {target_path} already exists!') + else: + original_path.rename(target_path) + + @staticmethod + def __load_json(path: Path) -> Dict[str, Any]: + with path.open('r', encoding='utf-8') as f: + return json.load(f) diff --git a/preprocessor/services/transcription/processors/normalized_audio_processor.py b/preprocessor/services/transcription/processors/normalized_audio_processor.py new file mode 100644 index 000000000..36b0e6094 --- /dev/null +++ b/preprocessor/services/transcription/processors/normalized_audio_processor.py @@ -0,0 +1,95 @@ +import gc +import json +from pathlib import Path +from typing import ( + List, + Optional, + Tuple, +) + +from faster_whisper import WhisperModel +import torch + +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.transcription.utils import WhisperUtils + + +class NormalizedAudioProcessor: + SUPPORTED_AUDIO_EXTENSIONS: Tuple[str, str] = ('.wav', '.mp3') + + def __init__( + self, + input_audios: Path, + output_dir: Path, + logger: ErrorHandlingLogger, + language: str, + model: str, + 
device: str, + audio_files: Optional[List[Path]] = None, + ): + self.__input_audios = input_audios + self.__output_dir = output_dir + self.__logger = logger + self.__audio_files = audio_files + self.__language = language + + self.__output_dir.mkdir(parents=True, exist_ok=True) + + if device != 'cuda': + raise ValueError(f'Whisper acceleration requires CUDA device, got: {device}') + + self.__whisper_model = WhisperModel( + model, + device=device, + compute_type='float16', + ) + self.__logger.info(f'Whisper {model} initialized on {device}') + + def cleanup(self) -> None: + self.__logger.info('Purging GPU memory and unloading Whisper model...') + del self.__whisper_model + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def __call__(self) -> None: + targets = self.__audio_files if self.__audio_files is not None else self.__discover_audios() + for audio in targets: + self.__transcribe_file(audio) + + def __discover_audios(self) -> List[Path]: + return [ + a for a in self.__input_audios.rglob('*') + if a.suffix.lower() in self.SUPPORTED_AUDIO_EXTENSIONS + ] + + def __transcribe_file(self, audio_path: Path) -> None: + try: + output_file = self.__output_dir / audio_path.with_suffix('.json').name + if output_file.exists(): + return + + segments, info = self.__whisper_model.transcribe( + str(audio_path), + language=WhisperUtils.get_language_code(self.__language), + beam_size=10, + word_timestamps=True, + condition_on_previous_text=False, + temperature=0.0, + ) + + result = WhisperUtils.build_transcription_result(segments, language=info.language) + self.__save_results(result, output_file) + self.__logger.info(f'Transcription saved: {output_file.name}') + + except Exception as e: + self.__logger.error(f'Whisper error on {audio_path.name}: {e}') + + @staticmethod + def __save_results(result: dict, path: Path) -> None: + for segment in result.get('segments', []): + segment['temperature'] = 0.0 + + with open(path, 'w', encoding='utf-8') as f: + 
def is_sound_event(word: Dict[str, Any]) -> bool:
    """Tell whether a word entry represents a non-speech sound.

    A word qualifies when its type equals ``WordTypeValues.AUDIO_EVENT`` or when
    its stripped text starts with ``(`` / ``[`` and ends with ``)`` / ``]``.
    The text is read from ``WordKeys.TEXT``, falling back to ``WordKeys.WORD``.
    """
    if word.get(WordKeys.TYPE) == WordTypeValues.AUDIO_EVENT:
        return True

    fallback_text = word.get(WordKeys.WORD, '')
    stripped = word.get(WordKeys.TEXT, fallback_text).strip()
    return re.match(r'^[\(\[].*[\)\]]$', stripped) is not None


def classify_segment(segment: Dict[str, Any]) -> str:
    """Label a segment as ``'dialogue'``, ``'sound_event'`` or ``'mixed'``.

    Segments without words count as dialogue. Words whose type is
    ``WordTypeValues.SPACING`` or the empty string count as neither sound nor
    dialogue.
    """
    entries = segment.get(WordKeys.WORDS, [])
    if not entries:
        return 'dialogue'

    ignored_types = [WordTypeValues.SPACING, '']
    sound_seen = False
    dialogue_seen = False

    # Every word is inspected (no early exit) so both flags reflect the whole segment.
    for entry in entries:
        if is_sound_event(entry):
            sound_seen = True
            continue
        if entry.get(WordKeys.TYPE) not in ignored_types:
            dialogue_seen = True

    if not sound_seen:
        return 'dialogue'
    return 'mixed' if dialogue_seen else 'sound_event'
class WhisperUtils:
    """Helpers for turning faster-whisper style output into plain dictionaries."""

    # Human-readable language names mapped to the codes handed to Whisper.
    LANGUAGE_MAP: Dict[str, str] = {
        'polish': 'pl',
        'english': 'en',
        'german': 'de',
        'french': 'fr',
        'spanish': 'es',
    }

    @staticmethod
    def get_language_code(language: str) -> str:
        """Map a language name to its code; unknown names come back lower-cased."""
        normalized = language.lower()
        return WhisperUtils.LANGUAGE_MAP.get(normalized, normalized)

    @staticmethod
    def build_transcription_result(segments: Any, language: Optional[str] = None) -> Dict[str, Any]:
        """Collect segments into ``{'text', 'segments'[, 'language']}``.

        ``segments`` is iterated exactly once, so a generator is fine. ``'text'``
        is the concatenation of every segment's ``text`` in order, and
        ``'language'`` is only present when a truthy ``language`` is given.
        """
        result: Dict[str, Any] = {'text': '', 'segments': []}
        if language:
            result['language'] = language

        collected = result['segments']
        text_parts: List[str] = []
        for segment in segments:
            collected.append(WhisperUtils.__process_segment(segment))
            text_parts.append(segment.text)

        result['text'] = ''.join(text_parts)
        return result

    @staticmethod
    def __process_segment(segment: Any) -> Dict[str, Any]:
        """Flatten one segment object (and its optional words) into a dictionary."""
        raw_words = segment.words if hasattr(segment, 'words') and segment.words else []
        words: List[Dict[str, Any]] = [
            {
                'word': item.word,
                'start': item.start,
                'end': item.end,
                'probability': item.probability,
            }
            for item in raw_words
        ]

        flattened: Dict[str, Any] = {'id': segment.id}
        # 'seek' and 'tokens' are emitted with fixed placeholder values.
        flattened['seek'] = 0
        flattened['start'] = segment.start
        flattened['end'] = segment.end
        flattened['text'] = segment.text
        flattened['tokens'] = []
        flattened['avg_logprob'] = segment.avg_logprob
        flattened['compression_ratio'] = segment.compression_ratio
        flattened['no_speech_prob'] = segment.no_speech_prob
        flattened['words'] = words
        return flattened
class SimpleProgress:
    """Line-based progress reporter that prints one status line per update.

    Usable as a context manager; leaving the context forgets all tasks.
    """

    def __init__(self) -> None:
        self.__tasks: Dict[int, Dict[str, Any]] = {}
        self.__task_counter = 0
        self.__console = console

    def __enter__(self) -> 'SimpleProgress':
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.__tasks.clear()

    def add_task(self, description: str, total: int) -> int:
        """Register a task, print its initial status line and return its id."""
        new_id = self.__task_counter
        self.__task_counter = new_id + 1

        state: Dict[str, Any] = {
            'description': description,
            'total': total,
            'completed': 0,
            'start_time': time.time(),
            # 0.0 makes the first advance() always eligible for printing.
            'last_print_time': 0.0,
        }
        self.__tasks[new_id] = state

        self.__render_progress(new_id)
        return new_id

    def advance(self, task_id: int, step: int = 1) -> None:
        """Add ``step`` to the task's counter and print if a line is due.

        Unknown task ids are ignored.
        """
        state = self.__tasks.get(task_id)
        if not state:
            return

        state['completed'] += step
        now = time.time()

        if not self.__should_render(state, now):
            return
        self.__render_progress(task_id)
        state['last_print_time'] = now

    @staticmethod
    def __should_render(task: Dict[str, Any], current_time: float) -> bool:
        """Print when the task is finished or at least one second has passed."""
        if task['completed'] >= task['total']:
            return True
        return (current_time - task['last_print_time']) >= 1.0

    def __render_progress(self, task_id: int) -> None:
        """Print description, bar, percentage, counters and ETA for one task."""
        task = self.__tasks[task_id]
        completed = task['completed']
        total = task['total']

        if total > 0:
            percent = completed / total * 100
        else:
            percent = 0
        progress_bar = self.__build_visual_bar(completed, total)
        eta = self.__compute_task_eta(task)

        line = (
            f"[bold blue]{task['description']}[/bold blue] "
            f"[cyan]{progress_bar}[/cyan] "
            f"[green]{percent:3.0f}%[/green] "
            f"[yellow]{completed}/{total}[/yellow] "
            f"[dim]ETA: {eta}[/dim]"
        )
        self.__console.print(line, highlight=False)

    @staticmethod
    def __compute_task_eta(task: Dict[str, Any]) -> str:
        """Extrapolate the remaining time from the average time per completed unit."""
        done = task['completed']
        goal = task['total']

        if done >= goal:
            return '0:00:00'
        if done <= 0:
            # Nothing completed yet, so there is no rate to extrapolate from.
            return '-:--:--'

        elapsed = time.time() - task['start_time']
        seconds_left = (elapsed / done) * (goal - done)
        return TimeFormatter.format_hms(seconds_left)

    @staticmethod
    def __build_visual_bar(completed: int, total: int, width: int = 30) -> str:
        """Render a fixed-width ``====>----`` bar; all dashes when ``total`` is not positive."""
        if total <= 0:
            return '-' * width

        filled = int(width * completed / total)
        if filled >= width:
            return '=' * width
        return '=' * filled + '>' + '-' * (width - filled - 1)
class OperationTracker:
    """Prints milestone progress lines (count, percentage, ETA) for a long operation."""

    def __init__(self, operation_name: str, total: int, start_time: float) -> None:
        self.__operation_name = operation_name
        self.__total = total
        self.__completed = 0
        self.__start_time = start_time
        self.__last_report_count = 0

    def update(self, completed: int, interval: int = 10) -> None:
        """Record the absolute ``completed`` count and report when it hits a milestone."""
        self.__completed = completed

        if not self.__should_report_progress(completed, interval):
            return
        self.__report_progress()
        self.__last_report_count = completed

    def __should_report_progress(self, completed: int, interval: int) -> bool:
        """Milestones are the first item, every ``interval``-th item and the last item.

        A count that was already reported is never reported twice.
        """
        if completed == self.__last_report_count:
            return False

        on_interval = completed % interval == 0
        return on_interval or (completed == self.__total) or (completed == 1)

    def __report_progress(self) -> None:
        """Print one dimmed status line to the shared console."""
        if self.__total > 0:
            percent = self.__completed / self.__total * 100
        else:
            percent = 0
        eta = self.__calculate_eta()

        console.print(
            f'  [dim]{self.__operation_name}: {self.__completed}/{self.__total} '
            f'({percent:.0f}%) ETA: {eta}[/dim]',
        )

    def __calculate_eta(self) -> str:
        """Estimate the remaining time from the average throughput so far."""
        elapsed = time.time() - self.__start_time

        if self.__completed >= self.__total:
            return '0:00:00'

        if self.__completed <= 0:
            return '-:--:--'

        # Guard both divisions: no elapsed time means no measurable rate.
        rate = self.__completed / elapsed if elapsed > 0 else 0
        remaining = self.__total - self.__completed
        eta_seconds = remaining / rate if rate > 0 else 0

        if eta_seconds > 0:
            return TimeFormatter.format_hms(eta_seconds)
        return '0:00:00'
@dataclass
class EpisodeStats(ValidationStatusMixin):
    """Validation outcome and collected metrics for a single episode.

    ``errors`` and ``warnings`` feed the ``status`` property inherited from
    ``ValidationStatusMixin``; the metric fields stay ``None`` until a
    validator fills them in.
    """

    episode_info: EpisodeInfo
    series_name: str
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    # Metrics
    transcription_chars: Optional[int] = None
    transcription_duration: Optional[float] = None
    transcription_words: Optional[int] = None
    exported_frames_count: Optional[int] = None
    exported_frames_total_size_mb: Optional[float] = None
    video_duration: Optional[float] = None
    video_size_mb: Optional[float] = None
    scenes_count: Optional[int] = None

    def collect_stats(self) -> None:
        """Run every validator against this instance, in a fixed order.

        Each validator receives ``self``; presumably it appends to
        ``errors``/``warnings`` and fills the metric fields (the validators
        are defined in another module).
        """
        # pylint: disable=import-outside-toplevel  # Necessary to avoid circular import (validators import EpisodeStats)
        from preprocessor.services.validation.validators import (
            CharacterValidator,
            ElasticValidator,
            FrameValidator,
            ImageHashValidator,
            ObjectValidator,
            SceneValidator,
            TranscriptionValidator,
            VideoValidator,
        )

        validators = [
            TranscriptionValidator(), FrameValidator(), VideoValidator(),
            SceneValidator(), ImageHashValidator(), CharacterValidator(),
            ObjectValidator(), ElasticValidator(),
        ]

        for validator in validators:
            validator.validate(self)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize status, messages and all metrics for the validation report."""
        return {
            'status': self.status,
            'errors': self.errors,
            'warnings': self.warnings,
            'stats': self.__get_metric_map(),
        }

    def __get_metric_map(self) -> Dict[str, Any]:
        """Return every metric field keyed by its attribute name.

        All eight declared metrics are exported, including
        ``transcription_words`` and ``exported_frames_total_size_mb``, so the
        per-episode report carries every declared metric field.
        """
        return {
            'transcription_chars': self.transcription_chars,
            'transcription_duration': self.transcription_duration,
            'transcription_words': self.transcription_words,
            'exported_frames_count': self.exported_frames_count,
            'exported_frames_total_size_mb': self.exported_frames_total_size_mb,
            'video_duration': self.video_duration,
            'video_size_mb': self.video_size_mb,
            'scenes_count': self.scenes_count,
        }
Image.open(path) as img: + return ValidationResult( + is_valid=True, + metadata={ + ValidationMetadataKeys.WIDTH: img.size[0], + ValidationMetadataKeys.HEIGHT: img.size[1], + ValidationMetadataKeys.FORMAT: img.format, + ValidationMetadataKeys.SIZE_MB: round(path.stat().st_size / (1024 * 1024), 2), + }, + ) + except Exception as e: + return ValidationResult(False, f'Invalid image: {e}') + + @staticmethod + def validate_json_file(path: Path) -> ValidationResult: + err = FileValidator.__verify_existence(path) + if err: + return err + try: + with open(path, 'r', encoding='utf-8') as f: + json.load(f) + return ValidationResult(True, metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size}) + except Exception as e: + return ValidationResult(False, f'JSON error: {e}') + + @staticmethod + def validate_jsonl_file(path: Path) -> ValidationResult: + if err := FileValidator.__verify_existence(path): + return err + try: + line_count = 0 + with open(path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + json.loads(line) + line_count += 1 + return ValidationResult( + True, + metadata={ + ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size, + 'line_count': line_count, + }, + ) + except Exception as e: + return ValidationResult(False, f'JSONL error: {e}') + + @staticmethod + def validate_video_file(path: Path) -> ValidationResult: + err = FileValidator.__verify_existence(path) + if err: + return err + try: + probe = FileValidator.__run_ffprobe(path) + stream = probe.get(FfprobeKeys.STREAMS, [{}])[0] + fmt = probe.get(FfprobeKeys.FORMAT, {}) + duration = float(stream.get(FfprobeStreamKeys.DURATION) or fmt.get(FfprobeFormatKeys.DURATION, 0)) + + return ValidationResult( + is_valid=True, + metadata={ + ValidationMetadataKeys.CODEC: stream.get(FfprobeStreamKeys.CODEC_NAME), + ValidationMetadataKeys.WIDTH: stream.get(FfprobeStreamKeys.WIDTH), + ValidationMetadataKeys.HEIGHT: stream.get(FfprobeStreamKeys.HEIGHT), + ValidationMetadataKeys.DURATION: 
class GlobalValidator:
    """Validate series-wide artefacts that are not tied to a single episode.

    Checks the ``<series>_episodes.json`` / ``<series>_characters.json`` files,
    the ``characters/`` image folders and the ``processing_metadata/`` JSON
    files below ``base_output_dir``. Findings accumulate in one
    ``GlobalValidationResult``.
    """

    # Matched case-insensitively against each file's suffix.
    SUPPORTED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.webp')

    def __init__(self, series_name: str, base_output_dir: Path) -> None:
        self.__series_name = series_name
        self.__base_output_dir = base_output_dir
        self.__result = GlobalValidationResult()

    def validate(self) -> GlobalValidationResult:
        """Run all checks and return the accumulated result."""
        self.__check_main_json_files()
        self.__check_characters_assets()
        self.__check_processing_metadata_store()
        return self.__result

    def __check_main_json_files(self) -> None:
        """Validate the two top-level JSON files named after the series."""
        files = [
            (f'{self.__series_name}_episodes.json', 'episodes_json_valid'),
            (f'{self.__series_name}_characters.json', 'characters_json_valid'),
        ]
        for filename, stats_key in files:
            self.__validate_json_at_path(self.__base_output_dir / filename, stats_key)

    def __check_characters_assets(self) -> None:
        """Count character folders and validate the images inside them."""
        char_dir = self.__base_output_dir / 'characters'
        if not char_dir.exists():
            self.__result.warnings.append('Missing characters/ directory')
            return

        folders = [d for d in char_dir.iterdir() if d.is_dir()]
        self.__result.stats['character_folders_count'] = len(folders)

        if not folders:
            self.__result.warnings.append('No character folders in characters/')
            return

        self.__process_all_character_folders(folders)

    def __process_all_character_folders(self, folders: List[Path]) -> None:
        """Validate each folder's images and record totals plus empty folders."""
        total_images = 0
        invalid_images = 0
        empty_characters: List[str] = []

        for folder in folders:
            images = self.__get_image_files(folder)
            if not images:
                empty_characters.append(folder.name)
                continue

            total_images += len(images)
            invalid_images += self.__validate_image_batch(images, folder.name)

        self.__result.stats['character_images_count'] = total_images
        self.__result.stats['invalid_character_images'] = invalid_images

        if empty_characters:
            self.__result.warnings.append(f'{len(empty_characters)} characters without images')

    def __validate_image_batch(self, images: List[Path], char_name: str) -> int:
        """Record an error for each unreadable image and return how many failed."""
        invalid_count = 0
        for img in images:
            v_res = FileValidator.validate_image_file(img)
            if not v_res.is_valid:
                invalid_count += 1
                self.__result.errors.append(f'Invalid image {char_name}/{img.name}: {v_res.error_message}')
        return invalid_count

    def __check_processing_metadata_store(self) -> None:
        """Validate every ``*.json`` directly inside ``processing_metadata/``."""
        meta_dir = self.__base_output_dir / 'processing_metadata'
        if not meta_dir.exists():
            self.__result.warnings.append('Missing processing_metadata/ directory')
            return

        json_files = list(meta_dir.glob('*.json'))
        self.__result.stats['processing_metadata_files'] = len(json_files)

        for f in json_files:
            v_res = FileValidator.validate_json_file(f)
            if not v_res.is_valid:
                self.__result.errors.append(f'Invalid metadata {f.name}: {v_res.error_message}')

    def __validate_json_at_path(self, path: Path, stats_key: str) -> None:
        """Warn when ``path`` is missing, error when it is invalid, else set ``stats_key``."""
        if not path.exists():
            self.__result.warnings.append(f'Missing {path.name}')
            return
        v_res = FileValidator.validate_json_file(path)
        if not v_res.is_valid:
            self.__result.errors.append(f'Invalid {path.name}: {v_res.error_message}')
        else:
            self.__result.stats[stats_key] = True

    @staticmethod
    def __get_image_files(folder: Path) -> List[Path]:
        """Return the image files directly inside ``folder``, sorted by path.

        The suffix is compared case-insensitively: glob patterns such as
        ``*.jpg`` are case-sensitive on POSIX, so ``photo.JPG`` would be missed
        and its folder wrongly reported as having no images.
        """
        return sorted(
            entry
            for entry in folder.iterdir()
            if entry.is_file() and entry.suffix.lower() in GlobalValidator.SUPPORTED_IMAGE_EXTENSIONS
        )
@dataclass
class MetricComparison:
    """Season-wide summary (min / max / average / spread) of one metric."""

    avg_value: Optional[float]
    difference_percent: Optional[float]
    max_value: Optional[float]
    metric_name: str
    min_value: Optional[float]


@dataclass
class Anomaly:
    """An episode whose metric deviates from the season average beyond the threshold."""

    avg: float
    deviation_percent: float
    episode: str
    metric: str
    severity: str
    value: float


@dataclass
class SeasonComparison:
    """Compares per-episode metrics within a season and flags outliers.

    ``anomaly_threshold`` is a percentage: deviations above it become
    ``WARNING`` anomalies, deviations above twice the threshold become ``ERROR``.
    """

    anomaly_threshold: float
    season: str
    anomalies: List[Anomaly] = field(default_factory=list)
    metrics: Dict[str, MetricComparison] = field(default_factory=dict)

    def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]) -> None:
        """Summarize every tracked metric and collect anomalies across the episodes."""
        tracked = (
            'transcription_duration',
            'transcription_chars',
            'transcription_words',
            'exported_frames_count',
            'exported_frames_total_size_mb',
            'video_size_mb',
            'video_duration',
            'scenes_count',
        )
        for metric in tracked:
            self.__analyze_metric_across_episodes(metric, episodes_stats)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the metric summaries and anomalies into plain dictionaries."""
        metric_rows: Dict[str, Any] = {}
        for name, summary in self.metrics.items():
            metric_rows[name] = {
                'min': summary.min_value,
                'max': summary.max_value,
                'avg': summary.avg_value,
                'difference_percent': summary.difference_percent,
            }

        anomaly_rows: List[Dict[str, Any]] = []
        for item in self.anomalies:
            anomaly_rows.append({
                'episode': item.episode,
                'metric': item.metric,
                'value': item.value,
                'avg': item.avg,
                'deviation_percent': item.deviation_percent,
                'severity': item.severity,
            })

        return {'metrics': metric_rows, 'anomalies': anomaly_rows}

    def __analyze_metric_across_episodes(self, key: str, stats_dict: Dict[str, EpisodeStats]) -> None:
        """Gather the non-``None`` readings of one metric and analyse them."""
        readings: Dict[str, Any] = {}
        for ep_id, stats in stats_dict.items():
            reading = getattr(stats, key, None)
            if reading is not None:
                readings[ep_id] = reading

        if not readings:
            # No episode reported this metric; leave it out of the summary.
            return

        values = list(readings.values())
        mean = sum(values) / len(values)

        self.__calculate_metric_summary(key, values, mean)
        self.__detect_anomalies_for_metric(key, readings, mean)

    def __calculate_metric_summary(self, key: str, values: List[float], avg_val: float) -> None:
        """Store min, max, average and the max-vs-min spread in percent."""
        lowest = min(values)
        highest = max(values)
        # The spread is relative to the minimum, so it is defined only for a positive minimum.
        if lowest > 0:
            spread = (highest - lowest) / lowest * 100
        else:
            spread = 0.0

        self.metrics[key] = MetricComparison(
            metric_name=key,
            min_value=round(lowest, 2),
            max_value=round(highest, 2),
            avg_value=round(avg_val, 2),
            difference_percent=round(spread, 2),
        )

    def __detect_anomalies_for_metric(self, key: str, ep_values: Dict[str, float], avg_val: float) -> None:
        """Append an anomaly for each episode deviating more than the threshold."""
        if avg_val <= 0:
            # A relative deviation is meaningless against a non-positive average.
            return

        for ep_id, val in ep_values.items():
            deviation = abs((val - avg_val) / avg_val) * 100
            if deviation > self.anomaly_threshold:
                record = self.__create_anomaly_record(ep_id, key, val, avg_val, deviation)
                self.anomalies.append(record)

    def __create_anomaly_record(self, ep_id: str, key: str, val: float, avg: float, dev: float) -> Anomaly:
        """Build the anomaly entry with rounded figures and its severity."""
        if dev > (self.anomaly_threshold * 2):
            severity = 'ERROR'
        else:
            severity = 'WARNING'
        return Anomaly(
            episode=ep_id,
            metric=key,
            value=round(val, 2),
            avg=round(avg, 2),
            deviation_percent=round(dev, 2),
            severity=severity,
        )
class Validator:
    """Validate the preprocessing outputs of one season and write JSON reports.

    Args:
        season: Season code; the part after the first character is parsed as
            the season number.
        series_name: Series identifier used for paths and report names.
        anomaly_threshold: Percentage deviation above which a metric is anomalous.
        base_output_dir: Root of the series output tree. When omitted, the
            series' default output directory is used.
        episodes_info_json: Optional episode metadata file for ``EpisodeManager``.
    """

    __KNOWN_STATUSES = ('PASS', 'WARNING', 'FAIL')

    def __init__(
        self,
        season: str,
        series_name: str = 'ranczo',
        anomaly_threshold: float = 20.0,
        base_output_dir: Optional[Path] = None,
        episodes_info_json: Optional[Path] = None,
    ) -> None:
        if base_output_dir is None:
            # The declared default used to crash on ``None / ...``; fall back to
            # the same base directory the individual validators resolve.
            from preprocessor.config.output_paths import get_base_output_dir  # pylint: disable=import-outside-toplevel
            base_output_dir = get_base_output_dir(series_name)

        self.__season = season
        self.__series_name = series_name
        self.__anomaly_threshold = anomaly_threshold
        self.__base_output_dir = base_output_dir
        self.__episode_manager = EpisodeManager(episodes_info_json, series_name)
        self.__validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports

    @property
    def validation_reports_dir(self) -> Path:
        """Directory the validation reports are written to."""
        return self.__validation_reports_dir

    def validate(self) -> int:
        """Validate the season and return a process-style exit code.

        Returns:
            0 when the season was validated or its directory is absent,
            1 when the directory exists but no episode could be collected.
        """
        # Use the configured sub-directory names (as TranscriptionValidator
        # does) instead of hard-coded literals.
        transcriptions_path = (
            self.__base_output_dir
            / settings.output_subdirs.transcriptions
            / settings.output_subdirs.transcription_subdirs.raw
            / self.__season
        )
        if not transcriptions_path.exists():
            console.print(f'[yellow]Season directory not found, skipping: {transcriptions_path}[/yellow]')
            return 0

        console.print(f'[bold cyan]Validating season {self.__season}...[/bold cyan]')

        episodes_stats = self.__collect_all_episodes_stats(transcriptions_path)
        if not episodes_stats:
            console.print(f'[red]No episodes found in {transcriptions_path}[/red]')
            return 1

        self.__generate_reports_and_compare(episodes_stats)
        return 0

    def __generate_reports_and_compare(self, episodes_stats: Dict[str, 'EpisodeStats']) -> None:
        """Write per-episode reports, the season summary, and print a recap."""
        self.__validation_reports_dir.mkdir(parents=True, exist_ok=True)

        self.__save_individual_episode_reports(episodes_stats)

        comparison = SeasonComparison(season=self.__season, anomaly_threshold=self.__anomaly_threshold)
        comparison.compare_episodes(episodes_stats)

        self.__generate_season_summary_report(episodes_stats, comparison)
        self.__print_execution_summary(episodes_stats, comparison)

        console.print(f'\n[green]Validation reports saved to: {self.__validation_reports_dir}[/green]')

    def __collect_all_episodes_stats(self, season_path: Path) -> Dict[str, 'EpisodeStats']:
        """Collect stats for every ``E*`` sub-directory, keyed by episode code."""
        episode_dirs = sorted(d for d in season_path.iterdir() if d.is_dir() and d.name.startswith('E'))
        results: Dict[str, 'EpisodeStats'] = {}

        for ep_dir in episode_dirs:
            stats = self.__process_single_episode_dir(ep_dir)
            if stats:
                results[stats.episode_info.episode_code()] = stats
        return results

    def __process_single_episode_dir(self, ep_dir: Path) -> Optional['EpisodeStats']:
        """Return collected stats for one episode directory, or None if skipped."""
        try:
            episode_num = int(ep_dir.name[1:])
            season_num = int(self.__season[1:])
            info = self.__episode_manager.get_episode_by_season_and_relative(season_num, episode_num)

            if not info:
                console.print(f'[yellow]Skipping {ep_dir.name}: could not parse info[/yellow]')
                return None

            stats = EpisodeStats(episode_info=info, series_name=self.__series_name)
            stats.collect_stats()
            return stats
        except ValueError as exc:
            # Still best-effort, but no longer silent: a malformed season or
            # episode name would otherwise surface only as "No episodes found".
            console.print(f'[yellow]Skipping {ep_dir.name}: {exc}[/yellow]')
            return None

    def __save_individual_episode_reports(self, episodes_stats: Dict[str, 'EpisodeStats']) -> None:
        """Write one JSON report per episode into the reports directory."""
        path_manager = PathService(self.__series_name)
        for stats in episodes_stats.values():
            report = self.__build_episode_report_payload(stats)
            filename = path_manager.build_filename(stats.episode_info, extension='json')
            FileOperations.atomic_write_json(self.__validation_reports_dir / filename, report)

    def __generate_season_summary_report(
        self, stats: Dict[str, 'EpisodeStats'], comparison: 'SeasonComparison',
    ) -> None:
        """Write the season-level summary report."""
        generator = ReportGenerator(season=self.__season, anomaly_threshold=self.__anomaly_threshold)
        report_path = self.__validation_reports_dir / f'{self.__series_name}_{self.__season}_season.json'
        generator.generate_report(stats, comparison, report_path)

    def __print_execution_summary(self, stats: Dict[str, 'EpisodeStats'], comparison: 'SeasonComparison') -> None:
        """Print totals, status counts, anomalies and per-episode issues."""
        console.print(f'\n[bold]Validation Summary for {self.__season}[/bold]')
        console.print(f'Total episodes: {len(stats)}')

        self.__print_status_counts(stats)
        self.__print_anomalies(comparison)
        self.__print_issues(stats)

    @staticmethod
    def __build_episode_report_payload(stats: 'EpisodeStats') -> Dict[str, Any]:
        """Return the JSON-serialisable report body for one episode."""
        return {
            'validation_timestamp': datetime.now().isoformat(),
            'episode_id': stats.episode_info.episode_code(),
            'episode_title': stats.episode_info.title,
            'status': stats.status,
            'errors': stats.errors,
            'warnings': stats.warnings,
            'stats': stats.to_dict()['stats'],
        }

    @staticmethod
    def __count_statuses(stats: Dict[str, 'EpisodeStats']) -> Dict[str, int]:
        """Tally episodes per status; unknown statuses get their own bucket."""
        counts: Dict[str, int] = {status: 0 for status in Validator.__KNOWN_STATUSES}
        for episode in stats.values():
            # .get() keeps an unexpected status from raising KeyError.
            counts[episode.status] = counts.get(episode.status, 0) + 1
        return counts

    @staticmethod
    def __print_status_counts(stats: Dict[str, 'EpisodeStats']) -> None:
        """Print how many episodes ended in each status."""
        counts = Validator.__count_statuses(stats)
        console.print(f' [green]PASS:[/green] {counts["PASS"]}')
        console.print(f' [yellow]WARNING:[/yellow] {counts["WARNING"]}')
        console.print(f' [red]FAIL:[/red] {counts["FAIL"]}')
        for status, amount in counts.items():
            if status not in Validator.__KNOWN_STATUSES:
                console.print(f' [magenta]{status}:[/magenta] {amount}')

    @staticmethod
    def __print_anomalies(comparison: 'SeasonComparison') -> None:
        """Print the anomaly count and the first five anomalies."""
        if not comparison.anomalies:
            return
        console.print(f'\n[bold yellow]Anomalies detected: {len(comparison.anomalies)}[/bold yellow]')
        for anomaly in comparison.anomalies[:5]:
            color = 'red' if anomaly.severity == 'ERROR' else 'yellow'
            msg = f'{anomaly.metric} = {anomaly.value} (avg: {anomaly.avg}, dev: {anomaly.deviation_percent:.1f}%)'
            console.print(f' [{color}]{anomaly.episode}[/{color}]: {msg}')

    def __print_issues(self, stats_dict: Dict[str, 'EpisodeStats']) -> None:
        """Print a truncated list of errors and warnings for each episode."""
        for ep_id, stats in stats_dict.items():
            if stats.errors:
                self.__print_list('red', f'Errors in {ep_id}', stats.errors)
            if stats.warnings:
                self.__print_list('yellow', f'Warnings in {ep_id}', stats.warnings)

    @staticmethod
    def __print_list(color: str, title: str, items: List[str]) -> None:
        """Print ``title`` followed by at most three items and a remainder note."""
        console.print(f'\n[{color}]{title}:[/{color}]')
        for item in items[:3]:
            console.print(f' - {item}')
        if len(items) > 3:
            console.print(f' ... and {len(items) - 3} more')
class BaseValidator(ABC):
    """Common base for per-artifact validators.

    Subclasses implement ``validate`` and report findings by appending
    messages to ``stats.errors`` / ``stats.warnings``.
    """

    @abstractmethod
    def validate(self, stats: 'EpisodeStats') -> None:
        """Inspect one episode's artifacts and record findings on ``stats``."""

    @staticmethod
    def _check_path_exists(
        path: Path, stats: 'EpisodeStats', error_msg: str,
    ) -> bool:
        """Return True if ``path`` exists; otherwise record ``error_msg`` as an error."""
        if not path.exists():
            stats.errors.append(error_msg)
            return False
        return True

    @staticmethod
    def _add_warning(stats: 'EpisodeStats', message: str) -> None:
        """Append ``message`` to ``stats.warnings``."""
        stats.warnings.append(message)

    @staticmethod
    def _add_error(stats: 'EpisodeStats', message: str) -> None:
        """Append ``message`` to ``stats.errors``."""
        stats.errors.append(message)

    @staticmethod
    def __validate_json(
        file_path: Path,
        invalid_msg_prefix: str,
        invalid_sink: list,
        missing_msg: Optional[str] = None,
        missing_sink: Optional[list] = None,
    ) -> bool:
        """Shared implementation of the three ``_validate_json_*`` helpers.

        Args:
            file_path: JSON file to check.
            invalid_msg_prefix: Prefix of the message recorded for invalid JSON.
            invalid_sink: List receiving the invalid-JSON message.
            missing_msg: Message recorded when the file is absent (optional).
            missing_sink: List receiving ``missing_msg`` (optional).

        Returns:
            True only when the file exists and is valid JSON.
        """
        if not file_path.exists():
            if missing_msg is not None and missing_sink is not None:
                missing_sink.append(missing_msg)
            return False

        result = FileValidator.validate_json_file(file_path)
        if not result.is_valid:
            invalid_sink.append(f'{invalid_msg_prefix}: {result.error_message}')
            return False
        return True

    @staticmethod
    def _validate_json_if_exists(
        stats: 'EpisodeStats',
        file_path: Path,
        error_msg_prefix: str,
    ) -> bool:
        """Validate an optional JSON file: absence is silent, invalid JSON is an error."""
        return BaseValidator.__validate_json(file_path, error_msg_prefix, stats.errors)

    @staticmethod
    def _validate_json_with_warning(
        stats: 'EpisodeStats',
        file_path: Path,
        missing_msg: str,
        invalid_msg_prefix: str,
    ) -> bool:
        """Validate a JSON file; both absence and invalid JSON are warnings."""
        return BaseValidator.__validate_json(
            file_path, invalid_msg_prefix, stats.warnings, missing_msg, stats.warnings,
        )

    @staticmethod
    def _validate_json_with_error(
        stats: 'EpisodeStats',
        file_path: Path,
        missing_msg: str,
        invalid_msg_prefix: str,
    ) -> bool:
        """Validate a JSON file; both absence and invalid JSON are errors."""
        return BaseValidator.__validate_json(
            file_path, invalid_msg_prefix, stats.errors, missing_msg, stats.errors,
        )

    @staticmethod
    def _load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]:
        """Load a JSON object from ``file_path``.

        Returns:
            The parsed object, or None when the file cannot be read, is not
            valid JSON, or its top-level value is not an object.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None
        # Callers use ``.get`` on the result, so honour the declared return
        # type instead of handing back a list or scalar.
        return data if isinstance(data, dict) else None
class CharacterValidator(BaseValidator):
    """Validator for the character visualization images of an episode."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Delegate to the shared visualization check for the character sub-directory."""
        visualizations_subdir = settings.output_subdirs.character_visualizations
        VisualizationValidationHelper.validate_visualizations(
            stats,
            visualizations_subdir,
            'character_visualizations_count',
            'character visualization',
        )
ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs


class ElasticValidator(BaseValidator):
    """Validator for the artifacts that feed the Elasticsearch index."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Run all Elasticsearch-related checks for one episode."""
        self.__validate_character_detections(stats)
        self.__validate_embeddings(stats)
        self.__validate_elastic_documents(stats)
        self.__validate_text_statistics(stats)

    @staticmethod
    def __validate_character_detections(stats: 'EpisodeStats') -> None:
        """Record an error if the character detections file exists but is invalid JSON."""
        path_service = PathService(stats.series_name)
        detections_file = path_service.get_episode_file_path(
            stats.episode_info, settings.output_subdirs.character_detections,
        )
        if not detections_file.exists():
            return
        outcome = FileValidator.validate_json_file(detections_file)
        if not outcome.is_valid:
            stats.errors.append(f'Invalid character detections JSON: {outcome.error_message}')

    @staticmethod
    def __validate_embeddings(stats: 'EpisodeStats') -> None:
        """Record an error if the episode-name embeddings file exists but is invalid JSON."""
        path_service = PathService(stats.series_name)
        embeddings_file = path_service.get_episode_file_path(
            stats.episode_info, f'{settings.output_subdirs.embeddings}/episode_names',
        )
        if not embeddings_file.exists():
            return
        outcome = FileValidator.validate_json_file(embeddings_file)
        if not outcome.is_valid:
            stats.errors.append(f'Invalid episode embeddings JSON: {outcome.error_message}')

    def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None:
        """Validate every ``<episode>_*.jsonl`` document; warn when none exist."""
        document_kinds = (
            ELASTIC_SUBDIRS.text_segments,
            ELASTIC_SUBDIRS.text_embeddings,
            ELASTIC_SUBDIRS.video_frames,
            ELASTIC_SUBDIRS.episode_names,
            ELASTIC_SUBDIRS.text_statistics,
            ELASTIC_SUBDIRS.full_episode_embeddings,
            ELASTIC_SUBDIRS.sound_events,
            ELASTIC_SUBDIRS.sound_event_embeddings,
        )
        episode_code = stats.episode_info.episode_code()
        season_code = stats.episode_info.season_code()
        documents_root = get_base_output_dir(stats.series_name) / settings.output_subdirs.elastic_documents

        any_documents = False
        for kind in document_kinds:
            season_dir = documents_root / kind / season_code
            if not season_dir.exists():
                continue
            episode_documents = list(season_dir.glob(f'{episode_code}_*.jsonl'))
            if not episode_documents:
                continue
            any_documents = True
            for document in episode_documents:
                self.__validate_jsonl_file(stats, document, kind)

        if not any_documents:
            self._add_warning(stats, f'Missing {settings.output_subdirs.elastic_documents} directory')

    def __validate_jsonl_file(self, stats: 'EpisodeStats', jsonl_file: Path, subdir: str) -> None:
        """Check JSONL syntax first; only well-formed files get the dimension check."""
        outcome = FileValidator.validate_jsonl_file(jsonl_file)
        if outcome.is_valid:
            self.__validate_embedding_dimensions(stats, jsonl_file, subdir)
            return
        self._add_error(stats, f'Invalid JSONL {jsonl_file.name}: {outcome.error_message}')

    @staticmethod
    def __validate_text_statistics(stats: 'EpisodeStats') -> None:
        """Warn when the text statistics file is absent; error when it is invalid JSON."""
        text_stats_file = PathService(stats.series_name).get_episode_file_path(
            stats.episode_info, 'text_analysis',
        )
        if not text_stats_file.exists():
            stats.warnings.append(f'Missing text statistics file: {text_stats_file.name}')
            return
        outcome = FileValidator.validate_json_file(text_stats_file)
        if not outcome.is_valid:
            stats.errors.append(f'Invalid text_stats JSON: {outcome.error_message}')

    def __validate_embedding_dimensions(self, stats: 'EpisodeStats', jsonl_file: Path, subdir: str) -> None:
        """Check the embedding vector length of every document in ``jsonl_file``.

        Sub-directories that carry no embedding field are ignored.
        """
        field_by_subdir = {
            ELASTIC_SUBDIRS.text_embeddings: 'text_embedding',
            ELASTIC_SUBDIRS.video_frames: 'video_embedding',
            ELASTIC_SUBDIRS.episode_names: 'title_embedding',
            ELASTIC_SUBDIRS.full_episode_embeddings: 'full_episode_embedding',
            ELASTIC_SUBDIRS.sound_event_embeddings: 'sound_event_embedding',
        }
        if subdir not in field_by_subdir:
            return

        expected_dim = settings.embedding_model.embedding_dim
        field_name = field_by_subdir[subdir]

        try:
            with open(jsonl_file, 'r', encoding='utf-8') as handle:
                for line_num, raw_line in enumerate(handle, 1):
                    if raw_line.strip():
                        document = json.loads(raw_line)
                        self.__check_doc_dimension(
                            stats, document, field_name, expected_dim, jsonl_file.name, line_num,
                        )
        except Exception as e:
            self._add_error(stats, f'Error validating embeddings in {jsonl_file.name}: {e}')

    def __check_doc_dimension(
        self, stats: 'EpisodeStats', doc: Dict[str, Any], field: str, expected: int, fname: str,
        lnum: int,
    ) -> None:
        """Record an error when ``doc[field]`` is a list of the wrong length."""
        if field not in doc or not isinstance(doc[field], list):
            return
        actual = len(doc[field])
        if actual != expected:
            self._add_error(stats, f'{fname} line {lnum}: {field} has {actual} dim, expected {expected}')
class FrameValidator(BaseValidator):
    """Validator for the exported frame images of an episode."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Count the exported frames and collect their size and resolution."""
        frames_dir = PathService(stats.series_name).get_episode_dir_by_code(
            stats.episode_info, settings.output_subdirs.frames,
        )

        if not self.__check_dir(stats, frames_dir):
            return

        frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS['frame']))
        if not frame_files:
            self._add_warning(stats, f'No frames found in {settings.output_subdirs.frames}/')
            return

        stats.exported_frames_count = len(frame_files)
        self.__process_frames(stats, frame_files)

    def __check_dir(self, stats: 'EpisodeStats', frames_dir: Path) -> bool:
        """Return True if the frames directory exists; otherwise record a warning."""
        if not frames_dir.exists():
            self._add_warning(stats, f'Missing {settings.output_subdirs.frames} directory')
            return False
        return True

    def __process_frames(self, stats: 'EpisodeStats', frame_files: List[Path]) -> None:
        """Validate each frame and store total size and the dominant resolution.

        ``exported_frames_avg_resolution`` receives the most frequent
        (width, height) pair among the valid frames.
        """
        from collections import Counter  # pylint: disable=import-outside-toplevel

        total_size = 0.0
        resolution_counts: Counter = Counter()
        invalid_count = 0

        for frame_file in frame_files:
            result = FileValidator.validate_image_file(frame_file)
            if result.is_valid:
                total_size += result.metadata['size_mb']
                resolution_counts[(result.metadata['width'], result.metadata['height'])] += 1
            else:
                invalid_count += 1
                self._add_error(stats, f'Invalid frame {frame_file.name}: {result.error_message}')

        if invalid_count > 0:
            self._add_warning(stats, f'{invalid_count} invalid frames found')

        stats.exported_frames_total_size_mb = round(total_size, 2)
        if resolution_counts:
            # One pass instead of list.count() per distinct value; ties go to
            # the resolution seen first, so the result is deterministic.
            stats.exported_frames_avg_resolution = resolution_counts.most_common(1)[0][0]
class ImageHashValidator(BaseValidator):
    """Validator for the image hashes file of an episode."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Warn when the hashes file is absent; record an error when it is invalid JSON."""
        path_service = PathService(stats.series_name)
        hash_file = path_service.get_episode_file_path(
            stats.episode_info, settings.output_subdirs.image_hashes,
        )

        if hash_file.exists():
            outcome = FileValidator.validate_json_file(hash_file)
            if not outcome.is_valid:
                self._add_error(stats, f'Invalid image hashes JSON: {outcome.error_message}')
        else:
            self._add_warning(stats, f'Missing image hashes file: {hash_file.name}')
class ObjectValidator(BaseValidator):
    """Validator for object detection results and their visualizations."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Check the detections file first, then the visualization images."""
        self.__validate_object_detections(stats)
        self.__validate_object_visualizations(stats)

    @staticmethod
    def __validate_object_detections(stats: 'EpisodeStats') -> None:
        """Warn when the detections file is absent; record an error when it is invalid JSON."""
        path_service = PathService(stats.series_name)
        detections_file = path_service.get_episode_file_path(
            stats.episode_info, settings.output_subdirs.object_detections,
        )

        if detections_file.exists():
            outcome = FileValidator.validate_json_file(detections_file)
            if not outcome.is_valid:
                stats.errors.append(f'Invalid object detections JSON: {outcome.error_message}')
        else:
            stats.warnings.append(f'Missing object detections file: {detections_file.name}')

    @staticmethod
    def __validate_object_visualizations(stats: 'EpisodeStats') -> None:
        """Delegate to the shared visualization check for the object sub-directory."""
        visualizations_subdir = settings.output_subdirs.object_visualizations
        VisualizationValidationHelper.validate_visualizations(
            stats,
            visualizations_subdir,
            'object_visualizations_count',
            'visualization',
        )
class SceneValidator(BaseValidator):
    """Validator for the scene detection file of an episode."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Check that the scenes file exists and is valid JSON, then extract scene metrics."""
        scenes_file = self.__resolve_scenes_file(stats)

        if not self._check_path_exists(scenes_file, stats, f'Missing scenes file: {scenes_file}'):
            return

        if not self.__validate_json_integrity(stats, scenes_file):
            return

        data = self._load_json_safely(scenes_file)
        if not data:
            return
        if not isinstance(data, dict):
            # Syntactically valid JSON whose top level is not an object would
            # otherwise crash on ``.get`` below.
            self._add_error(stats, f'Invalid scenes JSON: expected an object, got {type(data).__name__}')
            return
        self.__extract_scene_stats(stats, data)

    @staticmethod
    def __resolve_scenes_file(stats: 'EpisodeStats') -> Path:
        """Return the path of the episode's scenes file."""
        return PathService(stats.series_name).get_episode_file_path(
            stats.episode_info, settings.output_subdirs.scenes,
        )

    def __validate_json_integrity(self, stats: 'EpisodeStats', file_path: Path) -> bool:
        """Return True if the file parses as JSON; otherwise record an error."""
        result = FileValidator.validate_json_file(file_path)
        if not result.is_valid:
            self._add_error(stats, f'Invalid scenes JSON: {result.error_message}')
            return False
        return True

    @staticmethod
    def __extract_scene_stats(stats: 'EpisodeStats', data: Dict[str, Any]) -> None:
        """Fill ``scenes_count`` and ``scenes_avg_duration`` from the parsed file.

        When ``total_scenes`` is absent the count falls back to the length of
        the ``scenes`` list, so the count and the average stay consistent.
        """
        scenes: List[Dict[str, Any]] = data.get('scenes') or []
        stats.scenes_count = data.get('total_scenes', len(scenes))

        if scenes:
            # A null duration counts as 0 instead of breaking sum().
            durations = [s.get('duration') or 0 for s in scenes]
            stats.scenes_avg_duration = round(sum(durations) / len(durations), 2)
class TranscriptionValidator(BaseValidator):
    """Validator for raw/clean transcriptions and sound events of an episode."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Check all transcription artifacts; a complete absence is an error."""
        trans_files = self.__resolve_file_map(stats)

        if not any(f.exists() for f in trans_files.values()):
            self._add_error(stats, 'No transcription files found in any format')
            return

        self.__validate_raw_transcription(stats, trans_files)
        self.__validate_clean_transcription(stats, trans_files['clean'])
        self.__validate_clean_txt(stats, trans_files['clean_txt'])
        self.__validate_sound_events(stats, trans_files['sound_events'])

    def __validate_raw_transcription(
        self, stats: 'EpisodeStats', trans_files: Dict[str, Path],
    ) -> None:
        """Validate the first available raw transcription and extract its metrics."""
        # Try to find any available raw format
        raw_path = next((trans_files[k] for k in ('main', 'segmented', 'simple') if trans_files[k].exists()), None)

        if not raw_path:
            self._add_warning(stats, 'Missing raw transcription file (.json, _segmented.json, or _simple.json)')
            return

        if self._validate_json_if_exists(stats, raw_path, "Invalid transcription JSON"):
            self.__extract_transcription_metrics(stats, raw_path)

    def __extract_transcription_metrics(self, stats: 'EpisodeStats', raw_path: Path) -> None:
        """Store character/word counts and duration of the raw transcription."""
        data = self._load_json_safely(raw_path)
        # A non-object top level cannot be read with ``.get``; treat it like
        # an unreadable file.
        if not data or not isinstance(data, dict):
            self._add_error(stats, f'Error reading transcription: {raw_path}')
            return

        text = self.__get_full_text(data)
        stats.transcription_chars = len(text)
        stats.transcription_words = len(text.split())
        stats.transcription_duration = self.__determine_duration(data)

    @staticmethod
    def __get_full_text(data: Dict[str, Any]) -> str:
        """Return the top-level ``text``, or the segment texts joined by spaces."""
        text = data.get('text', '')
        if not text:
            segments: List[Dict[str, Any]] = data.get('segments') or []
            # A null segment text counts as empty instead of breaking join().
            text = ' '.join(s.get('text') or '' for s in segments)
        return text

    @staticmethod
    def __determine_duration(data: Dict[str, Any]) -> float:
        """Return the end time of the last word, else of the last segment, else 0.0.

        If the last word has no usable ``end`` value the segments are
        consulted instead of returning 0.0 or None.
        """
        for key in ('words', 'segments'):
            items: List[Dict[str, Any]] = data.get(key) or []
            if items:
                end = items[-1].get('end')
                if end:
                    return end
        return 0.0

    def __validate_clean_transcription(self, stats: 'EpisodeStats', file_path: Path) -> None:
        """Warn when the clean transcription JSON is absent or invalid."""
        self._validate_json_with_warning(
            stats, file_path,
            missing_msg=f'Missing clean transcription: {file_path.name}',
            invalid_msg_prefix='Invalid clean transcription JSON',
        )

    def __validate_clean_txt(self, stats: 'EpisodeStats', file_path: Path) -> None:
        """Warn when the clean transcription text file is absent."""
        if not file_path.exists():
            self._add_warning(stats, f'Missing clean transcription txt: {file_path.name}')

    def __validate_sound_events(self, stats: 'EpisodeStats', file_path: Path) -> None:
        """Warn when the sound events JSON is absent or invalid."""
        self._validate_json_with_warning(
            stats, file_path,
            missing_msg=f'Missing sound events: {file_path.name}',
            invalid_msg_prefix='Invalid sound events JSON',
        )

    @staticmethod
    def __resolve_file_map(stats: 'EpisodeStats') -> Dict[str, Path]:
        """Return the expected path of every transcription artifact, keyed by kind."""
        path_svc = PathService(stats.series_name)
        subdirs = settings.output_subdirs.transcription_subdirs
        raw_ep_dir = path_svc.get_episode_dir(
            stats.episode_info,
            f'{settings.output_subdirs.transcriptions}/{subdirs.raw}',
        )
        season_raw_dir = raw_ep_dir.parent
        ep_code = stats.episode_info.episode_code()

        return {
            'main': raw_ep_dir / f'{ep_code}.json',
            'segmented': raw_ep_dir / f'{ep_code}_segmented.json',
            'simple': raw_ep_dir / f'{ep_code}_simple.json',
            'clean': season_raw_dir / subdirs.clean / f'{ep_code}_clean_transcription.json',
            'clean_txt': season_raw_dir / subdirs.clean / f'{ep_code}_clean_transcription.txt',
            'sound_events': season_raw_dir / subdirs.sound_events / f'{ep_code}_sound_events.json',
        }
class JsonDirectoryValidationHelper:
    """Shared checks for episode directories that hold many JSON files."""

    @staticmethod
    def validate_json_directory(
        stats: 'EpisodeStats',
        subdir: str,
        count_attr: Optional[str],
        context_name: str,
        exclude_pattern: Optional[str] = None,
        check_anomalies: bool = True,
    ) -> None:
        """Validate every JSON file of an episode sub-directory.

        Args:
            stats: Episode stats receiving counts, errors and warnings.
            subdir: Output sub-directory to inspect.
            count_attr: Attribute of ``stats`` that receives the file count, if any.
            context_name: Label used in size-anomaly warnings.
            exclude_pattern: Files whose name contains this substring are skipped.
            check_anomalies: Whether to warn about unusually sized files.
        """
        dir_path = PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir)

        if not dir_path.exists():
            stats.warnings.append(f'Missing {subdir} directory')
            return

        count, sizes, errors = JsonDirectoryValidationHelper.__analyze_json_files(dir_path, exclude_pattern)

        if count == 0:
            stats.warnings.append(f'No JSON files in {subdir}/')
            return

        if count_attr:
            setattr(stats, count_attr, count)

        stats.errors.extend(errors)

        if check_anomalies:
            JsonDirectoryValidationHelper.__perform_size_anomaly_check(stats, sizes, context_name)

    @staticmethod
    def __select_json_files(directory: Path, exclude_pattern: Optional[str]) -> List[Path]:
        """Return the ``*.json`` files of ``directory`` in sorted order.

        The pattern is matched against the file name only; matching the full
        path would drop every file whenever a parent directory contains the
        pattern. Sorting keeps the "file #N" numbering in warnings stable.
        """
        return sorted(
            f for f in directory.glob('*.json')
            if not exclude_pattern or exclude_pattern not in f.name
        )

    @staticmethod
    def __analyze_json_files(
        directory: Path,
        exclude_pattern: Optional[str],
    ) -> Tuple[int, List[int], List[str]]:
        """Return (file count, sizes of valid files, error messages for invalid files)."""
        json_files = JsonDirectoryValidationHelper.__select_json_files(directory, exclude_pattern)

        if not json_files:
            return 0, [], []

        sizes: List[int] = []
        errors: List[str] = []

        for json_file in json_files:
            result = FileValidator.validate_json_file(json_file)
            if not result.is_valid:
                errors.append(f'Invalid JSON {json_file.name}: {result.error_message}')
            else:
                sizes.append(json_file.stat().st_size)

        return len(json_files), sizes, errors

    @staticmethod
    def __perform_size_anomaly_check(
        stats: 'EpisodeStats',
        sizes: List[int],
        folder_name: str,
        threshold: float = 0.2,
    ) -> None:
        """Warn about files whose size deviates from the average by more than ``threshold``.

        Nothing is reported for fewer than two files or an average size of zero.
        """
        if len(sizes) < 2:
            return

        avg_size = sum(sizes) / len(sizes)
        if avg_size == 0:
            return

        for i, size in enumerate(sizes):
            deviation = abs(size - avg_size) / avg_size
            if deviation > threshold:
                stats.warnings.append(
                    f'{folder_name} file #{i + 1} size deviation: {deviation * 100:.1f}% from average',
                )
class VisualizationValidationHelper:
    """Shared checks for directories of visualization images."""

    @staticmethod
    def validate_visualizations(
        stats: 'EpisodeStats',
        subdir: str,
        count_attr: str,
        context_name: str,
    ) -> None:
        """Count and validate the images of one visualization sub-directory.

        An absent directory is ignored; an existing directory without images
        yields a warning. Otherwise the image count is stored on
        ``stats.<count_attr>`` and invalid images are reported.
        """
        viz_dir = PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir)
        image_total, invalid_total, messages = VisualizationValidationHelper.__scan_images(viz_dir)

        if image_total == 0:
            if viz_dir.exists():
                stats.warnings.append(f'No visualization images in {subdir}/')
            return

        setattr(stats, count_attr, image_total)
        stats.errors.extend(messages)
        if invalid_total > 0:
            stats.warnings.append(f'{invalid_total} invalid {context_name} images found')

    @staticmethod
    def __scan_images(
        directory: Path,
        extensions: Tuple[str, ...] = ('*.jpg', '*.png'),
    ) -> Tuple[int, int, List[str]]:
        """Return (image count, invalid count, error messages) for ``directory``."""
        if not directory.exists():
            return 0, 0, []

        candidates: List[Path] = [
            image for pattern in extensions for image in directory.glob(pattern)
        ]
        if not candidates:
            return 0, 0, []

        messages: List[str] = []
        for candidate in candidates:
            outcome = FileValidator.validate_image_file(candidate)
            if not outcome.is_valid:
                messages.append(f'Invalid image {candidate.name}: {outcome.error_message}')

        # Exactly one message is produced per invalid image.
        return len(candidates), len(messages), messages
class VideoValidator(BaseValidator):
    """Validator for the video file of an episode."""

    def validate(self, stats: 'EpisodeStats') -> None:
        """Warn when the video is absent, record an error when it is invalid, else store its metrics."""
        video_path = self.__resolve_video_file_path(stats)

        if video_path.exists():
            outcome = FileValidator.validate_video_file(video_path)
            if outcome.is_valid:
                self.__populate_video_metrics(stats, outcome.metadata)
            else:
                self._add_error(stats, f'Invalid video: {outcome.error_message}')
        else:
            self._add_warning(stats, f'Missing video file: {video_path}')

    @staticmethod
    def __resolve_video_file_path(stats: 'EpisodeStats') -> Path:
        """Return ``<base>/<video subdir>/<season>/<series>_<episode><ext>``."""
        series = stats.series_name.lower()
        episode_code = stats.episode_info.episode_code()
        filename = f'{series}_{episode_code}{DEFAULT_VIDEO_EXTENSION}'
        base_dir = get_base_output_dir(stats.series_name)
        return base_dir / settings.output_subdirs.video / stats.episode_info.season_code() / filename

    @staticmethod
    def __populate_video_metrics(stats: 'EpisodeStats', metadata: dict) -> None:
        """Copy size, duration, codec and resolution from ``metadata`` onto ``stats``."""
        stats.video_size_mb = metadata['size_mb']
        stats.video_duration = metadata['duration']
        stats.video_codec = metadata['codec']
        width, height = metadata['width'], metadata['height']
        stats.video_resolution = (width, height)
class VideoDiscovery:
    """Locates video files anywhere below a directory."""

    # Glob patterns used when the caller does not supply its own.
    DEFAULT_EXTENSIONS: List[str] = ["*.mp4", "*.mkv", "*.avi"]

    @staticmethod
    def discover(
        source_path: Path,
        extensions: Optional[List[str]] = None,
    ) -> List[Path]:
        """Return the sorted, de-duplicated files under ``source_path`` matching ``extensions``.

        Args:
            source_path: Directory that is searched recursively.
            extensions: Glob patterns such as ``"*.mp4"``; defaults to
                ``DEFAULT_EXTENSIONS`` when ``None``.

        Returns:
            Sorted list of matching regular files. Directories whose names
            happen to match a pattern are skipped, and a file matched by
            several overlapping patterns is reported once.
        """
        if extensions is None:
            extensions = VideoDiscovery.DEFAULT_EXTENSIONS

        # A set removes duplicates produced by overlapping patterns.
        videos = {
            candidate
            for ext in extensions
            for candidate in source_path.glob(f"**/{ext}")
            if candidate.is_file()
        }
        return sorted(videos)
class EmotionDetector:
    """Static helpers for loading an ``HSEmotionRecognizer`` and running it on face crops."""

    @staticmethod
    def detect(
        face_image: np.ndarray,
        model: HSEmotionRecognizer,
    ) -> Tuple[str, float, Dict[str, float]]:
        """Classify a single face crop.

        Returns:
            ``(dominant_emotion, confidence, per_label_scores)``.

        Raises:
            RuntimeError: If the model call or result processing fails.
        """
        try:
            emotion, scores = model.predict_emotions(face_image, logits=False)
            return EmotionDetector.__process_emotion_result(emotion, scores)
        except Exception as e:
            raise RuntimeError(f'Emotion detection failed: {e}') from e

    @staticmethod
    def init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecognizer:
        """Load the configured HSEmotion model, persist it to the volume and enable GPU if possible.

        Raises:
            RuntimeError: If any stage of loading fails.
        """
        model_name = settings.emotion_detection.model_name
        if logger:
            logger.info(f'Loading HSEmotion model: {model_name}...')

        try:
            fer = EmotionDetector.__load_with_retry(model_name, logger)
            EmotionDetector.__persist_model_to_volume(model_name, logger)
            EmotionDetector.__patch_gpu_session(fer, model_name, logger)
            if logger:
                logger.info(f'HSEmotion model loaded: {model_name}')
            return fer
        except Exception as e:
            raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e

    @staticmethod
    def __load_with_retry(
        model_name: str,
        logger: Optional[ErrorHandlingLogger],
        max_retries: int = 5,
        initial_delay: float = 15.0,
    ) -> HSEmotionRecognizer:
        """Construct the recognizer, retrying with exponential back-off on HTTP 429."""
        delay = initial_delay
        for attempt in range(max_retries):
            try:
                return HSEmotionRecognizer(model_name=model_name)
            except urllib.error.HTTPError as e:
                # Only rate limiting is retried; the last attempt re-raises as well.
                if e.code != 429 or attempt >= max_retries - 1:
                    raise
                if logger:
                    logger.warning(
                        f'Rate limited downloading HSEmotion model '
                        f'(attempt {attempt + 1}/{max_retries}), retrying in {delay:.0f}s...',
                    )
                time.sleep(delay)
                delay *= 2
        # Only reachable when max_retries <= 0.
        raise RuntimeError(f'Failed to download HSEmotion model after {max_retries} attempts')

    @staticmethod
    def __get_volume_model_path(model_name: str) -> Optional[Path]:
        """Return the model path under ``EMOTION_MODEL_HOME``, or ``None`` if the variable is unset."""
        model_home = os.environ.get('EMOTION_MODEL_HOME', '')
        if not model_home:
            return None
        return Path(model_home) / f'{model_name}.onnx'

    @staticmethod
    def __persist_model_to_volume(
        model_name: str, logger: Optional[ErrorHandlingLogger],
    ) -> None:
        """Copy the packaged ``.onnx`` file to the volume unless it is already there."""
        volume_path = EmotionDetector.__get_volume_model_path(model_name)
        if not volume_path or volume_path.exists():
            return
        package_path = Path(_hsemotion_facial_emotions.__file__).parent / 'models' / f'{model_name}.onnx'
        if package_path.exists():
            volume_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(package_path, volume_path)
            if logger:
                logger.info(f'Persisted HSEmotion model to volume: {volume_path}')

    @staticmethod
    def __patch_gpu_session(
        fer: HSEmotionRecognizer,
        model_name: str,
        logger: Optional[ErrorHandlingLogger],
    ) -> None:
        """Replace ``fer.ort_session`` with a CUDA-enabled session when CUDA is available."""
        available_providers = ort.get_available_providers()
        if 'CUDAExecutionProvider' not in available_providers:
            if logger:
                logger.warning(
                    'CUDAExecutionProvider not available — HSEmotion running on CPU. '
                    'Install onnxruntime-gpu to enable GPU acceleration.',
                )
            return

        model_path = _hsemotion_facial_emotions.get_model_path(model_name)
        fer.ort_session = ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
        )
        if logger:
            logger.info('HSEmotion session patched to use GPU (CUDAExecutionProvider)')

    @staticmethod
    def detect_batch(
        face_images: List[np.ndarray],
        model: HSEmotionRecognizer,
        batch_size: int = 32,
        logger: Optional[ErrorHandlingLogger] = None,
    ) -> List[Optional[Tuple[str, float, Dict[str, float]]]]:
        """Classify many face crops, batching model calls.

        The result list is always aligned with ``face_images``: entry ``i``
        belongs to ``face_images[i]`` and is ``None`` when that face could not
        be classified. If a batched call fails, the whole batch is retried
        face by face; nothing from the failed batched attempt is kept.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f'batch_size must be >= 1, got {batch_size}')

        results: List[Optional[Tuple[str, float, Dict[str, float]]]] = []
        total = len(face_images)

        for batch_start in range(0, total, batch_size):
            batch_end = min(batch_start + batch_size, total)
            batch = face_images[batch_start:batch_end]

            if logger:
                progress_pct = int(batch_end / total * 100)
                logger.info(
                    f'Processing emotion batch {batch_start}-{batch_end}/{total} '
                    f'({progress_pct}%)',
                )

            try:
                batch_results = EmotionDetector.__infer_batch(batch, model)
            except Exception as e:
                if logger:
                    logger.warning(
                        f'Batched emotion inference failed ({e}); '
                        f'falling back to per-face inference',
                    )
                batch_results = [
                    EmotionDetector.__infer_single_or_none(face_img, model)
                    for face_img in batch
                ]
            # Extend only once per batch so a mid-batch failure cannot leave partial entries.
            results.extend(batch_results)

        return results

    @staticmethod
    def __infer_batch(
        batch: List[np.ndarray],
        model: HSEmotionRecognizer,
    ) -> List[Optional[Tuple[str, float, Dict[str, float]]]]:
        """Run one batched model call and return one processed result per face."""
        raw = model.predict_multi_emotions(batch, logits=False)

        # The recognizer returns ``(labels, score_matrix)``; a list of
        # ``(label, scores)`` pairs is still accepted for compatibility.
        is_label_matrix_pair = (
            isinstance(raw, tuple)
            and len(raw) == 2
            and isinstance(raw[0], (list, tuple))
            and all(isinstance(label, str) for label in raw[0])
        )
        pairs = list(zip(raw[0], raw[1])) if is_label_matrix_pair else list(raw)

        if len(pairs) != len(batch):
            raise ValueError(
                f'Model returned {len(pairs)} results for a batch of {len(batch)} faces',
            )
        return [
            EmotionDetector.__process_emotion_result(emotion, scores)
            for emotion, scores in pairs
        ]

    @staticmethod
    def __infer_single_or_none(
        face_img: np.ndarray,
        model: HSEmotionRecognizer,
    ) -> Optional[Tuple[str, float, Dict[str, float]]]:
        """Classify one face, returning ``None`` instead of raising (best effort)."""
        try:
            emotion, scores = model.predict_emotions(face_img, logits=False)
            return EmotionDetector.__process_emotion_result(emotion, scores)
        except Exception:
            return None

    @staticmethod
    def crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]:
        """Cut ``bbox`` (keys ``x1``, ``y1``, ``x2``, ``y2``) out of ``frame``.

        The box is clipped to the frame first. Returns ``None`` for an empty
        or inverted box, or when the inputs are malformed.
        """
        try:
            x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2'])
            height, width = frame.shape[:2]

            x1, y1, x2, y2 = EmotionDetector.__clip_bbox(x1, y1, x2, y2, width, height)
            if x2 <= x1 or y2 <= y1:
                return None

            face_crop = frame[y1:y2, x1:x2]
            return face_crop if face_crop.size > 0 else None
        except Exception:
            # Deliberate best effort: a bad box must not abort frame processing.
            return None

    @staticmethod
    def __clip_bbox(
        x1: int,
        y1: int,
        x2: int,
        y2: int,
        width: int,
        height: int,
    ) -> Tuple[int, int, int, int]:
        """Clamp the box corners to ``[0, width] x [0, height]``."""
        x1 = max(0, x1)
        y1 = max(0, y1)
        x2 = min(width, x2)
        y2 = min(height, y2)
        return x1, y1, x2, y2

    @staticmethod
    def __process_emotion_result(
        emotion: str,
        scores: np.ndarray,
    ) -> Tuple[str, float, Dict[str, float]]:
        """Convert a raw model result to ``(lower-cased label, max score, label -> score)``."""
        emotion_scores = {
            EMOTION_LABELS[i]: float(scores[i])
            for i in range(len(EMOTION_LABELS))
        }
        confidence = float(max(scores))
        dominant_emotion = emotion.lower()
        return dominant_emotion, confidence, emotion_scores
class PerceptualHasher:
    """Derives hex "perceptual" hashes from ResNet-18 feature vectors."""

    def __init__(self, device: str = 'cuda', hash_size: int = 8) -> None:
        """Build the feature extractor and the preprocessing transform.

        Args:
            device: ``'cuda'`` moves the model to the GPU when one is available;
                any other value keeps it on the CPU.
            hash_size: The hash uses ``hash_size * hash_size`` bits.

        Raises:
            ValueError: If ``hash_size`` is smaller than 1.
        """
        if hash_size < 1:
            raise ValueError(f'hash_size must be >= 1, got {hash_size}')
        self.__device = device
        self.__hash_size = hash_size
        base_model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
        # Drop the final layer so the network outputs features, not class logits.
        self.model: Optional[nn.Module] = nn.Sequential(*list(base_model.children())[:-1])
        self.model.eval()
        if device == 'cuda' and torch.cuda.is_available():
            self.model = self.model.cuda()
        self.__transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def cleanup(self) -> None:
        """Drop the model reference and release cached CUDA memory."""
        if self.model is not None:
            self.model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def compute_phash_batch(self, images: List[Image.Image]) -> List[str]:
        """Hash every image in ``images`` with a single forward pass.

        Each feature vector is thresholded at its own median; the first
        ``hash_size ** 2`` bits are rendered as a zero-padded hex string.

        Returns:
            One hex string per input image, in input order (``[]`` for no images).

        Raises:
            RuntimeError: If ``cleanup()`` has already been called.
        """
        if self.model is None:
            raise RuntimeError('Model not initialized or already cleaned up')
        if not images:
            return []

        batch = torch.stack([self.__transform(img) for img in images])
        if self.__device == 'cuda' and torch.cuda.is_available():
            batch = batch.cuda()

        with torch.no_grad():
            features = self.model(batch)
            features = F.adaptive_avg_pool2d(features, (1, 1))
            features = features.view(features.size(0), -1)

        # One host transfer for the whole batch instead of one ``.item()`` sync per bit.
        features = features.cpu()

        n_bits = self.__hash_size * self.__hash_size
        # Round up so the width also fits bit counts that are not multiples of four.
        hex_width = (n_bits + 3) // 4

        hashes: List[str] = []
        for feature_vec in features:
            bits = (feature_vec > feature_vec.median()).tolist()[:n_bits]
            bits_str = ''.join('1' if bit else '0' for bit in bits)
            hashes.append(format(int(bits_str, 2), f'0{hex_width}x'))

        return hashes
class SceneChangesStrategy(BaseKeyframeStrategy):
    """Keyframe strategy that requests frames at positions relative to each detected scene."""

    def __init__(self, frames_per_scene: int, scene_change_offset_seconds: float = 0.5) -> None:
        """Store how many frames to request per scene and the offset added to each scene start."""
        self.__frames_per_scene = frames_per_scene
        self.__offset = scene_change_offset_seconds

    def extract_frame_requests(
        self, video_path: Path, data: Dict[str, Any],
    ) -> List[FrameRequest]:
        """Build frame requests for every scene listed under ``data['scene_timestamps']['scenes']``.

        Prints a console notice and returns ``[]`` when no scenes are present.
        """
        scene_list = self.__extract_scenes(data)
        if scene_list:
            return self.__process_all_scenes(scene_list)

        console.print('[yellow]No scene timestamps found[/yellow]')
        return []

    def __process_all_scenes(
        self, scenes: List[Dict[str, Any]],
    ) -> List[FrameRequest]:
        """Concatenate the requests of all scenes, keeping scene order."""
        return [
            request
            for index, scene in enumerate(scenes)
            for request in self.__process_single_scene(scene, index)
        ]

    def __process_single_scene(
        self, scene: Dict[str, Any], scene_index: int,
    ) -> List[FrameRequest]:
        """Return one request for a very short scene, otherwise ``frames_per_scene`` requests."""
        begin = scene.get('start', {}).get('seconds', 0.0) + self.__offset
        # A scene without an end collapses to zero length at the offset start.
        finish = scene.get('end', {}).get('seconds', begin)
        span = finish - begin

        if span <= 0.1:
            single = self.__create_request(begin, FrameType.SCENE_SINGLE, scene_index)
            return [single]

        return self.__generate_multi_frame_requests(begin, span, scene_index)

    def __generate_multi_frame_requests(
        self, start_seconds: float, duration: float, scene_index: int,
    ) -> List[FrameRequest]:
        """Spread ``frames_per_scene`` requests evenly over the scene."""
        return [
            self.__create_request(
                self.__calculate_timestamp(start_seconds, duration, slot),
                self.__determine_frame_type(slot),
                scene_index,
            )
            for slot in range(self.__frames_per_scene)
        ]

    def __calculate_timestamp(
        self, start_seconds: float, duration: float, frame_idx: int,
    ) -> float:
        """Map ``frame_idx`` linearly onto ``[start_seconds, start_seconds + duration]``."""
        if self.__frames_per_scene > 1:
            fraction = frame_idx / (self.__frames_per_scene - 1)
        else:
            # A single frame per scene sits at the (offset) start.
            fraction = 0.0
        return start_seconds + fraction * duration

    def __determine_frame_type(self, frame_idx: int) -> str:
        """Label the frame as scene start, scene end or a numbered middle frame."""
        last_slot = self.__frames_per_scene - 1
        if frame_idx == 0:
            label = FrameType.SCENE_START
        elif frame_idx == last_slot:
            label = FrameType.SCENE_END
        else:
            label = FrameType.scene_mid(frame_idx)
        return label

    @staticmethod
    def __extract_scenes(data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return ``data['scene_timestamps']['scenes']`` or ``[]`` when either key is missing."""
        return data.get('scene_timestamps', {}).get('scenes', [])

    @staticmethod
    def __create_request(
        timestamp: float, type_name: str, scene_num: Optional[int] = None,
    ) -> FrameRequest:
        """Assemble a request dict; ``scene_number`` is included only when given."""
        request: FrameRequest = {
            'frame_number': 0,
            'timestamp': float(timestamp),
            'type': type_name,
        }
        if scene_num is None:
            return request
        request['scene_number'] = scene_num
        return request
def _process(
    self, input_data: None, context: ExecutionContext,
) -> ResolutionAnalysisResult:
    """Analyze all source videos of the series and persist a resolution report.

    Returns a result with ``upscaling_percentage=0.0`` when no videos are
    found or none could be analyzed; otherwise the percentage computed by the
    analysis, after the JSON report has been written.
    """
    self.__log_analysis_header(context)

    sources = self.__find_video_files(context)
    if not sources:
        return self.__handle_missing_videos(context)

    analyzed = self.__scan_resolutions(sources, context)
    if not analyzed:
        return self.__handle_failed_analysis(sources, context)

    pct = self.__analyze_and_report(analyzed, context)
    self.__save_results_to_json(analyzed, pct, context)

    file_count = len(analyzed)
    return ResolutionAnalysisResult(
        total_files=file_count, upscaling_percentage=pct,
    )
def __analyze_and_report(
    self, video_info: List[Dict[str, Any]], context: ExecutionContext,
) -> float:
    """Log resolution, upscaling, interlacing and metadata statistics.

    Returns:
        Percentage of videos whose pixel count is below the configured target
        resolution (``0`` when ``video_info`` is empty).
    """
    resolution_counts = Counter((entry['width'], entry['height']) for entry in video_info)
    total_episodes = len(video_info)

    target_width = self.config.resolution.width
    target_height = self.config.resolution.height
    target_pixels = target_width * target_height

    below_target = [
        entry for entry in video_info
        if (entry['width'] * entry['height']) < target_pixels
    ]
    if total_episodes > 0:
        upscaling_pct = (len(below_target) / total_episodes) * 100
    else:
        upscaling_pct = 0

    interlaced = [entry for entry in video_info if entry['needs_deinterlace']]
    progressive = [entry for entry in video_info if not entry['needs_deinterlace']]
    mismatched = [entry for entry in video_info if entry['metadata_match'] != 'match']

    self.__log_resolution_distribution(
        context,
        resolution_counts,
        total_episodes,
        target_width,
        target_height,
    )
    self.__log_upscaling_warnings(context, upscaling_pct)
    self.__log_interlacing_analysis(
        context,
        len(progressive),
        len(interlaced),
        total_episodes,
    )
    self.__log_metadata_warnings(context, len(mismatched))

    context.logger.info('=' * 80)
    return upscaling_pct
progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) + metadata_mismatch_count = sum( + 1 for v in video_info if v['metadata_match'] != 'match' + ) + + analysis_data = AnalysisData( + video_info=video_info, + resolution_counts=resolution_counts, + total_episodes=total_episodes, + target_width=target_width, + target_height=target_height, + target_pixels=target_pixels, + upscaling_count=upscaling_count, + upscaling_pct=upscaling_pct, + progressive_count=progressive_count, + needs_deinterlace_count=needs_deinterlace_count, + metadata_mismatch_count=metadata_mismatch_count, + ) + + result = self.__build_analysis_payload(context, analysis_data) + FileOperations.atomic_write_json(output_file, result, indent=2) + context.logger.info(f'Resolution analysis saved to: {output_file}') + + def __build_analysis_payload( + self, + context: ExecutionContext, + data: AnalysisData, + ) -> Dict[str, Any]: + source_resolutions = [ + { + 'width': width, + 'height': height, + 'count': count, + 'percentage': round((count / data.total_episodes) * 100, 1), + 'label': self.__get_resolution_label(width, height), + } + for (width, height), count in data.resolution_counts.most_common() + ] + + files_details = [ + { + 'filename': v['filename'], + 'width': v['width'], + 'height': v['height'], + 'label': self.__get_resolution_label(v['width'], v['height']), + 'needs_upscaling': (v['width'] * v['height']) < data.target_pixels, + 'field_order': v['field_order'], + 'needs_deinterlace': v['needs_deinterlace'], + 'metadata_match': v['metadata_match'], + 'idet_stats': v['idet_stats'], + } + for v in sorted(data.video_info, key=lambda x: x['filename']) + ] + + return { + 'analysis_date': datetime.now().isoformat(), + 'series_name': context.series_name, + 'target_resolution': { + 'width': data.target_width, + 'height': data.target_height, + 'label': self.__get_resolution_label( + data.target_width, data.target_height, + ), + }, + 'source_resolutions': source_resolutions, + 
def __log_resolution_distribution(
    self,
    context: ExecutionContext,
    resolution_counts: Counter,
    total_episodes: int,
    target_width: int,
    target_height: int,
) -> None:
    """Log how many episodes use each source resolution, followed by the target resolution."""
    logger = context.logger

    logger.info('')
    logger.info('Source Resolution Distribution:')
    logger.info('-' * 60)

    # most_common() lists the dominant resolutions first.
    for (src_w, src_h), episodes in resolution_counts.most_common():
        share = (episodes / total_episodes) * 100
        name = self.__get_resolution_label(src_w, src_h)
        logger.info(
            f' {src_w}x{src_h} ({name}): {episodes} episodes ({share:.1f}%)',
        )

    target_name = self.__get_resolution_label(target_width, target_height)
    logger.info('')
    logger.info(
        f'Target Resolution: {target_width}x{target_height} '
        f'({target_name})',
    )
@staticmethod
def __log_upscaling_warnings(
    context: ExecutionContext, upscaling_pct: float,
) -> None:
    """Warn loudly above 50% upscaling, log a note for any smaller positive share."""
    if upscaling_pct > 50:
        banner = '⚠' * 30
        warning_lines = (
            '',
            banner,
            f'⚠ WARNING: {upscaling_pct:.1f}% of episodes will require UPSCALING!',
            '⚠ Upscaling degrades quality. Consider using analyze-resolution CLI '
            'to find optimal target resolution.',
            banner,
        )
        for line in warning_lines:
            context.logger.warning(line)
        return

    if upscaling_pct > 0:
        context.logger.info(
            f'Note: {upscaling_pct:.1f}% of episodes will be upscaled '
            '(enhanced quality params will be used)',
        )
@staticmethod
def __scan_single_video(
    video_path: Path, context: ExecutionContext,
) -> Optional[Dict[str, Any]]:
    """Probe one video for display resolution and interlacing.

    Returns:
        A dict with ``filename``, ``width`` (corrected by the sample aspect
        ratio), ``height``, ``field_order``, ``needs_deinterlace``,
        ``idet_stats`` and ``metadata_match``; ``None`` when probing fails
        (the failure is logged as a warning).
    """
    try:
        probe_data = FFmpegWrapper.probe_video(video_path)
        width, height = FFmpegWrapper.get_resolution(probe_data)
        sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data)
        field_order = FFmpegWrapper.get_field_order(probe_data)

        # FFmpeg uses a zero numerator (e.g. 0:1) for an unknown sample aspect
        # ratio. Treat any non-positive component as square pixels rather than
        # producing width 0 or dividing by zero.
        if sar_num > 0 and sar_denom > 0:
            effective_width = int(width * sar_num / sar_denom)
        else:
            effective_width = width

        context.logger.info(
            f'Analyzing interlacing for {video_path.name} '
            f'(field_order={field_order}, analyzing full video)...',
        )
        has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(
            video_path, analysis_time=None,
        )

        metadata_vs_reality = ResolutionAnalysisStep.__validate_field_order(
            field_order, has_interlacing, idet_stats,
        )

        if metadata_vs_reality != 'match':
            context.logger.warning(
                f'⚠ {video_path.name}: field_order={field_order} '
                f'but idet says {metadata_vs_reality}!',
            )

        return {
            'filename': video_path.name,
            'width': effective_width,
            'height': height,
            'field_order': field_order,
            'needs_deinterlace': has_interlacing,
            'idet_stats': idet_stats,
            'metadata_match': metadata_vs_reality,
        }

    except Exception as e:
        # Best effort: one unreadable file must not abort the whole analysis.
        context.logger.warning(f'Failed to probe {video_path.name}: {e}')
        return None
@staticmethod
def __get_resolution_label(width: int, height: int) -> str:
    """Return a display label for a resolution.

    Exact standard sizes get their usual name (e.g. ``'1080p'``); anything
    else is bucketed by height alone.
    """
    exact_names = {
        (7680, 4320): '8K',
        (3840, 2160): '4K',
        (2560, 1440): '1440p',
        (1920, 1080): '1080p',
        (1280, 720): '720p',
        (854, 480): '480p',
        (640, 360): '360p',
        (426, 240): '240p',
        (256, 144): '144p',
    }
    exact = exact_names.get((width, height))
    if exact is not None:
        return exact

    # Minimum heights in descending order; the first one reached wins.
    height_buckets = (
        (2000, '4K+'),
        (1400, '2K'),
        (1000, 'Full HD'),
        (700, 'HD'),
        (450, 'SD'),
    )
    for min_height, bucket in height_buckets:
        if height >= min_height:
            return bucket
    return 'Low'
def _process(
    self, input_data: TranscriptionData, context: ExecutionContext,
) -> TranscriptionData:
    """Split a transcription into dialogue and sound segments and write both outputs.

    Loads the transcription payload, separates its segments, saves the
    separated data plus the additional formats, and returns the artifact
    built from the resolved output paths.
    """
    paths = self.__resolve_output_paths(input_data)
    payload = self.__load_transcription_payload(input_data)

    segments = payload['segments']
    spoken, noises = self.__separate_dialogue_from_sounds(segments)

    self.__save_separated_data(
        paths,
        payload['episode_info'],
        spoken,
        noises,
    )
    self.__generate_additional_formats(paths, spoken, noises)

    return self.__construct_result_artifact(paths, input_data)
def __split_mixed_segment(
    self,
    segment: Dict[str, Any],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Cut a mixed segment into consecutive dialogue runs and sound runs.

    Words are walked in order and grouped into runs of the same kind
    (``'sound'`` when ``is_sound_event`` says so, ``'dialogue'`` otherwise).
    Spacing tokens never open a run and never end one: they are appended to
    the run that is currently open, or dropped when none is open yet.

    Each finished run is handed to ``__finalize_sequence``, which appends
    the resulting segment to the matching output list.

    Returns:
        ``(dialogue_parts, sound_parts)``.
    """
    tokens = segment.get(WordKeys.WORDS, [])
    dialogue_parts = []
    sound_parts = []

    run_type = None
    run_words = []
    # Only replaced once a real (non-spacing) word opens a run; from then on
    # it is that word's own start, which may be None if the word lacks one.
    run_start = segment.get(WordKeys.START, 0.0)

    def flush_run() -> None:
        # Emit the currently open run, if there is one.
        if run_words and run_type:
            self.__finalize_sequence(
                run_type,
                run_words,
                run_start,
                dialogue_parts,
                sound_parts,
            )

    for token in tokens:
        token_type = 'sound' if is_sound_event(token) else 'dialogue'

        if token.get(WordKeys.TYPE) == WordTypeValues.SPACING:
            if run_words:
                run_words.append(token)
            continue

        if token_type == run_type:
            run_words.append(token)
            continue

        # Kind changed: close the previous run and open a new one here.
        flush_run()
        run_type = token_type
        run_words = [token]
        run_start = token.get(WordKeys.START)

    flush_run()

    return dialogue_parts, sound_parts
def __generate_additional_formats(
    self,
    output_paths: Dict[str, Path],
    dialogue_segments: List[Dict[str, Any]],
    sound_segments: List[Dict[str, Any]],
) -> None:
    """Write the TXT and SRT companions of the clean and sound-event outputs.

    The TXT files are produced from the JSON files at ``clean_json`` and
    ``sound_json``, so those must already exist on disk. The SRT files are
    produced from the in-memory segment lists.
    """
    txt_jobs = (
        ('clean_json', 'clean_txt'),
        ('sound_json', 'sound_txt'),
    )
    for json_key, txt_key in txt_jobs:
        self.__generate_txt_file(output_paths[json_key], output_paths[txt_key])

    srt_jobs = (
        (dialogue_segments, 'clean_srt'),
        (sound_segments, 'sound_srt'),
    )
    for segments, srt_key in srt_jobs:
        self.__generate_srt_file(segments, output_paths[srt_key])
@staticmethod
def __load_transcription_payload(
    input_data: TranscriptionData,
) -> Dict[str, Any]:
    """Read the transcription JSON and keep only the fields this step uses.

    Returns:
        A dict with ``'episode_info'`` (``{}`` when absent from the file)
        and ``'segments'`` (``[]`` when absent). Any other top-level keys
        in the file are ignored.
    """
    with open(input_data.path, 'r', encoding='utf-8') as source:
        raw = json.load(source)

    payload: Dict[str, Any] = {}
    payload['episode_info'] = raw.get('episode_info', {})
    payload['segments'] = raw.get('segments', [])
    return payload
@staticmethod
def __format_srt_time(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp, ``HH:MM:SS,mmm``.

    The offset is converted to whole milliseconds once, by rounding, and is
    then split with integer arithmetic. Deriving each field separately from
    the float truncates representation error (1.001 s would render as
    ``,000``) and cannot carry into the seconds field when the fraction
    rounds up to a full second.

    Negative offsets are clamped to zero, because the field-by-field
    formatting of a negative value is not a valid timestamp.
    """
    total_millis = max(0, int(round(seconds * 1000)))

    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}'
def execute_batch(
    self, input_data: List[ProcessedEpisode], context: ExecutionContext,
) -> List[ArchiveArtifact]:
    """Run this step over several episodes via the shared thread-pool helper.

    Delegates to ``_execute_with_threadpool``, passing the episodes, the
    context, ``config.max_parallel_episodes`` and ``self.execute`` as the
    per-episode callable, and returns whatever the helper returns.
    """
    # presumably the worker limit of the pool; confirm against the helper
    parallel_episodes = self.config.max_parallel_episodes
    per_episode = self.execute
    return self._execute_with_threadpool(
        input_data, context, parallel_episodes, per_episode,
    )
@staticmethod
def __create_archive(
    archive_path: Path,
    files: Dict[str, Path],
    context: ExecutionContext,
) -> None:
    """Zip the given files into ``archive_path`` through a temporary file.

    The archive is written next to the target as ``<name>.tmp`` and then
    moved over ``archive_path``, so the final path is only replaced by a
    fully written archive. Every file is stored flat under its own file
    name.

    Raises:
        RuntimeError: if anything in the build, move or logging fails. The
            temporary file is removed first and the original exception is
            chained as the cause.
    """
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    staging_path = archive_path.with_suffix(archive_path.suffix + ".tmp")
    bytes_per_mb = 1024 * 1024

    try:
        with zipfile.ZipFile(staging_path, "w", zipfile.ZIP_DEFLATED) as archive:
            for member in files.values():
                archive.write(member, arcname=member.name)

        staging_path.replace(archive_path)

        size_mb = archive_path.stat().st_size / bytes_per_mb
        context.logger.info(
            f"Created archive: {archive_path.name} ({len(files)} files, {size_mb:.2f} MB)",
        )
    except Exception as e:
        # Do not leave a half-written staging file behind.
        if staging_path.exists():
            staging_path.unlink()
        raise RuntimeError(f"Failed to create archive {archive_path}: {e}") from e
class EpisodeScraperStep(BaseScraperStep[EpisodeScraperConfig]):
    """Scraper step specialised for episodes.

    It only supplies the two hooks below; everything else comes from
    ``BaseScraperStep``, whose use of these hooks is not visible here.
    """

    def _get_scraper_class(self) -> Type[EpisodeScraper]:
        # Hook: the scraper class this step is bound to.
        return EpisodeScraper

    def _get_metadata_type_name(self) -> str:
        # Hook: label for the kind of metadata this step handles.
        return "Episodes"
@staticmethod
def __validate_characters_file(characters_path: Path) -> None:
    """Fail fast when the characters JSON is missing.

    Raises:
        FileNotFoundError: if ``characters_path`` does not exist. The
            message names the path and tells the user to run
            ``scrape_characters`` first.
    """
    if characters_path.exists():
        return

    raise FileNotFoundError(
        f"Characters file not found: {characters_path}. "
        f"Run scrape_characters first.",
    )
def get_output_descriptors(self) -> List[FileOutput]:
    """Describe the JSONL files this step writes, one per document type.

    One descriptor is produced for each ``(folder, suffix)`` pair in
    ``ELASTIC_DOC_TYPES``, in that order. The writer methods of this class
    address descriptors by position (0 to 7), so the order of
    ``ELASTIC_DOC_TYPES`` must stay aligned with those indices.
    """
    descriptors: List[FileOutput] = []
    for folder, suffix in ELASTIC_DOC_TYPES:
        descriptor = FileOutput(
            pattern=f"{{season}}/{{episode}}_{suffix}.jsonl",
            subdir=f"elastic_documents/{folder}",
            # A document type may legitimately produce no documents.
            min_size_bytes=0,
        )
        descriptors.append(descriptor)
    return descriptors
@staticmethod
def __build_video_path(episode_info: EpisodeInfo, context: ExecutionContext) -> str:
    """Build the video path stored in every generated document.

    The result has the shape
    ``bot/<SERIES>-WIDEO/<season code>/<series>_<episode code>.mp4``, with
    the series name upper-cased in the directory part and unchanged in the
    file name.
    """
    series = context.series_name
    episode_code = episode_info.episode_code()
    season_code = episode_info.season_code()

    filename = f"{series}_{episode_code}.mp4"
    return f"bot/{series.upper()}-WIDEO/{season_code}/{filename}"
@staticmethod
def __index_characters_by_frame(
    char_data: Optional[Dict[str, Any]],
    emotion_data: Optional[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Group character detections by frame and attach emotions by name.

    Args:
        char_data: Character detections; its ``"detections"`` list holds
            entries with a ``"frame"`` and a ``"faces"`` list.
        emotion_data: Emotion detections in the same layout, where a face
            may carry an ``"emotion"`` dict. May be ``None``.

    Returns:
        A mapping from frame to a list of
        ``{"name", "confidence"[, "emotion"]}`` entries. ``"emotion"`` holds
        only ``label`` and ``confidence`` and is present only when the same
        frame and face name has a non-empty emotion. Frames without faces
        are omitted. An empty dict is returned when ``char_data`` is empty
        or ``None``.
    """
    if not char_data:
        return {}

    # frame -> {face name -> emotion dict}; a repeated frame replaces the
    # earlier entry for that frame as a whole.
    emotions_by_frame: Dict[str, Dict[str, Dict[str, Any]]] = {}
    for detection in (emotion_data or {}).get("detections", []):
        frame = detection["frame"]
        named_emotions = {}
        for face in detection.get("faces", []):
            # The name is only read for faces that do carry an emotion.
            if face.get("emotion"):
                named_emotions[face["name"]] = face.get("emotion")
        emotions_by_frame[frame] = named_emotions

    indexed: Dict[str, List[Dict[str, Any]]] = {}
    for detection in char_data.get("detections", []):
        frame = detection["frame"]
        appearances = []
        for face in detection.get("faces", []):
            name = face["name"]
            appearance: Dict[str, Any] = {
                "name": name,
                "confidence": face.get("confidence"),
            }
            emotion = emotions_by_frame.get(frame, {}).get(name)
            if emotion:
                appearance["emotion"] = {
                    "label": emotion["label"],
                    "confidence": emotion["confidence"],
                }
            appearances.append(appearance)
        if appearances:
            indexed[frame] = appearances
    return indexed
def __write_text_segments(
    self,
    context: ExecutionContext,
    episode_info: EpisodeInfo,
    episode_id: str,
    episode_metadata: Dict[str, Any],
    video_path: str,
    scene_data: Optional[Dict[str, Any]],
) -> int:
    """Write one document per non-empty clean transcription segment.

    Segments are read from ``transcriptions/clean``; a missing file yields
    an empty output. Timing and speaker come from the first/last word when
    the segment has words, falling back to the segment's own fields when the
    word value is missing or falsy. ``segment_id`` is the segment's position
    in the input, so ids of skipped (empty) segments are not reused.

    Returns:
        The number of documents written to output descriptor 0.
    """
    clean_data = self.__load_optional(context, "transcriptions/clean", episode_info)
    segments = (clean_data or {}).get("segments", [])

    documents = []
    for segment_id, segment in enumerate(segments):
        text = segment.get("text", "").strip()
        if not text:
            continue

        words = segment.get("words", [])
        if words:
            first_word = words[0]
            last_word = words[-1]
            start = first_word.get("start") or segment.get("start", 0.0)
            end = last_word.get("end") or segment.get("end", 0.0)
            speaker = first_word.get("speaker_id") or segment.get("speaker", "unknown")
        else:
            start = segment.get("start", 0.0)
            end = segment.get("end", 0.0)
            speaker = segment.get("speaker", "unknown")

        document: Dict[str, Any] = {
            "episode_id": episode_id,
            "episode_metadata": episode_metadata,
            "segment_id": segment_id,
            "text": text,
            "start_time": start,
            "end_time": end,
            "speaker": speaker,
            "video_path": video_path,
        }
        scene_info = self.__find_scene(start, scene_data)
        if scene_info:
            document["scene_info"] = scene_info
        documents.append(document)

    output_path = self.__output_path(context, episode_info, 0)
    return self.__write_ndjson(output_path, documents)
def __write_text_embeddings(
    self,
    context: ExecutionContext,
    episode_info: EpisodeInfo,
    episode_id: str,
    episode_metadata: Dict[str, Any],
    video_path: str,
) -> int:
    """Write one document per text embedding that actually has a vector.

    Entries are read from ``embeddings/text``; a missing file yields an
    empty output. Entries with a missing or empty ``"embedding"`` are
    skipped, and ``embedding_id`` is the entry's position in the input.

    Returns:
        The number of documents written to output descriptor 2.
    """
    emb_data = self.__load_optional(context, "embeddings/text", episode_info)
    entries = (emb_data or {}).get("text_embeddings", [])

    documents = [
        {
            "episode_id": episode_id,
            "episode_metadata": episode_metadata,
            "embedding_id": position,
            "segment_range": entry.get("segment_range", []),
            "text": entry.get("text", ""),
            "text_embedding": entry.get("embedding", []),
            "video_path": video_path,
        }
        for position, entry in enumerate(entries)
        if entry.get("embedding", [])
    ]

    output_path = self.__output_path(context, episode_info, 2)
    return self.__write_ndjson(output_path, documents)
def __write_episode_name(
    self,
    context: ExecutionContext,
    episode_info: EpisodeInfo,
    episode_id: str,
    episode_metadata: Dict[str, Any],
    video_path: str,
) -> int:
    """Write the episode-title embedding document, if one is available.

    Reads ``embeddings/episode_names``. A single document is written when
    the file exists and has a non-empty ``"title_embedding"``; otherwise the
    output file is written with no documents.

    Returns:
        The number of documents written to output descriptor 4 (0 or 1).
    """
    name_data = self.__load_optional(context, "embeddings/episode_names", episode_info)
    title_embedding = name_data.get("title_embedding") if name_data else None

    documents = []
    if title_embedding:
        documents.append({
            "episode_id": episode_id,
            "episode_metadata": episode_metadata,
            "title": name_data.get("title", ""),
            "title_embedding": title_embedding,
            "video_path": video_path,
        })

    output_path = self.__output_path(context, episode_info, 4)
    return self.__write_ndjson(output_path, documents)
def __write_full_episode_embedding(
    self,
    context: ExecutionContext,
    episode_info: EpisodeInfo,
    episode_id: str,
    episode_metadata: Dict[str, Any],
    video_path: str,
) -> int:
    """Write the whole-episode embedding document, if one is available.

    Reads ``embeddings/full_episode``. A single document is written when the
    file has a non-empty ``"full_episode_embedding"`` entry containing an
    ``"embedding"`` key; otherwise the output file is written with no
    documents.

    Returns:
        The number of documents written to output descriptor 6 (0 or 1).
    """
    emb_data = self.__load_optional(context, "embeddings/full_episode", episode_info)
    episode_embedding = (emb_data or {}).get("full_episode_embedding", {})

    documents = []
    if episode_embedding and "embedding" in episode_embedding:
        documents.append({
            "episode_id": episode_id,
            "episode_metadata": episode_metadata,
            "full_transcript": episode_embedding.get("text", ""),
            "transcript_length": episode_embedding.get("transcript_length", 0),
            "full_episode_embedding": episode_embedding.get("embedding", []),
            "video_path": video_path,
        })

    output_path = self.__output_path(context, episode_info, 6)
    return self.__write_ndjson(output_path, documents)
+ "text": emb.get("text", ""), + "sound_types": emb.get("sound_types", []), + "start_time": emb.get("start_time", 0.0), + "end_time": emb.get("end_time", 0.0), + "sound_event_embedding": embedding, + "video_path": video_path, + }) + + return self.__write_ndjson(self.__output_path(context, episode_info, 7), docs) diff --git a/preprocessor/steps/search/indexing_step.py b/preprocessor/steps/search/indexing_step.py new file mode 100644 index 000000000..02cc7a60d --- /dev/null +++ b/preprocessor/steps/search/indexing_step.py @@ -0,0 +1,210 @@ +import asyncio +import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.step_configs import ElasticsearchConfig +from preprocessor.core.artifacts import ( + ElasticDocuments, + IndexingResult, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.services.search.elasticsearch import ElasticsearchWrapper + + +class ElasticsearchIndexerStep( + PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig], +): + def __init__(self, config: ElasticsearchConfig) -> None: + super().__init__(config) + self.__es: Optional[ElasticsearchWrapper] = None + + @property + def is_global(self) -> bool: + return True + + @property + def supports_batch_processing(self) -> bool: + return True + + @property + def uses_caching(self) -> bool: + return False + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__es is None: + context.logger.info( + f'Initializing Elasticsearch client: {self.config.host}', + ) + self.__es = ElasticsearchWrapper( + host=self.config.host, + index_name=self.config.index_name, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__es: + asyncio.run(self.__es.close()) + self.__es = None + context.logger.info('Elasticsearch client closed') + + def cleanup(self) -> None: + if self.__es: + asyncio.run(self.__es.close()) 
+ self.__es = None + + def execute_batch( + self, + input_data: List[List[ElasticDocuments]], + context: ExecutionContext, + ) -> List[IndexingResult]: + context.logger.info( + f"Batch indexing {len(input_data)} document collections", + ) + results = [] + for docs in input_data: + # Reusing _process logic via direct async call wrapper if needed, + # or calling execute which routes to _process + result = self.execute(docs, context) + results.append(result) + return results + + def _process( + self, input_data: List[ElasticDocuments], context: ExecutionContext, + ) -> IndexingResult: + return asyncio.run(self.__process_async(input_data, context)) + + async def __process_async( + self, + input_data: List[ElasticDocuments], + context: ExecutionContext, + ) -> IndexingResult: + if not input_data: + return self.__construct_empty_result(context) + + docs_by_type = self.__group_documents_by_type(input_data) + total_indexed = await self.__index_grouped_documents( + docs_by_type, context, + ) + + return self.__construct_indexing_result(total_indexed) + + async def __index_grouped_documents( + self, + docs_by_type: Dict[str, List[Path]], + context: ExecutionContext, + ) -> int: + total_indexed: int = 0 + for doc_type, paths in docs_by_type.items(): + try: + indexed_count = await self.__process_document_type( + doc_type, paths, context, + ) + total_indexed += indexed_count + except Exception as e: + context.logger.error( + f'Elasticsearch indexing failed for {doc_type}: {e}', + ) + raise + return total_indexed + + async def __process_document_type( + self, + doc_type: str, + paths: List[Path], + context: ExecutionContext, + ) -> int: + index_name: str = f'{self.config.index_name}_{doc_type}' + context.logger.info(f'Indexing {len(paths)} files into {index_name}') + + await self.__prepare_elasticsearch_client(index_name) + await self.__setup_index(doc_type) + + documents = self.__load_documents_from_paths(paths) + return await self.__execute_bulk_indexing( + documents, 
index_name, context, + ) + + async def __prepare_elasticsearch_client(self, index_name: str) -> None: + if self.__es is None or self.__es.index_name != index_name: + if self.__es is not None: + await self.__es.close() + self.__es = ElasticsearchWrapper( + index_name=index_name, + host=self.config.host, + dry_run=self.config.dry_run, + ) + + async def __setup_index(self, doc_type: str) -> None: + if not self.config.append: + await self.__es.delete_index() + + mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type( + doc_type, + ) + if mapping: + await self.__es.create_index(mapping) + + async def __execute_bulk_indexing( + self, + documents: List[Dict[str, Any]], + index_name: str, + context: ExecutionContext, + ) -> int: + if not documents: + return 0 + + if not self.config.dry_run: + await self.__es.bulk_index(documents) + return len(documents) + + context.logger.info( + f'Dry-run: would index {len(documents)} docs to {index_name}', + ) + return 0 + + def __construct_indexing_result(self, document_count: int) -> IndexingResult: + return IndexingResult( + index_name=self.config.index_name, + document_count=document_count, + success=True, + ) + + def __construct_empty_result( + self, context: ExecutionContext, + ) -> IndexingResult: + context.logger.warning('No documents to index.') + return self.__construct_indexing_result(0) + + @staticmethod + def __group_documents_by_type( + input_data: List[ElasticDocuments], + ) -> Dict[str, List[Path]]: + docs_by_type: Dict[str, List[Path]] = {} + for doc_artifact in input_data: + doc_type: str = doc_artifact.path.parent.name + if doc_type not in docs_by_type: + docs_by_type[doc_type] = [] + docs_by_type[doc_type].append(doc_artifact.path) + return docs_by_type + + @staticmethod + def __load_documents_from_paths(paths: List[Path]) -> List[Dict[str, Any]]: + documents: List[Dict[str, Any]] = [] + for path in paths: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + 
documents.append(json.loads(line)) + return documents + + @staticmethod + def __get_mapping_for_type( + _doc_type: str, + ) -> Optional[Dict[str, Any]]: + return None diff --git a/preprocessor/steps/text/__init__.py b/preprocessor/steps/text/__init__.py new file mode 100644 index 000000000..ba7c25956 --- /dev/null +++ b/preprocessor/steps/text/__init__.py @@ -0,0 +1,15 @@ +from preprocessor.services.text.import_step import TranscriptionImportStep +from preprocessor.steps.text.analysis_step import TextAnalysisStep +from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.sound_events_step import SoundEventsStep +from preprocessor.steps.text.text_cleaning_step import TextCleaningStep +from preprocessor.steps.text.transcription_step import TranscriptionStep + +__all__ = [ + 'SoundEventsStep', + 'TextAnalysisStep', + 'TextCleaningStep', + 'TextEmbeddingStep', + 'TranscriptionImportStep', + 'TranscriptionStep', +] diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py new file mode 100644 index 000000000..f3c17054f --- /dev/null +++ b/preprocessor/steps/text/analysis_step.py @@ -0,0 +1,104 @@ +from datetime import datetime +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.config.step_configs import TextAnalysisConfig +from preprocessor.core.artifacts import ( + TextAnalysisResults, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) +from preprocessor.services.io.files import FileOperations +from preprocessor.services.text.text_statistics import TextStatistics + + +class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, 
input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[TextAnalysisResults]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + + def _process( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> TextAnalysisResults: + output_path = self._get_cache_path(input_data, context) + + text = self.__extract_transcription_text(input_data) + stats = self.__analyze_text_statistics(text) + result_data = self.__build_result_payload(stats, input_data) + + FileOperations.atomic_write_json(output_path, result_data) + + return self.__construct_analysis_results(input_data, output_path, result_data) + + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="text_analysis", + min_size_bytes=50, + ), + ] + + def _get_cache_path( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> Path: + return self._get_standard_cache_path(input_data, context) + + def _load_from_cache( + self, cache_path: Path, input_data: TranscriptionData, context: ExecutionContext, + ) -> TextAnalysisResults: + stats_data = FileOperations.load_json(cache_path) + return self.__construct_analysis_results(input_data, cache_path, stats_data) + + def __analyze_text_statistics(self, text: str) -> TextStatistics: + return TextStatistics.from_text(text, language=self.config.language) + + def __build_result_payload( + self, + stats: TextStatistics, + input_data: TranscriptionData, + ) -> Dict[str, Any]: + return { + 'metadata': { + 'episode_id': input_data.episode_id, + 'language': self.config.language, + 'source_file': input_data.path.name, + 'analyzed_at': datetime.now().isoformat(), + }, + **stats.to_dict(), + } + + @staticmethod + def __extract_transcription_text(input_data: TranscriptionData) -> str: + data = FileOperations.load_json(input_data.path) + segments = data.get('segments', []) + return ' 
'.join(seg.get('text', '').strip() for seg in segments if seg.get('text')) + + @staticmethod + def __construct_analysis_results( + input_data: TranscriptionData, + output_path: Path, + result_data: Dict[str, Any], + ) -> TextAnalysisResults: + return TextAnalysisResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + statistics=result_data, + ) diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py new file mode 100644 index 000000000..3baf62903 --- /dev/null +++ b/preprocessor/steps/text/embeddings_step.py @@ -0,0 +1,277 @@ +# pylint: disable=duplicate-code +from pathlib import Path +import re +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from preprocessor.config.step_configs import TextEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + + +class TextEmbeddingStep(PipelineStep[TranscriptionData, EmbeddingCollection, TextEmbeddingConfig]): + def __init__(self, config: TextEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading VLLM embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + self.config.batch_size, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: 
+ self.__model = None + context.logger.info('VLLM embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model = None + + def execute_batch( + self, input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + segments = self.__extract_valid_segments(input_data, context) + if not segments: + return self.__construct_embedding_collection( + input_data, output_path, 0, + ) + + self.__prepare_embedding_model() + context.logger.info(f'Generating text embeddings for {input_data.episode_id}') + + results = self.__process_text_embeddings(segments) + self.__save_embedding_results(results, output_path, input_data) + + return self.__construct_embedding_collection( + input_data, output_path, len(results), + ) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/text", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: TranscriptionData, context: ExecutionContext, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return self.__construct_embedding_collection( + input_data, + cache_path, + len(emb_data.get('text_embeddings', [])), + ) + + def __prepare_embedding_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + self.config.batch_size, + ) + + def __process_text_embeddings( + self, segments: List[Dict[str, Any]], 
+ ) -> List[Dict[str, Any]]: + full_text: str = ' '.join([seg.get('text', '') for seg in segments]) + sentences: List[str] = self.__split_into_sentences(full_text) + text_chunks, chunk_metadata = self.__create_text_chunks(sentences, segments) + return self.__batch_encode_chunks(text_chunks, chunk_metadata) + + def __create_text_chunks( + self, + sentences: List[str], + segments: List[Dict[str, Any]], + ) -> Tuple[List[str], List[Dict[str, Any]]]: + text_chunks: List[str] = [] + chunk_metadata: List[Dict[str, Any]] = [] + step: int = ( + self.config.text_sentences_per_chunk - self.config.text_chunk_overlap + ) + + for i in range(0, len(sentences), step): + chunk_sentences: List[str] = sentences[ + i : i + self.config.text_sentences_per_chunk + ] + if not chunk_sentences: + continue + + chunk_text: str = ' '.join(chunk_sentences).strip() + if not chunk_text: + continue + + char_start: int = sum((len(s) + 1 for s in sentences[:i])) + char_end: int = char_start + len(chunk_text) + start_seg_id: int = self.__find_segment_at_position(segments, char_start) + end_seg_id: int = self.__find_segment_at_position(segments, char_end) + + text_chunks.append(chunk_text) + chunk_metadata.append({ + 'segment_range': [start_seg_id, end_seg_id], + 'text': chunk_text, + }) + + return text_chunks, chunk_metadata + + def __batch_encode_chunks( + self, + text_chunks: List[str], + chunk_metadata: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + results: List[Dict[str, Any]] = [] + + if not self.__model: + raise RuntimeError("Embedding model not initialized") + + for i in range(0, len(text_chunks), self.config.batch_size): + batch_texts: List[str] = text_chunks[i : i + self.config.batch_size] + batch_meta: List[Dict[str, Any]] = chunk_metadata[ + i : i + self.config.batch_size + ] + batch_embeddings: List[List[float]] = self.__model.encode_text(batch_texts) + + for meta, embedding in zip(batch_meta, batch_embeddings): + results.append({**meta, 'embedding': embedding}) + + return results 
+ + def __save_embedding_results( + self, + results: List[Dict[str, Any]], + output_path: Path, + input_data: TranscriptionData, + ) -> None: + output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( + episode_info=input_data.episode_info, + processing_params=self.config.model_dump(), + statistics={ + 'total_embeddings': len(results), + 'embedding_dimension': len(results[0]['embedding']) if results else 0, + }, + results_key='text_embeddings', + results_data=results, + ) + FileOperations.atomic_write_json(output_path, output_data) + + def __construct_embedding_collection( + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='text', + ) + + @staticmethod + def __create_path_variables(input_data: TranscriptionData) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } + + @staticmethod + def __extract_valid_segments( + input_data: TranscriptionData, + context: ExecutionContext, + ) -> List[Dict[str, Any]]: + transcription: Dict[str, Any] = TextEmbeddingStep.__load_clean_transcription( + input_data, + ) + segments: List[Dict[str, Any]] = transcription.get('segments', []) + if not segments: + context.logger.warning( + f'No text segments for embedding in {input_data.episode_id}', + ) + return segments + + @staticmethod + def __load_clean_transcription(input_data: TranscriptionData) -> Dict[str, Any]: + raw_path: Path = input_data.path + clean_path: Path = ( + raw_path.parent.parent + / 'clean' + / raw_path.name.replace('.json', '_clean_transcription.json') + ) + if clean_path.exists(): + return FileOperations.load_json(clean_path) + return 
FileOperations.load_json(raw_path) + + @staticmethod + def __find_segment_at_position( + segments: List[Dict[str, Any]], char_pos: int, + ) -> int: + cumulative_length: int = 0 + for idx, seg in enumerate(segments): + seg_length: int = len(seg.get('text', '')) + 1 + if cumulative_length <= char_pos < cumulative_length + seg_length: + return idx + cumulative_length += seg_length + return len(segments) - 1 if segments else 0 + + @staticmethod + def __split_into_sentences(text: str) -> List[str]: + normalized_text: str = re.sub(r'\.{2,}', '.', text) + normalized_text = re.sub(r'!{2,}', '!', normalized_text) + normalized_text = re.sub(r'\?{2,}', '?', normalized_text) + sentences: List[str] = re.split(r'([.!?]+(?:\s+|$))', normalized_text) + raw: List[str] = [] + for i in range(0, len(sentences) - 1, 2): + s: str = (sentences[i] + sentences[i + 1]).strip() + if s: + raw.append(s) + if len(sentences) % 2 == 1 and sentences[-1].strip(): + raw.append(sentences[-1].strip()) + result: List[str] = [] + for sentence in raw: + if len(sentence) < 30 and result: + result[-1] = result[-1] + ' ' + sentence + else: + result.append(sentence) + return result diff --git a/preprocessor/steps/text/episode_name_embedding_step.py b/preprocessor/steps/text/episode_name_embedding_step.py new file mode 100644 index 000000000..4e700fdf0 --- /dev/null +++ b/preprocessor/steps/text/episode_name_embedding_step.py @@ -0,0 +1,165 @@ +# pylint: disable=duplicate-code +from datetime import datetime +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.step_configs import EpisodeNameEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from 
preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + + +class EpisodeNameEmbeddingStep( + PipelineStep[TranscriptionData, EmbeddingCollection, EpisodeNameEmbeddingConfig], +): + def __init__(self, config: EpisodeNameEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model = None + context.logger.info('Embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model = None + + def execute_batch( + self, + input_data: List[TranscriptionData], + context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + title = input_data.episode_info.title + if not title: + context.logger.warning( + f'No title for episode name embedding in {input_data.episode_id}', + ) + return self.__build_collection(input_data, output_path, 0) + + self.__ensure_model() + context.logger.info(f'Generating episode name embedding for {input_data.episode_id}') + + embedding: List[float] = self.__model.encode_text(title) # type: ignore[assignment,union-attr] + self.__save_result(embedding, title, output_path, input_data) + + return self.__build_collection(input_data, output_path, 1) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( 
+ pattern="{season}/{episode}.json", + subdir="embeddings/episode_names", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_vars(input_data), + ) + + def _load_from_cache( + self, + cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + data: Dict[str, Any] = FileOperations.load_json(cache_path) + count = 1 if data.get('title_embedding') else 0 + return self.__build_collection(input_data, cache_path, count) + + def __ensure_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def __save_result( + self, + embedding: List[float], + title: str, + output_path: Path, + input_data: TranscriptionData, + ) -> None: + episode_info = input_data.episode_info + output_data: Dict[str, Any] = { + 'generated_at': datetime.now().isoformat(), + 'processing_parameters': self.config.model_dump(), + 'episode_id': input_data.episode_id, + 'title': title, + 'title_embedding': embedding, + 'episode_metadata': { + 'season': episode_info.season, + 'episode_number': episode_info.relative_episode, + 'title': title, + 'premiere_date': episode_info.premiere_date, + 'series_name': episode_info.series_name, + 'viewership': episode_info.viewership, + }, + } + FileOperations.atomic_write_json(output_path, output_data) + + def __build_collection( + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='episode_name', + ) + + @staticmethod + def __create_path_vars(input_data: TranscriptionData) -> 
Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } diff --git a/preprocessor/steps/text/full_episode_embedding_step.py b/preprocessor/steps/text/full_episode_embedding_step.py new file mode 100644 index 000000000..8ec29d60a --- /dev/null +++ b/preprocessor/steps/text/full_episode_embedding_step.py @@ -0,0 +1,224 @@ +# pylint: disable=duplicate-code +from datetime import datetime +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +import numpy as np + +from preprocessor.config.step_configs import FullEpisodeEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + + +class FullEpisodeEmbeddingStep( + PipelineStep[TranscriptionData, EmbeddingCollection, FullEpisodeEmbeddingConfig], +): + def __init__(self, config: FullEpisodeEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model = None + context.logger.info('Embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model = None + + def execute_batch( + 
self, + input_data: List[TranscriptionData], + context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + full_text = self.__build_full_text(input_data, context) + if not full_text: + return self.__build_collection(input_data, output_path, 0) + + self.__ensure_model() + context.logger.info(f'Generating full episode embedding for {input_data.episode_id}') + + embedding = self.__embed_full_text(full_text) + self.__save_result(embedding, full_text, output_path, input_data) + + return self.__build_collection(input_data, output_path, 1) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/full_episode", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_vars(input_data), + ) + + def _load_from_cache( + self, + cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + data: Dict[str, Any] = FileOperations.load_json(cache_path) + count = 1 if data.get('full_episode_embedding') else 0 + return self.__build_collection(input_data, cache_path, count) + + def __ensure_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def __embed_full_text(self, full_text: str) -> List[float]: + if len(full_text) <= self.config.max_chars_per_chunk: + embedding: List[float] = self.__model.encode_text(full_text) # type: ignore[assignment,union-attr] + return embedding + return self.__sliding_window_embed(full_text) + + def __sliding_window_embed(self, 
full_text: str) -> List[float]: + chunks, weights = self.__build_chunks_and_weights(full_text) + if not self.__model: + raise RuntimeError("Embedding model not initialized") + + embeddings: List[List[float]] = self.__model.encode_text(chunks) # type: ignore[assignment] + total_weight = sum(weights) + normalized_weights = [w / total_weight for w in weights] + + dim = len(embeddings[0]) + avg: np.ndarray = np.zeros(dim, dtype=np.float64) + for emb, w in zip(embeddings, normalized_weights): + avg += np.array(emb, dtype=np.float64) * w + + norm = float(np.linalg.norm(avg)) + if norm > 0: + avg /= norm + + return avg.tolist() + + def __build_chunks_and_weights( + self, + full_text: str, + ) -> Tuple[List[str], List[float]]: + chunks: List[str] = [] + weights: List[float] = [] + step = self.config.max_chars_per_chunk - self.config.overlap_chars + pos = 0 + + while pos < len(full_text): + chunk = full_text[pos : pos + self.config.max_chars_per_chunk] + if len(chunk) >= self.config.min_chunk_length: + chunks.append(chunk) + weights.append(len(chunk) / self.config.max_chars_per_chunk) + pos += step + + return chunks, weights + + def __save_result( + self, + embedding: List[float], + full_text: str, + output_path: Path, + input_data: TranscriptionData, + ) -> None: + output_data: Dict[str, Any] = { + 'generated_at': datetime.now().isoformat(), + 'episode_info': { + 'season': input_data.episode_info.season, + 'episode_number': input_data.episode_info.relative_episode, + }, + 'processing_parameters': self.config.model_dump(), + 'statistics': { + 'transcript_length': len(full_text), + 'embedding_dimension': len(embedding), + }, + 'full_episode_embedding': { + 'text': full_text, + 'embedding': embedding, + 'transcript_length': len(full_text), + }, + } + FileOperations.atomic_write_json(output_path, output_data) + + def __build_collection( + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return 
MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='full_episode', + ) + + @staticmethod + def __create_path_vars(input_data: TranscriptionData) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } + + @staticmethod + def __build_full_text( + input_data: TranscriptionData, + context: ExecutionContext, + ) -> str: + data: Dict[str, Any] = FileOperations.load_json(input_data.path) + segments: List[Dict[str, Any]] = data.get('segments', []) + if not segments: + context.logger.warning( + f'No text segments for full episode embedding in {input_data.episode_id}', + ) + return '' + return ' '.join(s.get('text', '') for s in segments).strip() diff --git a/preprocessor/steps/text/segment_filter_step.py b/preprocessor/steps/text/segment_filter_step.py new file mode 100644 index 000000000..1f3c5c669 --- /dev/null +++ b/preprocessor/steps/text/segment_filter_step.py @@ -0,0 +1,148 @@ +from abc import abstractmethod +from pathlib import Path +import re +from typing import ( + Any, + Dict, + Generic, + List, + Optional, + Tuple, + TypeVar, +) + +from preprocessor.config.step_configs import SegmentFilterConfig +from preprocessor.config.types import ( + WordKeys, + WordTypeValues, +) +from preprocessor.core.artifacts import TranscriptionData +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import JsonFileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.transcription.sound_classification import ( + classify_segment, + is_sound_event, +) + +_ConfigT = TypeVar('_ConfigT', bound=SegmentFilterConfig) + +_SOUND_EVENT_PATTERN = re.compile(r'^\s*\(.*\)\s*$') + + +class 
class SegmentFilterStep(
    PipelineStep[TranscriptionData, TranscriptionData, _ConfigT],
    Generic[_ConfigT],
):
    """Base step that rewrites a transcription by filtering its segments.

    Concrete subclasses supply the output format tag, the output
    sub-directory and the per-segment decision (``_process_segment``).
    This class handles loading the input JSON, applying the decision to
    every segment and writing the filtered document.
    """

    @property
    @abstractmethod
    def _output_format(self) -> str:
        """Value stored in the ``format`` field of the produced artifact."""

    @property
    @abstractmethod
    def _output_subdir(self) -> str:
        """Sub-directory passed to the JSON output descriptor."""

    @abstractmethod
    def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return the segments (zero or more) to keep for ``segment``."""

    @property
    def supports_batch_processing(self) -> bool:
        return True

    def execute_batch(
        self, input_data: List[TranscriptionData], context: ExecutionContext,
    ) -> List[TranscriptionData]:
        """Run ``execute`` for every episode through the shared thread-pool helper."""
        worker_count = self.config.max_parallel_episodes
        return self._execute_with_threadpool(
            input_data, context, worker_count, self.execute,
        )

    def _process(
        self, input_data: TranscriptionData, context: ExecutionContext,
    ) -> TranscriptionData:
        """Filter one transcription file and write the result to the cache path."""
        destination = self._get_cache_path(input_data, context)
        source_document = FileOperations.load_json(input_data.path)
        FileOperations.atomic_write_json(
            destination, self.__apply_filter(source_document),
        )
        return self.__build_artifact(input_data, destination)

    def get_output_descriptors(self) -> List[JsonFileOutput]:
        descriptor = JsonFileOutput(
            pattern="{season}/{episode}.json",
            subdir=self._output_subdir,
            min_size_bytes=10,
        )
        return [descriptor]

    def _get_cache_path(
        self, input_data: TranscriptionData, context: ExecutionContext,
    ) -> Path:
        return self._get_standard_cache_path(input_data, context)

    def _load_from_cache(
        self,
        cache_path: Path,
        input_data: TranscriptionData,
        context: ExecutionContext,
    ) -> TranscriptionData:
        # The cached file is not re-read; the artifact only points at it.
        return self.__build_artifact(input_data, cache_path)

    @staticmethod
    def _classify(segment: Dict[str, Any]) -> str:
        """Classify a segment, falling back to a text pattern when it has no words.

        With word-level data the decision is delegated to
        ``classify_segment``. Without it, text that matches
        ``_SOUND_EVENT_PATTERN`` is a ``'sound_event'``; anything else is
        ``'dialogue'``.
        """
        if segment.get(WordKeys.WORDS, []):
            return classify_segment(segment)

        stripped_text = segment.get('text', '').strip()
        if _SOUND_EVENT_PATTERN.match(stripped_text):
            return 'sound_event'
        return 'dialogue'

    @staticmethod
    def _split_mixed(
        segment: Dict[str, Any],
    ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
        """Split a segment into a dialogue part and a sound-event part.

        Returns:
            ``(dialogue_part, sound_part)``; either element is ``None``
            when no word falls into that group.
        """
        all_words = segment.get(WordKeys.WORDS, [])
        ignored_types = (WordTypeValues.SPACING, '')

        sound_words = [word for word in all_words if is_sound_event(word)]
        dialogue_words = [
            word for word in all_words
            if not is_sound_event(word) and word.get(WordKeys.TYPE) not in ignored_types
        ]

        dialogue_part = None
        if dialogue_words:
            dialogue_part = SegmentFilterStep.__make_sub_segment(segment, dialogue_words)

        sound_part = None
        if sound_words:
            sound_part = SegmentFilterStep.__make_sub_segment(segment, sound_words)

        return dialogue_part, sound_part

    @staticmethod
    def __make_sub_segment(
        segment: Dict[str, Any],
        words: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Copy ``segment`` with text and time bounds recomputed from ``words``.

        ``words`` must be non-empty: the first and last entries provide
        the start and end, falling back to the parent segment's values.
        """
        joined_text = ' '.join(
            word.get(WordKeys.TEXT, word.get(WordKeys.WORD, '')) for word in words
        )
        first_word, last_word = words[0], words[-1]

        sub_segment = dict(segment)
        sub_segment.update({
            'start': first_word.get(WordKeys.START, segment.get('start')),
            'end': last_word.get(WordKeys.END, segment.get('end')),
            'text': joined_text.strip(),
            WordKeys.WORDS: words,
        })
        return sub_segment

    def __apply_filter(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``data`` whose ``segments`` went through ``_process_segment``."""
        kept: List[Dict[str, Any]] = [
            output_segment
            for segment in data.get('segments', [])
            for output_segment in self._process_segment(segment)
        ]
        return {**data, 'segments': kept}

    def __build_artifact(self, input_data: TranscriptionData, path: Path) -> TranscriptionData:
        """Describe the filtered file at ``path`` using the input's episode metadata."""
        return TranscriptionData(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=path,
            language=input_data.language,
            model=input_data.model,
            format=self._output_format,
        )
class SoundEventEmbeddingStep(
    PipelineStep[TranscriptionData, EmbeddingCollection, SoundEventEmbeddingConfig],
):
    """Create text embeddings for groups of sound-event segments of an episode.

    Segments are read from the input transcription JSON, grouped
    ``config.segments_per_embedding`` at a time, encoded in batches of
    ``config.batch_size`` and stored under the
    ``sound_event_embeddings`` key of the output document.
    """

    def __init__(self, config: SoundEventEmbeddingConfig) -> None:
        super().__init__(config)
        # Created lazily by setup_resources() or on first use in _process().
        self.__model: Optional[EmbeddingModelWrapper] = None

    @property
    def supports_batch_processing(self) -> bool:
        return True

    def setup_resources(self, context: ExecutionContext) -> None:
        """Load the embedding model once, logging the model name."""
        if self.__model is not None:
            return
        context.logger.info(f'Loading embedding model: {self.config.model_name}')
        self.__ensure_model()

    def teardown_resources(self, context: ExecutionContext) -> None:
        """Drop the model reference and log that it was unloaded."""
        if not self.__model:
            return
        self.__model = None
        context.logger.info('Embedding model unloaded')

    def cleanup(self) -> None:
        """Drop the model reference without logging."""
        if self.__model:
            self.__model = None

    def execute_batch(
        self,
        input_data: List[TranscriptionData],
        context: ExecutionContext,
    ) -> List[EmbeddingCollection]:
        return self._execute_sequential(input_data, context, self.execute)

    def _process(
        self,
        input_data: TranscriptionData,
        context: ExecutionContext,
    ) -> EmbeddingCollection:
        """Embed one episode's sound events and write the output document.

        An episode without segments still gets an output file, containing
        an empty result list.
        """
        output_path = self._get_cache_path(input_data, context)
        segments = self.__load_segments(input_data, context)

        embedded: List[Dict[str, Any]] = []
        if segments:
            self.__ensure_model()
            context.logger.info(
                f'Generating sound event embeddings for {input_data.episode_id}',
            )
            embedded = self.__process_chunks(segments)

        self.__save_results(embedded, output_path, input_data)
        return self.__build_collection(input_data, output_path, len(embedded))

    def get_output_descriptors(self) -> List[FileOutput]:
        descriptor = FileOutput(
            pattern="{season}/{episode}.json",
            subdir="embeddings/sound_events",
            min_size_bytes=1024,
        )
        return [descriptor]

    def _get_cache_path(
        self,
        input_data: TranscriptionData,
        context: ExecutionContext,
    ) -> Path:
        path_vars = self.__create_path_vars(input_data)
        return self._resolve_output_path(0, context, path_vars)

    def _load_from_cache(
        self,
        cache_path: Path,
        input_data: TranscriptionData,
        context: ExecutionContext,
    ) -> EmbeddingCollection:
        """Rebuild the collection from a cached file, counting its stored embeddings."""
        cached: Dict[str, Any] = FileOperations.load_json(cache_path)
        stored = cached.get('sound_event_embeddings', [])
        return self.__build_collection(input_data, cache_path, len(stored))

    def __ensure_model(self) -> None:
        """Instantiate the embedding model if it does not exist yet."""
        if self.__model is not None:
            return
        self.__model = EmbeddingModelWrapper(
            self.config.model_name,
            self.config.device,
            self.config.batch_size,
        )

    def __process_chunks(
        self,
        segments: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Group segments into chunks and attach an embedding to each chunk.

        Raises:
            RuntimeError: if the model has not been initialised.
        """
        chunks = self.__group_segments(segments)
        model = self.__model
        if not model:
            raise RuntimeError("Embedding model not initialized")

        embedded: List[Dict[str, Any]] = []
        for offset in range(0, len(chunks), self.config.batch_size):
            batch = chunks[offset : offset + self.config.batch_size]
            vectors: List[List[float]] = model.encode_text(
                [chunk['text'] for chunk in batch],
            )
            embedded.extend(
                {**chunk, 'embedding': vector}
                for chunk, vector in zip(batch, vectors)
            )
        return embedded

    def __group_segments(
        self,
        segments: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Bundle consecutive segments into chunks that carry joined text.

        Chunks whose joined text is empty are skipped. ``sound_types``
        holds the lower-cased, sorted contents of every parenthesised
        group found in the chunk's segment texts.
        """
        group_size = self.config.segments_per_embedding
        grouped: List[Dict[str, Any]] = []

        for first_index in range(0, len(segments), group_size):
            members = segments[first_index : first_index + group_size]
            joined_text = ' '.join(member.get('text', '') for member in members).strip()
            # An empty slice also yields empty text, so one check covers both cases.
            if not joined_text:
                continue

            sound_types: Set[str] = {
                found.group(1).strip().lower()
                for member in members
                for found in _SOUND_TYPE_PATTERN.finditer(member.get('text', ''))
            }
            last_index = first_index + len(members) - 1

            grouped.append({
                'segment_range': [first_index, last_index],
                'text': joined_text,
                'sound_types': sorted(sound_types),
                'start_time': members[0].get('start', 0.0),
                'end_time': members[-1].get('end', 0.0),
            })

        return grouped

    def __save_results(
        self,
        results: List[Dict[str, Any]],
        output_path: Path,
        input_data: TranscriptionData,
    ) -> None:
        """Write the embeddings plus processing metadata to ``output_path``."""
        dimension = len(results[0]['embedding']) if results else 0
        document: Dict[str, Any] = MetadataBuilder.create_processing_metadata(
            episode_info=input_data.episode_info,
            processing_params=self.config.model_dump(),
            statistics={
                'total_embeddings': len(results),
                'embedding_dimension': dimension,
            },
            results_key='sound_event_embeddings',
            results_data=results,
        )
        FileOperations.atomic_write_json(output_path, document)

    def __build_collection(
        self,
        input_data: TranscriptionData,
        output_path: Path,
        embedding_count: int,
    ) -> EmbeddingCollection:
        return MetadataBuilder.create_embedding_collection(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=output_path,
            model_name=self.config.model_name,
            embedding_count=embedding_count,
            embedding_type='sound_events',
        )

    @staticmethod
    def __create_path_vars(input_data: TranscriptionData) -> Dict[str, str]:
        """Build the ``season``/``episode`` values used in the output pattern."""
        episode_info = input_data.episode_info
        return {
            "season": f"S{episode_info.season:02d}",
            "episode": episode_info.episode_code(),
        }

    @staticmethod
    def __load_segments(
        input_data: TranscriptionData,
        context: ExecutionContext,
    ) -> List[Dict[str, Any]]:
        """Read the ``segments`` list of the input file, warning when it is empty."""
        document: Dict[str, Any] = FileOperations.load_json(input_data.path)
        segments: List[Dict[str, Any]] = document.get('segments', [])
        if segments:
            return segments
        context.logger.warning(
            f'No sound event segments for embedding in {input_data.episode_id}',
        )
        return segments
class TranscriptionStep(
    BaseTranscriptionStep[TranscodedVideo, TranscriptionConfig],
):
    """Transcribe a transcoded video and store JSON, simplified JSON, SRT and TXT outputs."""

    def __init__(self, config: TranscriptionConfig) -> None:
        super().__init__(config)
        # Built lazily; see setup_resources() and _process().
        self.__engine: Optional[TranscriptionEngine] = None

    @property
    def supports_batch_processing(self) -> bool:
        return True

    def setup_resources(self, context: ExecutionContext) -> None:
        """Create the transcription engine if it does not exist yet."""
        if self.__engine is not None:
            return
        self.__engine = self.__create_engine(context)

    def teardown_resources(self, context: ExecutionContext) -> None:
        """Clean up and drop the engine, logging that it was unloaded."""
        if not self.__engine:
            return
        self.__engine.cleanup()
        self.__engine = None
        context.logger.info('Transcription engine unloaded')

    def execute_batch(
        self, input_data: List[TranscodedVideo], context: ExecutionContext,
    ) -> List[TranscriptionData]:
        worker_count = self.config.max_parallel_episodes
        return self._execute_with_threadpool(
            input_data, context, worker_count, self.execute,
        )

    def _process(
        self, input_data: TranscodedVideo, context: ExecutionContext,
    ) -> TranscriptionData:
        """Transcribe one video, write all output formats and return the artifact."""
        output_path = self._get_cache_path(input_data, context)

        # Fallback for callers that did not run setup_resources() first.
        if self.__engine is None:
            self.__engine = self.__create_engine(context)

        transcription = self.__transcribe_and_save(input_data, output_path, context)
        self.__save_additional_formats(output_path, transcription)

        return self.__construct_result_artifact(output_path, input_data, transcription)

    def _load_from_cache(
        self,
        cache_path: Path,
        input_data: TranscodedVideo,
        context: ExecutionContext,
    ) -> TranscriptionData:
        """Return the artifact for a cached transcription, regenerating missing side files."""
        self.__ensure_additional_formats(cache_path)
        return TranscriptionData(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=cache_path,
            language=self.config.language,
            model=self.config.model,
            format='json',
        )

    def __create_engine(self, context: ExecutionContext) -> TranscriptionEngine:
        """Pick the engine from ``config.mode``: ElevenLabs for its aliases, Whisper otherwise."""
        settings = self.config
        if settings.mode in {'11labs', 'elevenlabs'}:
            context.logger.info('Creating ElevenLabs transcription engine')
            return ElevenLabsEngine(logger=context.logger)

        context.logger.info(f'Loading Whisper model: {settings.model}')
        return WhisperEngine(
            model_name=settings.model,
            language=settings.language,
            device=settings.device,
            beam_size=settings.beam_size,
            temperature=settings.temperature,
            max_chunk_duration_seconds=settings.max_chunk_duration_seconds,
        )

    def __transcribe_and_save(
        self,
        input_data: TranscodedVideo,
        output_path: Path,
        context: ExecutionContext,
    ) -> Dict[str, Any]:
        """Run the engine, attach episode metadata and write the main JSON file.

        On any exception the error is logged, a partially written output
        file is removed and the exception is re-raised.

        Raises:
            RuntimeError: if no engine has been created (logged and re-raised
                like any other failure).
        """
        try:
            engine = self.__engine
            if engine is None:
                raise RuntimeError('Transcription engine not initialized')

            transcription: Dict[str, Any] = engine.transcribe(input_data.path)
            transcription['episode_info'] = EpisodeManager.get_metadata(
                input_data.episode_info,
            )
            FileOperations.atomic_write_json(output_path, transcription)
        except Exception as e:
            context.logger.error(
                f'Transcription failed for {input_data.episode_id}: {e}',
            )
            if output_path.exists():
                output_path.unlink()
            raise
        return transcription

    @staticmethod
    def __save_additional_formats(output_path: Path, data: Dict[str, Any]) -> None:
        """Write ``<stem>_simple.json``, ``<stem>.srt`` and ``<stem>.txt`` next to the main file."""
        stem = output_path.stem
        directory = output_path.parent

        # Each renderer is invoked right before its file is written, so a
        # failure leaves the earlier files in place, in this order.
        renderers = (
            (
                f'{stem}_simple.json',
                lambda: json.dumps(
                    JsonGenerator.convert_to_simple_format(data),
                    indent=2,
                    ensure_ascii=False,
                ),
            ),
            (f'{stem}.srt', lambda: SrtGenerator.convert_to_srt_format(data)),
            (f'{stem}.txt', lambda: TxtGenerator.convert_to_txt_format(data)),
        )
        for filename, render in renderers:
            (directory / filename).write_text(render(), encoding='utf-8')

    @staticmethod
    def __ensure_additional_formats(cache_path: Path) -> None:
        """Regenerate all side files when at least one of them is missing."""
        stem = cache_path.stem
        directory = cache_path.parent
        expected_names = (f'{stem}_simple.json', f'{stem}.srt', f'{stem}.txt')

        if all((directory / name).exists() for name in expected_names):
            return

        cached = FileOperations.load_json(cache_path)
        TranscriptionStep.__save_additional_formats(cache_path, cached)

    def __construct_result_artifact(
        self,
        output_path: Path,
        input_data: TranscodedVideo,
        result: Dict[str, Any],
    ) -> TranscriptionData:
        """Build the artifact, preferring the language reported in ``result``."""
        language = result.get('language', self.config.language)
        return TranscriptionData(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=output_path,
            language=language,
            model=self.config.model,
            format='json',
        )
class ValidationStep(PipelineStep[ElasticDocuments, ValidationResult, ValidationConfig]):
    """Run the season validator for the episode's season and report where its output lives."""

    @property
    def supports_batch_processing(self) -> bool:
        return True

    @property
    def uses_caching(self) -> bool:
        return False

    def execute_batch(
        self, input_data: List[ElasticDocuments], context: ExecutionContext,
    ) -> List[ValidationResult]:
        worker_count = self.config.max_parallel_episodes
        return self._execute_with_threadpool(
            input_data, context, worker_count, self.execute,
        )

    def _process(
        self,
        input_data: ElasticDocuments,
        context: ExecutionContext,
    ) -> ValidationResult:
        """Validate the season of ``input_data`` and return the report location.

        Raises:
            RuntimeError: if the validator returns a non-zero exit code.
        """
        season_code = input_data.episode_info.season_code()
        context.logger.info(f"Starting validation for season {season_code}")

        validator = self.__create_validator(season_code, context)
        exit_code = validator.validate()
        if exit_code != 0:
            raise RuntimeError(f"Validation failed with exit code {exit_code}")

        context.logger.info("Validation completed successfully")

        return ValidationResult(
            season=season_code,
            validation_report_dir=validator.validation_reports_dir,
        )

    def __create_validator(self, season: str, context: ExecutionContext) -> Validator:
        """Build a validator from the step configuration and execution context."""
        settings = self.config
        return Validator(
            season=season,
            series_name=context.series_name,
            anomaly_threshold=settings.anomaly_threshold,
            base_output_dir=context.base_output_dir,
            episodes_info_json=settings.episodes_info_json,
        )
def _load_from_cache(
    self, cache_path: Path, input_data: SceneCollection, context: ExecutionContext,
) -> FrameCollection:
    """Rebuild the frame collection from a previously written metadata file.

    ``cache_path`` is the frame-metadata JSON; its parent directory is
    reported as the frame directory and ``statistics.total_frames`` as
    the frame count.
    """
    with open(cache_path, 'r', encoding='utf-8') as handle:
        cached_metadata = json.load(handle)

    total_frames = cached_metadata['statistics']['total_frames']
    return FrameCollection(
        episode_id=input_data.episode_id,
        episode_info=input_data.episode_info,
        directory=cache_path.parent,
        frame_count=total_frames,
        metadata_path=cache_path,
    )
def __extract_frames(
    self,
    video_file: Path,
    frame_requests: List[FrameRequest],
    episode_dir: Path,
    episode_info,
    context: ExecutionContext,
) -> float:
    """Extract, resize and save every requested frame of ``video_file``.

    Requests are first snapped to keyframes and de-duplicated; the
    remaining ones are processed on a thread pool of
    ``config.max_parallel_frames`` workers.

    Returns:
        The frame rate derived from the video stream metadata.
    """
    stream_metadata = self.__fetch_video_metadata(video_file)
    aspect_ratio = self.__calculate_display_aspect_ratio(stream_metadata)
    fps = self.__get_fps(stream_metadata)

    keyframes = self.__get_all_keyframes(video_file)
    context.logger.info(f'Found {len(keyframes)} I-frames in {video_file.name}')

    pending_requests = self.__snap_and_deduplicate(frame_requests, keyframes, fps, context)

    with ThreadPoolExecutor(max_workers=self.config.max_parallel_frames) as pool:
        submitted = []
        for request in pending_requests:
            submitted.append(
                pool.submit(
                    self.__extract_resize_save_frame,
                    video_file,
                    request['timestamp'],
                    request['frame_number'],
                    episode_dir,
                    episode_info,
                    aspect_ratio,
                    context.series_name,
                ),
            )
        # result() re-raises the first worker exception, in submission order.
        for task in submitted:
            task.result()

    return fps
def __resize_frame(
    self, frame: Image.Image, display_aspect_ratio: float,
) -> Image.Image:
    """Fit ``frame`` into the configured resolution, padding with black bars.

    When the display aspect ratio is within 0.01 of the target ratio the
    frame is simply resized to the target size. Otherwise it is scaled
    to fill the target width (wider source) or the target height
    (narrower source) and centred on a black canvas.
    """
    resolution = self.config.resolution
    target_width = resolution.width
    target_height = resolution.height
    canvas_size = (target_width, target_height)
    target_aspect = target_width / target_height

    if abs(display_aspect_ratio - target_aspect) < 0.01:
        return frame.resize(canvas_size, Image.Resampling.LANCZOS)

    if display_aspect_ratio > target_aspect:
        # Wider than the target: full width, bars above and below.
        scaled_size = (target_width, int(target_width / display_aspect_ratio))
    else:
        # Narrower than the target: full height, bars left and right.
        scaled_size = (int(target_height * display_aspect_ratio), target_height)

    # One of the two offsets is always zero because that dimension is filled.
    paste_offset = (
        (target_width - scaled_size[0]) // 2,
        (target_height - scaled_size[1]) // 2,
    )

    scaled = frame.resize(scaled_size, Image.Resampling.LANCZOS)
    canvas = Image.new('RGB', canvas_size, (0, 0, 0))
    canvas.paste(scaled, paste_offset)
    return canvas
'total_frames': len(frame_requests), + 'frame_types': frame_types_count, + 'total_scenes': len(scene_numbers), + 'timestamp_range': { + 'start': min( + (f.get('timestamp', 0) for f in frame_requests), default=0, + ), + 'end': max( + (f.get('timestamp', 0) for f in frame_requests), default=0, + ), + }, + }, + 'frames': frames_with_paths, + } + FileOperations.atomic_write_json(metadata_file, metadata, indent=2) + + @staticmethod + def __prepare_episode_directory( + episode_dir: Path, context: ExecutionContext, + ) -> None: + if episode_dir.exists(): + context.logger.info( + f'Cleaning incomplete frames from previous run: {episode_dir}', + ) + shutil.rmtree(episode_dir, ignore_errors=True) + episode_dir.mkdir(parents=True, exist_ok=True) + + @staticmethod + def __construct_empty_result( + episode_dir: Path, + metadata_file: Path, + input_data: SceneCollection, + context: ExecutionContext, + ) -> FrameCollection: + context.logger.warning(f'No frames to extract for {input_data.episode_id}') + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=0, + metadata_path=metadata_file, + ) + + @staticmethod + def __get_fps(stream: Dict[str, Any]) -> float: + r_frame_rate: str = stream.get('r_frame_rate', '25/1') + parts = r_frame_rate.split('/') + num, denom = int(parts[0]), int(parts[1]) if len(parts) > 1 else 1 + return num / denom if denom != 0 else 25.0 + + @staticmethod + def __fetch_video_metadata(video_path: Path) -> Dict[str, Any]: + probe_data = FFmpegWrapper.probe_video(video_path) + streams: List[Dict[str, Any]] = probe_data.get('streams', []) + + video_streams = [s for s in streams if s.get('codec_type') == 'video'] + if not video_streams: + raise ValueError(f'No video streams found in {video_path}') + return video_streams[0] + + @staticmethod + def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: + width = metadata.get('width', 0) + height = 
metadata.get('height', 0) + + if width == 0 or height == 0: + raise ValueError('Invalid video dimensions') + + sar_str = metadata.get('sample_aspect_ratio', '1:1') + if sar_str == 'N/A' or not sar_str: + sar_str = '1:1' + + try: + sar_num, sar_denom = [int(x) for x in sar_str.split(':')] + sar = sar_num / sar_denom if sar_denom != 0 else 1.0 + except (ValueError, ZeroDivisionError): + sar = 1.0 + + return width / height * sar + + @staticmethod + def __get_all_keyframes(video_file: Path) -> List[float]: + return sorted(FFmpegWrapper.get_keyframe_timestamps(video_file)) + + @staticmethod + def __snap_to_keyframe( + keyframes: List[float], + target_timestamp: float, + ) -> float: + if not keyframes: + return target_timestamp + + idx = bisect.bisect_left(keyframes, target_timestamp) + + if idx < len(keyframes): + return keyframes[idx] + + return keyframes[-1] diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py new file mode 100644 index 000000000..bd83a5cbe --- /dev/null +++ b/preprocessor/steps/video/scene_detection_step.py @@ -0,0 +1,146 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.config.step_configs import SceneDetectionConfig +from preprocessor.core.artifacts import ( + SceneCollection, + TranscodedVideo, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) +from preprocessor.services.io.files import FileOperations +from preprocessor.services.media.scene_detection import TransNetWrapper + + +class SceneDetectorStep(PipelineStep[TranscodedVideo, SceneCollection, SceneDetectionConfig]): + def __init__(self, config: SceneDetectionConfig) -> None: + super().__init__(config) + self.__transnet = TransNetWrapper() + self.__model_loaded = False + + @property + def supports_batch_processing(self) -> bool: + 
def _process(
    self, input_data: TranscodedVideo, context: ExecutionContext,
) -> SceneCollection:
    """Detect the scenes of one video and write them, with video info, to the cache path.

    The model is loaded on demand before detection. The video info is
    queried after detection and stored next to the scene list.
    """
    destination = self._get_cache_path(input_data, context)
    video_path = input_data.path

    self.__prepare_detection_environment(context)
    detected_scenes = self.__detect_scenes(video_path)

    # The output payload embeds the video info alongside the scenes.
    self.__save_detection_results(
        detected_scenes,
        self.__transnet.get_video_info(video_path),
        destination,
    )

    return self.__construct_scene_collection(destination, input_data, detected_scenes)
__prepare_detection_environment(self, context: ExecutionContext) -> None: + if not self.__model_loaded: + context.logger.info('Loading TransNetV2 model...') + self.__transnet.load_model() + self.__model_loaded = True + + def __detect_scenes(self, video_path: Path) -> List[Dict[str, Any]]: + return self.__transnet.detect_scenes( + video_path, + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) + + def __save_detection_results( + self, + scenes: List[Dict[str, Any]], + video_info: Dict[str, Any], + output_path: Path, + ) -> None: + output_data = self.__build_results_payload(scenes, video_info) + FileOperations.atomic_write_json(output_path, output_data) + + def __build_results_payload( + self, + scenes: List[Dict[str, Any]], + video_info: Dict[str, Any], + ) -> Dict[str, Any]: + return { + 'total_scenes': len(scenes), + 'video_info': video_info, + 'detection_settings': { + 'threshold': self.config.threshold, + 'min_scene_len': self.config.min_scene_len, + 'method': 'transnetv2', + }, + 'scenes': scenes, + } + + def __construct_scene_collection( + self, + output_path: Path, + input_data: TranscodedVideo, + scenes: List[Dict[str, Any]], + ) -> SceneCollection: + return SceneCollection( + path=output_path, + video_path=input_data.path, + source_video_path=getattr(input_data, 'source_video_path', input_data.path), + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + scenes=scenes, + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py new file mode 100644 index 000000000..f1b70a3a0 --- /dev/null +++ b/preprocessor/steps/video/transcoding_step.py @@ -0,0 +1,283 @@ +from dataclasses import replace +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.config.step_configs import TranscodeConfig +from preprocessor.core.artifacts import ( + SourceVideo, 
+ TranscodedVideo, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.core.temp_files import StepTempFile +from preprocessor.services.media.ffmpeg import FFmpegWrapper +from preprocessor.services.media.transcode_params import TranscodeParams + + +class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): + __CODEC_EFFICIENCY: Dict[str, float] = { + 'h264': 1.0, 'avc': 1.0, + 'hevc': 2.0, 'h265': 2.0, + 'vp9': 2.85, 'av1': 4.0, + } + __TARGET_FRAMERATE: float = 25.0 + __command_logged: bool = False + + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[SourceVideo], context: ExecutionContext, + ) -> List[TranscodedVideo]: + total = len(input_data) + parallel = min(self.config.max_parallel_episodes, total) + context.logger.info( + f'Transcoding {total} videos (processing {parallel} in parallel)', + ) + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> TranscodedVideo: + output_path = self._get_cache_path(input_data, context) + + probe_data = FFmpegWrapper.probe_video(input_data.path) + params = self.__create_transcode_params(input_data, output_path, probe_data, context) + + self.__log_transcode_details(context, input_data, params, probe_data) + self.__execute_ffmpeg_process(context, params, input_data.episode_id) + + return self.__construct_result_artifact(output_path, input_data) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{series_name}_{episode}.mp4", + subdir="transcoded_videos", + min_size_bytes=1024 * 1024, + ), + ] + + def _get_cache_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: + return 
self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + 'series_name': context.series_name, + }, + ) + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> TranscodedVideo: + return self.__construct_result_artifact(cache_path, input_data) + + def __create_transcode_params( + self, + input_data: SourceVideo, + output_path: Path, + probe_data: Dict[str, Any], + context: ExecutionContext, + ) -> TranscodeParams: + target_fps = self.__TARGET_FRAMERATE + bitrates = self.__compute_all_bitrate_settings(probe_data, context) + is_upscaling = self.__is_upscaling(probe_data) + + return TranscodeParams( + input_path=input_data.path, + output_path=output_path, + codec=self.config.codec, + preset=self.config.preset, + resolution=f'{self.config.resolution.width}:{self.config.resolution.height}', + video_bitrate=f'{bitrates["video"]}M', + minrate=f'{bitrates["min"]}M', + maxrate=f'{bitrates["max"]}M', + bufsize=f'{bitrates["buf"]}M', + audio_bitrate=f'{self.__compute_audio_bitrate(probe_data, context)}k', + gop_size=int(target_fps * self.config.keyframe_interval_seconds), + target_fps=target_fps, + deinterlace=self.__resolve_deinterlacing_strategy(input_data, context, probe_data), + is_upscaling=is_upscaling, + log_command=self.__should_log_command(), + ) + + def __is_upscaling(self, probe_data: Dict[str, Any]) -> bool: + w, h = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + eff_w = int(w * sar_num / sar_denom) + src_px = eff_w * h + target_px = self.config.resolution.width * self.config.resolution.height + return src_px < target_px + + def __is_same_resolution(self, probe_data: Dict[str, Any]) -> bool: + w, h = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + eff_w = int(w * sar_num / sar_denom) 
+ return eff_w == self.config.resolution.width and h == self.config.resolution.height + + def __compute_all_bitrate_settings( + self, probe_data: Dict[str, Any], context: ExecutionContext, + ) -> Dict[str, float]: + src_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) + min_bitrate = self.config.min_bitrate_mbps + max_bitrate = self.config.video_bitrate_mbps + + if not src_bitrate: + return self.__build_fallback_bitrates(max_bitrate) + + normalized_bitrate = self.__get_normalized_bitrate(src_bitrate, probe_data, context) + + if normalized_bitrate < min_bitrate: + final_bitrate = min_bitrate + adjustment = f"boosted to minimum ({min_bitrate} Mbps)" + elif normalized_bitrate > max_bitrate: + final_bitrate = max_bitrate + adjustment = f"capped to maximum ({max_bitrate} Mbps)" + else: + final_bitrate = normalized_bitrate * self.config.bitrate_boost_ratio + boost_percent = (self.config.bitrate_boost_ratio - 1.0) * 100 + adjustment = f"boosted by {boost_percent:.0f}%" + + context.logger.info( + f'Bitrate: {src_bitrate:.2f} → {normalized_bitrate:.2f} → {final_bitrate:.2f} Mbps ' + f'({adjustment})', + ) + + return self.__scale_bitrate_limits(final_bitrate / max_bitrate) + + def __get_normalized_bitrate( + self, src_v: float, probe: Dict[str, Any], context: ExecutionContext, + ) -> float: + src_codec = self.__normalize_codec_name(FFmpegWrapper.get_video_codec(probe)) + tgt_codec = self.__normalize_codec_name(self.config.codec) + mult = self.__get_codec_efficiency_multiplier(src_codec, tgt_codec) + + if mult != 1.0: + norm = src_v * mult + context.logger.info( + f'Codec: {src_codec.upper()}->{tgt_codec.upper()} ({mult:.2f}x) | ' + f'{src_v:.2f}->{norm:.2f} Mbps', + ) + return norm + return src_v + + def __scale_bitrate_limits(self, scale: float) -> Dict[str, float]: + return { + "video": round(self.config.video_bitrate_mbps * scale, 2), + "min": round(self.config.calculate_minrate_mbps() * scale, 2), + "max": round(self.config.calculate_maxrate_mbps() * scale, 2), + 
"buf": round(self.config.calculate_bufsize_mbps() * scale, 2), + } + + def __build_fallback_bitrates(self, target_max: float) -> Dict[str, float]: + return { + "video": target_max, + "min": self.config.calculate_minrate_mbps(), + "max": self.config.calculate_maxrate_mbps(), + "buf": self.config.calculate_bufsize_mbps(), + } + + def __resolve_deinterlacing_strategy( + self, input_data: SourceVideo, context: ExecutionContext, probe: Dict[str, Any], + ) -> bool: + if self.config.force_deinterlace: + context.logger.info('Deinterlacing: FORCED') + return True + has_int, stats = FFmpegWrapper.detect_interlacing(input_data.path) + if not stats: + return False + + field_order = FFmpegWrapper.get_field_order(probe) + ratio_pct = stats['ratio'] * 100 + + if has_int: + context.logger.info( + f"Interlacing detected ({ratio_pct:.1f}%) | {field_order} → APPLYING deinterlace filter", + ) + else: + context.logger.info(f"Interlacing: No ({ratio_pct:.1f}%) | {field_order}") + + return has_int + + def __compute_audio_bitrate(self, probe: Dict[str, Any], context: ExecutionContext) -> int: + src_a = FFmpegWrapper.get_audio_bitrate(probe) + tgt_a = self.config.audio_bitrate_kbps + if src_a and src_a < tgt_a: + adj = min(int(src_a * 1.05), tgt_a) + context.logger.info(f'Audio boost: {src_a} -> {adj} kbps') + return adj + return tgt_a + + def __execute_ffmpeg_process( + self, context: ExecutionContext, params: TranscodeParams, ep_id: str, + ) -> None: + with StepTempFile(params.output_path) as temp_path: + temp_params = replace(params, output_path=temp_path) + context.mark_step_started(self.name, ep_id, [str(temp_path)]) + + command_log = FFmpegWrapper.transcode(temp_params) + if command_log: + context.logger.info('=' * 20 + ' FFmpeg ' + '=' * 20) + context.logger.info(command_log) + + def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> TranscodedVideo: + return TranscodedVideo( + path=path, + episode_id=input_data.episode_id, + 
episode_info=input_data.episode_info, + resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', + codec=self.config.codec, + source_video_path=input_data.path, + ) + + @staticmethod + def __should_log_command() -> bool: + if not VideoTranscoderStep.__command_logged: + VideoTranscoderStep.__command_logged = True + return True + return False + + @staticmethod + def __normalize_codec_name(codec: str) -> str: + name = codec.lower() + mapping = { + 'h264': ('h264', 'avc'), + 'hevc': ('h265', 'hevc'), + 'vp9': ('vp9',), + 'av1': ('av1',), + } + for norm, patterns in mapping.items(): + if any(p in name for p in patterns): + return norm + return 'h264' + + @staticmethod + def __get_codec_efficiency_multiplier(src: str, tgt: str) -> float: + eff = VideoTranscoderStep.__CODEC_EFFICIENCY + return eff.get(src, 1.0) / eff.get(tgt, 1.0) + + def __log_transcode_details( + self, + ctx: ExecutionContext, + input_data: SourceVideo, + params: TranscodeParams, + probe: Dict[str, Any], + ) -> None: + w, h = FFmpegWrapper.get_resolution(probe) + if self.__is_same_resolution(probe): + scale_label = "SAME" + elif params.is_upscaling: + scale_label = "UP" + else: + scale_label = "DOWN" + ctx.logger.info( + f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{scale_label}]', + ) diff --git a/preprocessor/steps/vision/__init__.py b/preprocessor/steps/vision/__init__.py new file mode 100644 index 000000000..5cdd3a929 --- /dev/null +++ b/preprocessor/steps/vision/__init__.py @@ -0,0 +1,7 @@ +from preprocessor.steps.vision.character_detection_step import CharacterDetectorStep +from preprocessor.steps.vision.embeddings_step import VideoEmbeddingStep +from preprocessor.steps.vision.emotion_detection_step import EmotionDetectionStep +from preprocessor.steps.vision.image_hashing_step import ImageHashStep +from preprocessor.steps.vision.object_detection_step import ObjectDetectionStep + +__all__ = ['CharacterDetectorStep', 'EmotionDetectionStep', 'ImageHashStep', 
'ObjectDetectionStep', 'VideoEmbeddingStep'] diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py new file mode 100644 index 000000000..91ad1d15a --- /dev/null +++ b/preprocessor/steps/vision/character_detection_step.py @@ -0,0 +1,206 @@ +# pylint: disable=duplicate-code +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +import numpy as np + +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.config.step_configs import CharacterDetectionConfig +from preprocessor.core.artifacts import ( + DetectionResults, + FrameCollection, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) +from preprocessor.services.characters import FaceDetector +from preprocessor.services.io.files import FileOperations + + +class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): + def __init__(self, config: CharacterDetectionConfig) -> None: + super().__init__(config) + self.__face_app = None + self.__character_vectors: Dict[str, np.ndarray] = {} + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__face_app is None: + context.logger.info('Loading Face Detection model...') + self.__face_app = FaceDetector.init() + self.__load_character_references(context) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__face_app: + context.logger.info('Face Detection model unloaded') + self.__face_app = None + self.__character_vectors = {} + + def cleanup(self) -> None: + self.__face_app = None + self.__character_vectors = {} + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[DetectionResults]: + return 
self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + + def _process( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> DetectionResults: + output_path = self._get_cache_path(input_data, context) + self.__prepare_detection_environment(context) + + frame_files = self.__extract_frame_files(input_data) + if not frame_files: + return self.__construct_empty_result(output_path, input_data, context) + + results = self.__process_character_detection(frame_files) + self.__save_detection_results( + results, output_path, input_data, context, frame_files, + ) + + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=len(results), + ) + + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for character detection step.""" + return [ + JsonFileOutput( + subdir="detections/characters", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> DetectionResults: + detection_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + detection_type='character', + detection_count=len(detection_data.get('detections', [])), + ) + + def __prepare_detection_environment(self, context: ExecutionContext) -> None: + if self.__face_app is None: + context.logger.info('Initializing face detection model...') + self.__face_app = FaceDetector.init() + self.__load_character_references(context) + + def 
__load_character_references(self, context: ExecutionContext) -> None: + base_dir = get_base_output_dir(context.series_name) + characters_dir: Path = base_dir / 'character_references_processed' + if not characters_dir.exists(): + characters_dir = base_dir / 'character_faces' + + if characters_dir.exists(): + context.logger.info(f'Loading character references from {characters_dir}') + self.__character_vectors = FaceDetector.load_character_references( + characters_dir, self.__face_app, + ) + else: + context.logger.warning(f'Characters directory not found: {characters_dir}') + + def __process_character_detection( + self, frame_files: List[Path], + ) -> List[Dict[str, Any]]: + results: List[Dict[str, Any]] = [] + for frame_path in frame_files: + detections: List[Dict[str, Any]] = FaceDetector.detect_characters_in_frame( + frame_path, + self.__face_app, + self.__character_vectors, + self.config.threshold, + ) + if detections: + results.append({'frame': frame_path.name, 'faces': detections}) + return results + + def __save_detection_results( + self, + results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_files: List[Path], + ) -> None: + output_data: Dict[str, Any] = { + 'episode_id': input_data.episode_id, + 'series_name': context.series_name, + 'detection_settings': self.config.model_dump(), + 'statistics': { + 'total_frames_processed': len(frame_files), + 'frames_with_detections': len(results), + 'character_counts': self.__count_characters(results), + }, + 'detections': results, + } + FileOperations.atomic_write_json(output_path, output_data) + + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + + @staticmethod + def __extract_frame_files(input_data: FrameCollection) -> List[Path]: + return sorted([ + f for f in 
input_data.directory.glob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) + + @staticmethod + def __construct_empty_result( + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + ) -> DetectionResults: + context.logger.warning(f'No frame files found in {input_data.directory}') + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=0, + ) + + @staticmethod + def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: + counts: Dict[str, int] = {} + for res in results: + for face in res.get('faces', []): + name: str = face.get('name', 'unknown') + counts[name] = counts.get(name, 0) + 1 + return counts diff --git a/preprocessor/steps/vision/character_reference_processor_step.py b/preprocessor/steps/vision/character_reference_processor_step.py new file mode 100644 index 000000000..6de6a65f8 --- /dev/null +++ b/preprocessor/steps/vision/character_reference_processor_step.py @@ -0,0 +1,220 @@ +from datetime import datetime +import json +from pathlib import Path +from typing import ( + List, + Tuple, +) + +from insightface.app import FaceAnalysis +import numpy as np + +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.config.settings_instance import settings +from preprocessor.config.step_configs import CharacterReferenceProcessorConfig +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + DirectoryOutput, + OutputDescriptor, +) +from preprocessor.services.characters import ( + FaceClusterer, + FaceDetector, +) +from preprocessor.services.characters.cluster_folder_manager import ClusterFolderManager +from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor + + +class 
CharacterReferenceProcessorStep( + PipelineStep[SourceVideo, SourceVideo, CharacterReferenceProcessorConfig], +): + @property + def is_global(self) -> bool: + return True + + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + DirectoryOutput( + pattern="character_references_processed", + subdir="", + expected_file_pattern="**/face_vector.npy", + min_files=1, + min_size_per_file_bytes=100, + ), + ] + + def _get_cache_path( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> Path: + _, output_dir = self.__resolve_paths(context) + return output_dir + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: + context.logger.info(f"Character reference vectors already exist in: {cache_path}") + return input_data + + def _process( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: + if self.config.reference_source == "clusters": + return self.__process_from_clusters(input_data, context) + return self.__process_from_web(input_data, context) + + @staticmethod + def __resolve_paths(context: ExecutionContext) -> Tuple[Path, Path]: + base_dir = get_base_output_dir(context.series_name) + return base_dir / 'character_faces', base_dir / 'character_references_processed' + + def __process_from_web( + self, + input_data: SourceVideo, + context: ExecutionContext, + ) -> SourceVideo: + characters_dir, output_dir = self.__resolve_paths(context) + self.__validate_web_input_directory(characters_dir) + self.__run_reference_processor(characters_dir, output_dir, context) + return input_data + + def __process_from_clusters( + self, + input_data: SourceVideo, + context: ExecutionContext, + ) -> SourceVideo: + cluster_dir = context.base_output_dir / 'character_clusters' + _, output_dir = self.__resolve_paths(context) + + character_names = self.__load_character_names(context) + is_complete, missing = ClusterFolderManager.is_complete(cluster_dir, character_names) + + 
if not is_complete: + context.logger.warning( + f"Cluster labeling incomplete. Missing characters: {missing}", + ) + raise RuntimeError( + f"Not all characters have labeled cluster folders. Missing: {missing}", + ) + + labeled_folders = ClusterFolderManager.get_labeled_folders(cluster_dir) + context.logger.info( + f"Processing {len(labeled_folders)} labeled cluster folders into face vectors...", + ) + + face_app = None + try: + face_app = FaceDetector.init() + for char_name, folder in labeled_folders.items(): + self.__process_cluster_character( + char_name, folder, output_dir, face_app, context, + ) + finally: + if face_app is not None: + FaceClusterer.cleanup_gpu_memory() + + context.logger.info(f"Cluster-based face vectors saved to: {output_dir}") + return input_data + + def __process_cluster_character( + self, + char_name: str, + cluster_folder: Path, + output_dir: Path, + face_app: FaceAnalysis, + context: ExecutionContext, + ) -> None: + vector = ClusterFolderManager.extract_face_vector( + cluster_folder, face_app, context.logger, + ) + if vector is None: + context.logger.warning(f"Could not extract face vector for '{char_name}', skipping") + return + + char_out = output_dir / char_name + char_out.mkdir(parents=True, exist_ok=True) + np.save(char_out / 'face_vector.npy', vector) + self.__save_cluster_metadata(char_out, char_name, cluster_folder, vector) + context.logger.info(f"Saved face vector for '{char_name}'") + + def __run_reference_processor( + self, + characters_dir: Path, + output_dir: Path, + context: ExecutionContext, + ) -> None: + context.logger.info(f"Processing character reference images from {characters_dir}") + + processor = CharacterReferenceProcessor({ + 'characters_dir': characters_dir, + 'output_dir': output_dir, + 'similarity_threshold': self.config.similarity_threshold, + 'interactive': False, + }) + + exit_code = processor.work() + if exit_code != 0: + raise RuntimeError( + f"Character reference processor failed with exit code 
{exit_code}", + ) + + context.logger.info(f"Character reference vectors saved to: {output_dir}") + + @staticmethod + def __load_character_names(context: ExecutionContext) -> List[str]: + characters_json = context.base_output_dir / f'{context.series_name}_characters.json' + if not characters_json.exists(): + raise FileNotFoundError( + f"Characters JSON not found: {characters_json}. " + f"Run characters_metadata step first.", + ) + with open(characters_json, 'r', encoding='utf-8') as f: + data = json.load(f) + return [c['name'] for c in data.get('characters', []) if c.get('name')] + + @staticmethod + def __save_cluster_metadata( + char_out: Path, + char_name: str, + cluster_folder: Path, + vector: np.ndarray, + ) -> None: + metadata = { + 'character_name': char_name, + 'source': 'clusters', + 'cluster_folder': str(cluster_folder), + 'processed_at': datetime.now().isoformat(), + 'face_vector_dim': int(vector.shape[0]), + 'processing_params': { + 'face_model': settings.face_recognition.model_name, + }, + } + with open(char_out / 'metadata.json', 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=2) + + @staticmethod + def __validate_web_input_directory(characters_dir: Path) -> None: + if not characters_dir.exists(): + raise FileNotFoundError( + f"Character faces directory not found: {characters_dir}. 
" + f"Run character_reference step first.", + ) + + def _check_cache_validity( + self, + output_path: Path, + context: ExecutionContext, + episode_id: str, + cache_description: str, + ) -> bool: + if output_path.exists() and not context.force_rerun: + vectors = list(output_path.rglob('face_vector.npy')) + if vectors: + if not context.is_step_completed(self.name, episode_id): + context.mark_step_completed(self.name, episode_id) + context.logger.info( + f'Skipping {episode_id} ({cache_description}, {len(vectors)} vectors found)', + ) + return True + return False diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py new file mode 100644 index 000000000..9c06aa120 --- /dev/null +++ b/preprocessor/steps/vision/embeddings_step.py @@ -0,0 +1,219 @@ +# pylint: disable=duplicate-code +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +import numpy as np + +from preprocessor.config.step_configs import VideoEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + FrameCollection, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + + +class VideoEmbeddingStep(PipelineStep[FrameCollection, EmbeddingCollection, VideoEmbeddingConfig]): + def __init__(self, config: VideoEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading VLLM embedding model: {self.config.model_name}') + self.__model = 
EmbeddingModelWrapper(self.config.model_name, self.config.device) + self.__model.load_model() + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model.cleanup() + self.__model = None + context.logger.info('VLLM embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model.cleanup() + self.__model = None + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + frame_requests = self.__extract_frame_requests(input_data, context) + if not frame_requests: + return self.__construct_embedding_collection( + input_data, output_path, 0, self.config.model_name, + ) + + self.__prepare_embedding_model(context) + context.logger.info( + f'Generating video embeddings for {len(frame_requests)} frames in {input_data.episode_id}', + ) + + image_hashes = self.__fetch_image_hashes(input_data, context) + results = self.__generate_embeddings(frame_requests, input_data, image_hashes) + self.__save_embedding_results(results, output_path, input_data, image_hashes) + + return self.__construct_embedding_collection( + input_data, output_path, len(results), self.config.model_name, + ) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/vision", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> EmbeddingCollection: + emb_data: 
Dict[str, Any] = FileOperations.load_json(cache_path) + return self.__construct_embedding_collection( + input_data, + cache_path, + len(emb_data.get('video_embeddings', [])), + self.config.model_name, + ) + + def __prepare_embedding_model(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info('Initializing embedding model...') + self.__model = EmbeddingModelWrapper(self.config.model_name, self.config.device) + self.__model.load_model() + + def __generate_embeddings( + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, + image_hashes: Dict[int, str], + ) -> List[Dict[str, Any]]: + results: List[Dict[str, Any]] = [] + batch_size: int = self.config.batch_size + + if not self.__model: + raise RuntimeError("Embedding model not initialized") + + for i in range(0, len(frame_requests), batch_size): + batch: List[Dict[str, Any]] = frame_requests[i : i + batch_size] + image_paths: List[str] = [ + str(input_data.directory / f['frame_path']) for f in batch + ] + batch_embeddings: List[np.ndarray] = self.__model.encode_images(image_paths) + + for request, embedding in zip(batch, batch_embeddings): + res: Dict[str, Any] = {**request, 'embedding': embedding.tolist()} + frame_num: int = request.get('frame_number', -1) + if frame_num in image_hashes: + res['perceptual_hash'] = image_hashes[frame_num] + results.append(res) + + return results + + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } + + @staticmethod + def __extract_frame_requests( + input_data: FrameCollection, + context: ExecutionContext, + ) -> List[Dict[str, Any]]: + frame_metadata: Dict[str, Any] = FileOperations.load_json( + input_data.metadata_path, + ) + frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) + if not frame_requests: + context.logger.warning(f'No frames for 
embedding in {input_data.episode_id}') + return frame_requests + + @staticmethod + def __fetch_image_hashes( + input_data: FrameCollection, context: ExecutionContext, + ) -> Dict[int, str]: + season = f'S{input_data.episode_info.season:02d}' + episode = input_data.episode_info.episode_code() + hash_path: Path = context.base_output_dir / 'hashes' / season / f'{episode}.json' + + if not hash_path.exists(): + return {} + + try: + data: Dict[str, Any] = FileOperations.load_json(hash_path) + return { + h['frame_number']: h['perceptual_hash'] + for h in data.get('hashes', []) + } + except Exception as e: + context.logger.warning(f'Could not load image hashes from {hash_path}: {e}') + return {} + + @staticmethod + def __save_embedding_results( + results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + image_hashes: Dict[int, str], + ) -> None: + statistics = { + 'total_embeddings': len(results), + 'embedding_dimension': len(results[0]['embedding']) if results else 0, + 'frames_with_hash': len(image_hashes), + } + output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( + episode_info=input_data.episode_info, + processing_params={}, + statistics=statistics, + results_key='video_embeddings', + results_data=results, + ) + FileOperations.atomic_write_json(output_path, output_data) + + @staticmethod + def __construct_embedding_collection( + input_data: FrameCollection, + output_path: Path, + embedding_count: int, + model_name: str, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=model_name, + embedding_count=embedding_count, + embedding_type='video', + ) diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py new file mode 100644 index 000000000..bb7ad69f5 --- /dev/null +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ 
-0,0 +1,228 @@ +# pylint: disable=duplicate-code +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +import cv2 +from hsemotion_onnx.facial_emotions import HSEmotionRecognizer +import numpy as np + +from preprocessor.config.step_configs import EmotionDetectionConfig +from preprocessor.core.artifacts import ( + EmotionData, + FrameCollection, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) +from preprocessor.services.io.files import FileOperations +from preprocessor.services.video.emotion_utils import EmotionDetector + + +class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]): + def __init__(self, config: EmotionDetectionConfig) -> None: + super().__init__(config) + self.__model: Optional[HSEmotionRecognizer] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info('Loading HSEmotion model...') + self.__model = EmotionDetector.init_model(context.logger) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + context.logger.info('HSEmotion model unloaded') + self.__model = None + + def cleanup(self) -> None: + self.__model = None + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[EmotionData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + + def _process( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> EmotionData: + input_path = self.__resolve_input_path(input_data, context) + output_path = self._get_cache_path(input_data, context) + + if not input_path.exists(): + context.logger.warning( + f'No character 
detections found for emotion analysis: {input_path}', + ) + return self.__construct_emotion_data(input_data, output_path) + + self.__prepare_emotion_model(context) + + detections_data = FileOperations.load_json(input_path) + self.__process_and_update_emotions(detections_data, input_data, context) + FileOperations.atomic_write_json(output_path, detections_data) + + return self.__construct_emotion_data(input_data, output_path) + + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + subdir="detections/emotions", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> EmotionData: + return self.__construct_emotion_data(input_data, cache_path) + + def __prepare_emotion_model(self, context: ExecutionContext) -> None: + if self.__model is None: + self.__model = EmotionDetector.init_model(context.logger) + + def __process_and_update_emotions( + self, + detections_data: Dict[str, Any], + input_data: FrameCollection, + context: ExecutionContext, + ) -> None: + detections: List[Dict[str, Any]] = detections_data.get('detections', []) + + face_crops, face_metadata = self.__collect_face_crops( + detections, input_data.directory, context, + ) + + if not face_crops: + context.logger.warning('No valid face crops found for emotion detection') + return + + context.logger.info(f'Processing {len(face_crops)} faces with HSEmotion model') + emotion_results = EmotionDetector.detect_batch( + face_crops, self.__model, batch_size=32, logger=context.logger, + ) + + self.__apply_emotion_results(detections, emotion_results, face_metadata, context) + + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> 
Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + + @staticmethod + def __resolve_input_path( + input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + season_code = f'S{input_data.episode_info.season:02d}' + episode_code = input_data.episode_info.episode_code() + return ( + context.base_output_dir + / 'detections' + / 'characters' + / season_code + / f'{episode_code}.json' + ) + + @staticmethod + def __construct_emotion_data( + input_data: FrameCollection, detections_path: Path, + ) -> EmotionData: + return EmotionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=detections_path, + ) + + @staticmethod + def __collect_face_crops( + detections: List[Dict[str, Any]], + frames_dir: Path, + context: ExecutionContext, + ) -> Tuple[List[np.ndarray], List[Dict[str, int]]]: + face_crops: List[np.ndarray] = [] + face_metadata: List[Dict[str, int]] = [] + + total_faces = sum(len(d.get('faces', [])) for d in detections) + context.logger.info(f'Collecting {total_faces} faces for batch emotion analysis') + + for detection_idx, detection in enumerate(detections): + frame_file = detection.get('frame') + if not frame_file: + continue + + frame_path = frames_dir / frame_file + if not frame_path.exists(): + continue + + frame = cv2.imread(str(frame_path)) + if frame is None: + continue + + faces = detection.get('faces', []) + for face_idx, face in enumerate(faces): + bbox = face.get('bbox') + if not bbox: + continue + + face_crop = EmotionDetector.crop_face(frame, bbox) + if face_crop is None: + continue + + face_crops.append(face_crop) + face_metadata.append({ + 'detection_idx': detection_idx, + 'face_idx': face_idx, + }) + + return face_crops, face_metadata + + @staticmethod + def __apply_emotion_results( + detections: List[Dict[str, Any]], + emotion_results: List[Optional[Tuple[str, float, Dict[str, float]]]], + face_metadata: 
class ImageHashStep(PipelineStep[FrameCollection, ImageHashCollection, ImageHashConfig]):
    """Pipeline step that computes a perceptual hash for each frame of an episode.

    Frame requests are read from the frame collection's metadata file, hashed
    in batches of ``config.batch_size`` on ``config.device`` and written to
    ``hashes/{season}/{episode}.json``.
    """

    def __init__(self, config: ImageHashConfig) -> None:
        super().__init__(config)
        # Created lazily by __prepare_hasher, once an episode actually has frames to hash.
        self.__hasher: Optional[PerceptualHasher] = None

    @property
    def supports_batch_processing(self) -> bool:
        """Return True: this step implements ``execute_batch``."""
        return True

    def execute_batch(
        self, input_data: List[FrameCollection], context: ExecutionContext,
    ) -> List[ImageHashCollection]:
        """Run ``execute`` for every episode via the base-class thread-pool helper.

        ``config.max_parallel_episodes`` is passed through as the parallelism limit.
        """
        return self._execute_with_threadpool(
            input_data, context, self.config.max_parallel_episodes, self.execute,
        )

    def cleanup(self) -> None:
        """Drop the hasher and release memory held by it."""
        self.__hasher = None
        self.__cleanup_memory()

    def _process(
        self, input_data: FrameCollection, context: ExecutionContext,
    ) -> ImageHashCollection:
        """Hash all frames of one episode and persist the result as JSON.

        When the frame metadata lists no frames, nothing is written and a
        collection with ``hash_count=0`` is returned.
        """
        output_path = self._get_cache_path(input_data, context)

        frame_metadata, frame_requests = self.__load_frame_metadata(input_data, context)
        if not frame_requests:
            return self.__construct_empty_result(output_path, input_data)

        self.__prepare_hasher(context)

        context.logger.info(
            f'Computing hashes for {len(frame_requests)} frames in {input_data.episode_id}',
        )

        hash_results = self.__compute_hashes(frame_requests, input_data)
        self.__save_hash_results(
            hash_results,
            output_path,
            input_data,
            context,
            frame_metadata,
            self.config.device,
            self.config.batch_size,
        )

        self.__cleanup_memory()

        return ImageHashCollection(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=output_path,
            hash_count=len(hash_results),
        )

    def get_output_descriptors(self) -> List[OutputDescriptor]:
        """Describe the single JSON output: ``hashes/{season}/{episode}.json``."""
        return [
            JsonFileOutput(
                subdir="hashes",
                pattern="{season}/{episode}.json",
                min_size_bytes=50,
            ),
        ]

    def _get_cache_path(
        self, input_data: FrameCollection, context: ExecutionContext,
    ) -> Path:
        """Resolve the output path of descriptor 0 for the given episode."""
        return self._resolve_output_path(
            0,
            context,
            self.__create_path_variables(input_data),
        )

    def _load_from_cache(
        self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext,
    ) -> ImageHashCollection:
        """Rebuild the artifact from an existing hash file, counting its ``hashes`` entries."""
        hash_data: Dict[str, Any] = FileOperations.load_json(cache_path)
        return ImageHashCollection(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=cache_path,
            hash_count=len(hash_data.get('hashes', [])),
        )

    def __prepare_hasher(self, context: ExecutionContext) -> None:
        """Instantiate the perceptual hasher on first use."""
        if self.__hasher is None:
            context.logger.info(f'Loading image hasher on {self.config.device}...')
            self.__hasher = PerceptualHasher(device=self.config.device)

    @staticmethod
    def __parse_frame_number(request: Dict[str, Any]) -> int:
        """Return the request's ``frame_number`` coerced to ``int``."""
        return int(request['frame_number'])

    def __compute_hashes(
        self,
        frame_requests: List[Dict[str, Any]],
        input_data: FrameCollection,
    ) -> List[Dict[str, Any]]:
        """Hash the requested frames batch by batch.

        Each result is a copy of the frame request with ``frame_number``
        normalised to ``int`` and ``perceptual_hash`` added.
        """
        hash_results: List[Dict[str, Any]] = []
        batch_size: int = self.config.batch_size

        for i in range(0, len(frame_requests), batch_size):
            batch: List[Dict[str, Any]] = frame_requests[i : i + batch_size]
            pil_images = FrameLoader.load_from_requests(input_data.directory, batch)
            phashes: List[str] = self.__hasher.compute_phash_batch(pil_images)

            # NOTE(review): pairing relies on the loader and hasher returning one
            # item per request, in order - confirm against FrameLoader/PerceptualHasher.
            for request, phash in zip(batch, phashes):
                result: Dict[str, Any] = request.copy()
                result['frame_number'] = self.__parse_frame_number(request)
                result['perceptual_hash'] = phash
                hash_results.append(result)

            del pil_images
            # Free memory on every fifth batch rather than after each one.
            if i % (batch_size * 5) == 0:
                self.__cleanup_memory()

        return hash_results

    @staticmethod
    def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]:
        """Build the ``season`` / ``episode`` values used in the output pattern."""
        return {
            'season': f'S{input_data.episode_info.season:02d}',
            'episode': input_data.episode_info.episode_code(),
        }

    @staticmethod
    def __load_frame_metadata(
        input_data: FrameCollection,
        context: ExecutionContext,
    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
        """Load the frame metadata file and return it together with its ``frames`` list.

        Logs a warning when the list is empty or absent.
        """
        frame_metadata: Dict[str, Any] = FileOperations.load_json(
            input_data.metadata_path,
        )
        frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', [])

        if not frame_requests:
            context.logger.warning(f'No frames to hash for {input_data.episode_id}')

        return frame_metadata, frame_requests

    @staticmethod
    def __construct_empty_result(
        output_path: Path,
        input_data: FrameCollection,
    ) -> ImageHashCollection:
        """Return a collection with ``hash_count=0`` for an episode without frames."""
        return ImageHashCollection(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=output_path,
            hash_count=0,
        )

    @staticmethod
    def __save_hash_results(
        hash_results: List[Dict[str, Any]],
        output_path: Path,
        input_data: FrameCollection,
        context: ExecutionContext,
        frame_metadata: Dict[str, Any],
        device: str,
        batch_size: int,
    ) -> None:
        """Write the hash results and the settings that produced them to ``output_path``.

        ``hash_settings.batch_size`` records the batch size actually used for
        hashing (previously a value derived from the number of results was
        stored there, which did not reflect the configuration).
        """
        output_data: Dict[str, Any] = {
            'episode_id': input_data.episode_id,
            'series_name': context.series_name,
            'generated_at': frame_metadata.get('generated_at'),
            'hash_settings': {
                'device': device,
                'batch_size': batch_size,
            },
            'hashes': hash_results,
        }
        FileOperations.atomic_write_json(output_path, output_data)

    @staticmethod
    def __cleanup_memory() -> None:
        """Run the garbage collector and, if CUDA is available, empty its cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + self.__load_model(context) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__unload_model(context) + + def cleanup(self) -> None: + self.__model = None + self.__image_processor = None + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[ObjectDetectionData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + + def _process( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> ObjectDetectionData: + output_path = self._get_cache_path(input_data, context) + self.__ensure_model_loaded(context) + + frame_files = self.__extract_frame_files(input_data) + if not frame_files: + context.logger.warning(f'No frame files found in {input_data.directory}') + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) + + detections = self.__process_batches(frame_files) + self.__save_results(detections, output_path, input_data, context, frame_files) + + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) + + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + subdir="detections/objects", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> ObjectDetectionData: + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + ) + + def 
__ensure_model_loaded(self, context: ExecutionContext) -> None: + if self.__model is None: + self.__load_model(context) + + def __load_model(self, context: ExecutionContext) -> None: + if not torch.cuda.is_available(): + raise RuntimeError('CUDA is not available. Object detection requires GPU.') + + context.logger.info(f'Loading D-FINE model: {self.config.model_name}') + self.__image_processor = AutoImageProcessor.from_pretrained(self.config.model_name) + self.__model = DFineForObjectDetection.from_pretrained(self.config.model_name) + self.__model.to('cuda') + context.logger.info('D-FINE model loaded on GPU') + + def __unload_model(self, context: ExecutionContext) -> None: + context.logger.info('Object Detection model unloaded') + self.__model = None + self.__image_processor = None + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def __process_batches(self, frame_files: List[Path]) -> List[Dict[str, Any]]: + detections: List[Dict[str, Any]] = [] + + for batch_start in range(0, len(frame_files), self.config.batch_size): + batch_paths = frame_files[batch_start:batch_start + self.config.batch_size] + batch_detections = self.__process_single_batch(batch_paths) + detections.extend(batch_detections) + + return detections + + def __process_single_batch(self, batch_paths: List[Path]) -> List[Dict[str, Any]]: + batch_images = [Image.open(fp) for fp in batch_paths] + target_sizes = [(img.height, img.width) for img in batch_images] + + inputs = self.__image_processor(images=batch_images, return_tensors='pt') + inputs = {k: v.to('cuda') for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.__model(**inputs) + + results = self.__image_processor.post_process_object_detection( + outputs, + target_sizes=target_sizes, + threshold=self.config.conf_threshold, + ) + + batch_detections = [] + for frame_path, result in zip(batch_paths, results): + frame_entry = self.__build_frame_entry(frame_path, result) + if frame_entry['objects']: + 
batch_detections.append(frame_entry) + + for img in batch_images: + img.close() + + return batch_detections + + def __build_frame_entry( + self, frame_path: Path, result: Dict[str, Any], + ) -> Dict[str, Any]: + objects: List[Dict[str, Any]] = [] + for score, label_id, box in zip(result['scores'], result['labels'], result['boxes']): + box_coords = [float(v) for v in box.tolist()] + objects.append({ + 'class_id': label_id.item(), + 'class_name': self.__model.config.id2label[label_id.item()], + 'confidence': score.item(), + 'bbox': { + 'x1': box_coords[0], + 'y1': box_coords[1], + 'x2': box_coords[2], + 'y2': box_coords[3], + }, + }) + return {'frame': frame_path.name, 'objects': objects} + + def __save_results( + self, + detections: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_files: List[Path], + ) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_data: Dict[str, Any] = { + 'episode_id': input_data.episode_id, + 'series_name': context.series_name, + 'detection_settings': self.config.model_dump(), + 'statistics': { + 'total_frames_processed': len(frame_files), + 'frames_with_detections': len(detections), + 'object_counts': self.__count_objects(detections), + }, + 'detections': detections, + } + FileOperations.atomic_write_json(output_path, output_data) + + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + + @staticmethod + def __extract_frame_files(input_data: FrameCollection) -> List[Path]: + return sorted([ + f for f in input_data.directory.glob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) + + @staticmethod + def __count_objects(detections: List[Dict[str, Any]]) -> Dict[str, int]: + counts: Dict[str, int] = {} + for frame in detections: + for obj in frame.get('objects', []): + name: str = 
class SeriesFaceClusteringStep(PipelineStep[SourceVideo, SourceVideo, SeriesFaceClusteringConfig]):
    """Global step that clusters faces found in the extracted frames of a whole series.

    Results go to ``<base_output_dir>/character_clusters``; the presence of
    ``_cluster_index.json`` there is what marks the step as already done.
    The input artifact is passed through unchanged.
    """

    @property
    def is_global(self) -> bool:
        """Return True: the step is declared as global rather than per-episode."""
        return True

    def get_output_descriptors(self) -> List[OutputDescriptor]:
        """Describe the cluster index file written by this step."""
        return [
            JsonFileOutput(
                pattern='_cluster_index.json',
                subdir='character_clusters',
                min_size_bytes=10,
            ),
        ]

    def _get_cache_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path:
        """Return the path of the cluster index file."""
        return context.base_output_dir / 'character_clusters' / '_cluster_index.json'

    def _load_from_cache(
        self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext,
    ) -> SourceVideo:
        """Log that clusters already exist and pass the input through."""
        context.logger.info(f"Series character clusters already exist: {cache_path.parent}")
        return input_data

    def _process(self, input_data: SourceVideo, context: ExecutionContext) -> SourceVideo:
        """Extract face embeddings from all frames, cluster them and write the outputs.

        Returns early (writing nothing) when there are no frames or no faces.
        GPU memory is cleaned up once the face detector has been initialised,
        whether or not the remaining work succeeds.
        """
        frames_root = context.base_output_dir / 'frames'
        output_dir = context.base_output_dir / 'character_clusters'

        frame_files = self.__collect_frame_files(frames_root)
        if not frame_files:
            context.logger.warning(f"No frames found in {frames_root}")
            return input_data

        context.logger.info(
            f"Extracting face embeddings from {len(frame_files)} frames across the series...",
        )

        clustering = settings.face_clustering
        face_app = None
        try:
            face_app = FaceDetector.init(det_thresh=clustering.min_det_score)
            face_data = FaceClusterer.extract_face_embeddings(
                frame_files, face_app, self.config.prefetch_workers,
                min_det_score=clustering.min_det_score,
                min_face_px=clustering.min_face_px,
            )

            if not face_data:
                context.logger.warning("No faces detected across the series")
                return input_data

            context.logger.info(f"Clustering {len(face_data)} face embeddings series-wide...")

            labels = FaceClusterer.cluster_embeddings(
                face_data, clustering.min_cluster_size, clustering.min_samples,
            )

            cluster_count = ClusterFolderManager.create_cluster_folders(
                face_data=face_data,
                labels=labels,
                output_dir=output_dir,
                logger=context.logger,
            )

            self.__write_cluster_index(output_dir, context.series_name, cluster_count, face_data, frame_files)
            self.__create_character_label_folders(output_dir, context)

            context.logger.info(
                f"Series clustering complete: {cluster_count} clusters → {output_dir}",
            )
        finally:
            if face_app is not None:
                FaceClusterer.cleanup_gpu_memory()

        return input_data

    @staticmethod
    def __write_cluster_index(
        output_dir: Path,
        series_name: str,
        cluster_count: int,
        face_data: List[Dict[str, Any]],
        frame_files: List[Path],
    ) -> None:
        """Write ``_cluster_index.json`` with the series name and summary counts."""
        index_data = {
            'series_name': series_name,
            'cluster_count': cluster_count,
            'total_faces': len(face_data),
            'total_frames': len(frame_files),
        }
        FileOperations.atomic_write_json(output_dir / '_cluster_index.json', index_data)

    @staticmethod
    def __create_character_label_folders(output_dir: Path, context: ExecutionContext) -> None:
        """Create one empty folder per character named in ``<series>_characters.json``.

        Does nothing when that file is absent. Existing folders are left
        untouched and are not counted in the log message, so the reported
        number is the number of folders actually created.
        """
        characters_json = context.base_output_dir / f'{context.series_name}_characters.json'
        if not characters_json.exists():
            return
        with open(characters_json, 'r', encoding='utf-8') as f:
            data = json.load(f)
        names = [c['name'] for c in data.get('characters', []) if c.get('name')]

        created = 0
        for name in names:
            folder = output_dir / name
            if folder.exists():
                continue
            # exist_ok guards against the folder appearing between the check and the call.
            folder.mkdir(parents=True, exist_ok=True)
            created += 1
        if created:
            context.logger.info(f"Created {created} empty character label folders")

    @staticmethod
    def __collect_frame_files(frames_root: Path) -> List[Path]:
        """Return all ``*.jpg`` files under ``frames_root`` whose name contains ``frame_``, sorted."""
        if not frames_root.exists():
            return []
        return sorted([
            f for f in frames_root.rglob('*.jpg')
            if f.is_file() and 'frame_' in f.name
        ])
error_exit_code=40, - loglevel=logging.INFO, - ) - self.transcriptions_base = BASE_OUTPUT_DIR / settings.output_subdirs.transcriptions - self.language = args.get("language", "pl") - self.episode_manager = EpisodeManager( - args.get("episodes_info_json"), - args.get("series_name", "ranczo"), - ) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "series_name" not in args: - raise ValueError("series_name is required") - - def _get_processing_items(self) -> List[ProcessingItem]: - items = [] - - if not self.transcriptions_base.exists(): - self.logger.error(f"Transcriptions directory not found: {self.transcriptions_base}") - return items - - for season_dir in sorted(self.transcriptions_base.glob("S*")): - if not season_dir.is_dir(): - continue - - for episode_dir in sorted(season_dir.glob("E*")): - if not episode_dir.is_dir(): - continue - - clean_subdir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - clean_txt_files = list(clean_subdir.glob("*_clean_transcription.txt")) - if not clean_txt_files: - continue - txt_file = clean_txt_files[0] - - episode_info = self.episode_manager.parse_filename(txt_file) - if not episode_info: - self.logger.error(f"Cannot parse episode info from {txt_file.name}") - continue - - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=txt_file, - metadata={ - "episode_info": episode_info, - "episode_dir": episode_dir, - }, - ), - ) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_dir = item.metadata["episode_dir"] - episode_info = item.metadata["episode_info"] - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - output_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="text_stats", - ) - output_file = clean_dir / output_filename - - return [OutputSpec(path=output_file, 
@dataclass
class LanguageConfig:
    """Character sets used to classify the characters of a text for one language."""

    vowels: Set[str]
    consonants: Set[str]
    punctuation: Set[str]
    special_chars: Set[str]


POLISH_VOWELS = set("aąeęioóuyAĄEĘIOÓUY")
POLISH_CONSONANTS = set("bcćdfghjklłmnńprsśtwzźżBCĆDFGHJKLŁMNŃPRSŚTWZŹŻ")
ENGLISH_VOWELS = set("aeiouAEIOU")
ENGLISH_CONSONANTS = set("bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ")

PUNCTUATION = set(".,;:!?…-—–()[]{}\"'«»„""''")  # noqa: RUF001, pylint: disable=implicit-str-concat
SPECIAL_CHARS = set("@#$%^&*+=<>|\\/_~`")


# The Polish configuration also accepts the English letters; vowels are tested
# before consonants, so "y" counts as a vowel there.
POLISH_CONFIG = LanguageConfig(
    vowels=POLISH_VOWELS | ENGLISH_VOWELS,
    consonants=POLISH_CONSONANTS | ENGLISH_CONSONANTS,
    punctuation=PUNCTUATION,
    special_chars=SPECIAL_CHARS,
)

ENGLISH_CONFIG = LanguageConfig(
    vowels=ENGLISH_VOWELS,
    consonants=ENGLISH_CONSONANTS,
    punctuation=PUNCTUATION,
    special_chars=SPECIAL_CHARS,
)


@dataclass
class TextStatistics:  # pylint: disable=too-many-instance-attributes
    """Character-, word- and n-gram-level statistics of a text.

    ``calculate`` recomputes every derived field from ``text``. It may be
    called repeatedly (for example after assigning a new ``text``): each call
    starts from scratch instead of adding to the previous counts.
    """

    text: str
    language: str = "pl"

    sentences: int = 0
    lines: int = 0
    paragraphs: int = 0
    empty_lines: int = 0
    words: int = 0
    letters: int = 0
    digits: int = 0
    symbols: int = 0
    punctuation_marks: int = 0
    special_characters: int = 0
    chars_without_spaces: int = 0
    spaces: int = 0
    total_chars: int = 0
    vowels: int = 0
    consonants: int = 0

    unique_words: int = 0
    avg_word_length: float = 0.0
    avg_sentence_length: float = 0.0
    type_token_ratio: float = 0.0

    letter_frequency: Dict[str, int] = field(default_factory=dict)
    word_frequency: List[Dict[str, Any]] = field(default_factory=list)
    bigrams: List[Dict[str, Any]] = field(default_factory=list)
    trigrams: List[Dict[str, Any]] = field(default_factory=list)

    @classmethod
    def from_file(cls, file_path: Path, language: str = "pl") -> "TextStatistics":
        """Read ``file_path`` as UTF-8 and return its calculated statistics."""
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        return cls.from_text(text, language=language)

    @classmethod
    def from_text(cls, text: str, language: str = "pl") -> "TextStatistics":
        """Return calculated statistics for ``text``."""
        stats = cls(text=text, language=language)
        stats.calculate()
        return stats

    def calculate(self) -> None:
        """Recompute all statistics from the current ``text``."""
        # Tokenise once; both the word and the n-gram statistics use the same list.
        words = re.findall(r'\b\w+\b', self.text.lower())
        self.__calculate_basic_stats()
        self.__calculate_character_stats()
        self.__calculate_word_stats(words)
        # Uses self.words and self.sentences, so it has to run last.
        self.__calculate_advanced_stats(words)

    def __get_config(self) -> LanguageConfig:
        """Return the Polish configuration for ``"pl"``, the English one otherwise."""
        return POLISH_CONFIG if self.language == "pl" else ENGLISH_CONFIG

    def __calculate_basic_stats(self) -> None:
        """Count lines, paragraphs, sentences and whitespace."""
        lines = self.text.split("\n")
        self.lines = len(lines)
        self.empty_lines = sum(1 for line in lines if not line.strip())

        paragraphs = self.text.split("\n\n")
        self.paragraphs = len([p for p in paragraphs if p.strip()])

        # A sentence ends with a run of terminators followed by whitespace or end of text.
        sentence_pattern = r'[.!?…]+(?:\s|$)'
        self.sentences = len(re.findall(sentence_pattern, self.text))

        self.total_chars = len(self.text)
        self.spaces = self.text.count(" ") + self.text.count("\t") + self.text.count("\n")
        self.chars_without_spaces = self.total_chars - self.spaces

    def __calculate_character_stats(self) -> None:
        """Classify every character and build the letter frequency table."""
        config = self.__get_config()
        letter_counter: Counter = Counter()
        # Count into locals and assign at the end, so repeated calls do not accumulate.
        letters = vowels = consonants = digits = 0
        punctuation_marks = special_characters = symbols = 0

        for char in self.text:
            if char.isalpha():
                letters += 1
                letter_counter[char.lower()] += 1

                if char in config.vowels:
                    vowels += 1
                elif char in config.consonants:
                    consonants += 1
            elif char.isdigit():
                digits += 1
            elif char in config.punctuation:
                punctuation_marks += 1
            elif char in config.special_chars:
                special_characters += 1
            elif not char.isspace():
                symbols += 1

        self.letters = letters
        self.vowels = vowels
        self.consonants = consonants
        self.digits = digits
        self.punctuation_marks = punctuation_marks
        self.special_characters = special_characters
        self.symbols = symbols
        self.letter_frequency = dict(sorted(letter_counter.items(), key=lambda x: x[1], reverse=True))

    def __calculate_word_stats(self, words: List[str]) -> None:
        """Compute word counts, lexical diversity and the 50 most common words."""
        word_counter = Counter(words)
        self.words = len(words)
        self.unique_words = len(word_counter)

        if words:
            self.type_token_ratio = round(self.unique_words / self.words, 4)
            self.avg_word_length = round(sum(len(w) for w in words) / self.words, 2)
        else:
            # Reset explicitly so values from an earlier, non-empty text do not linger.
            self.type_token_ratio = 0.0
            self.avg_word_length = 0.0

        self.word_frequency = [
            {"word": word, "count": count}
            for word, count in word_counter.most_common(50)
        ]

    def __calculate_advanced_stats(self, words: List[str]) -> None:
        """Compute the average sentence length and the 25 most common bi-/trigrams."""
        self.avg_sentence_length = (
            round(self.words / self.sentences, 2) if self.sentences > 0 else 0.0
        )

        # zip stops at the shortest input, so short texts simply yield no n-grams.
        bigram_counter = Counter(zip(words, words[1:]))
        self.bigrams = [
            {"bigram": f"{w1} {w2}", "count": count}
            for (w1, w2), count in bigram_counter.most_common(25)
        ]

        trigram_counter = Counter(zip(words, words[1:], words[2:]))
        self.trigrams = [
            {"trigram": f"{w1} {w2} {w3}", "count": count}
            for (w1, w2, w3), count in trigram_counter.most_common(25)
        ]

    def to_dict(self) -> Dict[str, Any]:
        """Return the statistics grouped into JSON-serialisable sections."""
        return {
            "basic_statistics": {
                "sentences": self.sentences,
                "lines": self.lines,
                "paragraphs": self.paragraphs,
                "empty_lines": self.empty_lines,
                "words": self.words,
                "letters": self.letters,
                "digits": self.digits,
                "symbols": self.symbols,
                "punctuation_marks": self.punctuation_marks,
                "special_characters": self.special_characters,
                "chars_without_spaces": self.chars_without_spaces,
                "spaces": self.spaces,
                "total_chars": self.total_chars,
                "vowels": self.vowels,
                "consonants": self.consonants,
            },
            "advanced_statistics": {
                "unique_words": self.unique_words,
                "avg_word_length": self.avg_word_length,
                "avg_sentence_length": self.avg_sentence_length,
                "type_token_ratio": self.type_token_ratio,
            },
            "letter_frequency": self.letter_frequency,
            "word_frequency": self.word_frequency,
            "bigrams": self.bigrams,
            "trigrams": self.trigrams,
        }
3d0520175..000000000 --- a/preprocessor/transcription/elevenlabs.py +++ /dev/null @@ -1,218 +0,0 @@ -import json -import logging -from pathlib import Path -import subprocess -import tempfile -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.transcription.engines.elevenlabs_engine import ElevenLabsEngine -from preprocessor.transcription.generators.multi_format_generator import MultiFormatGenerator -from preprocessor.utils.console import ( - console, - create_progress, -) - - -class ElevenLabsTranscriber(BaseProcessor): - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos is required") - if "output_dir" not in args: - raise ValueError("output_dir is required") - if "series_name" not in args: - raise ValueError("series_name is required") - - videos_path = Path(args["videos"]) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=5, - loglevel=logging.DEBUG, - ) - - self.input_videos: Path = Path(self._args["videos"]) - self.output_dir: Path = Path(self._args["output_dir"]) - self.output_dir.mkdir(parents=True, exist_ok=True) - - self.episodes_info_json: Optional[Path] = self._args.get("episodes_info_json") - - self.model_id: str = self._args.get("model_id", "scribe_v1") - self.language_code: str = self._args.get("language_code", "pol") - self.diarize: bool = self._args.get("diarize", True) - - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name) - - self.engine = ElevenLabsEngine( - model_id=self.model_id, - language_code=self.language_code, - diarize=self.diarize, - ) - - def _execute(self) -> None: - video_files: List[Path] = [] - for ext in 
self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - video_files = sorted(video_files) - - if not video_files: - self.logger.warning("No video files found") - return - - console.print(f"[blue]Found {len(video_files)} videos to transcribe with 11labs[/blue]") - - try: - with create_progress() as progress: - task = progress.add_task("Transcribing with 11labs...", total=len(video_files)) - - for video_file in video_files: - episode_id = video_file.stem - - if self.state_manager and self.state_manager.is_step_completed("transcribe_11labs", episode_id): - console.print(f"[yellow]Skipping (already done): {episode_id}[/yellow]") - progress.advance(task) - continue - - audio_path = None - try: - if self.state_manager: - audio_path = self.__extract_audio(video_file) - self.state_manager.mark_step_started("transcribe_11labs", episode_id, [str(audio_path)]) - - audio_path = audio_path or self.__extract_audio(video_file) - transcription_data = self.engine.transcribe(audio_path) - - self.__save_transcription(transcription_data, video_file) - - if self.state_manager: - self.state_manager.mark_step_completed("transcribe_11labs", episode_id) - - except Exception as e: - self.logger.error(f"Failed to transcribe {video_file.name}: {e}") - - finally: - if audio_path and audio_path.exists(): - audio_path.unlink() - - progress.advance(task) - except KeyboardInterrupt: - console.print("\n[yellow]Transcription interrupted[/yellow]") - raise - - console.print("[blue]Generating multi-format outputs (SRT, TXT, etc.)...[/blue]") - if self.episodes_info_json: - jsons_source_dir = self.output_dir / "json" - multi_format_gen = MultiFormatGenerator( - jsons_dir=jsons_source_dir, - episodes_info_json=self.episodes_info_json, - output_base_path=self.output_dir, - logger=self.logger, - series_name=self.series_name, - ) - multi_format_gen.generate() - - @staticmethod - def __create_segments_from_words(words: List[Dict]) -> List[Dict]: - if not words: - return 
[] - - segments = [] - current_segment_words = [] - current_speaker = None - - for word in words: - speaker_id = word.get("speaker_id", "speaker_unknown") - - if current_speaker is None: - current_speaker = speaker_id - current_segment_words = [word] - elif speaker_id == current_speaker: - current_segment_words.append(word) - else: - segment_text = " ".join(w.get("text", "") for w in current_segment_words).strip() - segments.append({ - "text": segment_text, - "words": current_segment_words, - }) - current_speaker = speaker_id - current_segment_words = [word] - - if current_segment_words: - segment_text = " ".join(w.get("text", "") for w in current_segment_words).strip() - segments.append({ - "text": segment_text, - "words": current_segment_words, - }) - - return segments - - @staticmethod - def __extract_audio(video_file: Path) -> Path: - temp_dir = Path(tempfile.gettempdir()) - audio_path = temp_dir / f"{video_file.stem}_audio.mp3" - - command = [ - "ffmpeg", - "-v", "error", - "-hide_banner", - "-y", - "-i", str(video_file), - "-vn", - "-acodec", "libmp3lame", - "-ar", "16000", - "-ac", "1", - "-b:a", "64k", - str(audio_path), - ] - - subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) - return audio_path - - def __save_transcription(self, data: Dict[str, Any], video_file: Path) -> None: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - self.logger.error(f"Cannot parse episode info from {video_file.name}") - return - - api_segments = data.get("segments", []) - api_words = data.get("words", []) - - if api_segments: - segments = api_segments - words = [] - for segment in segments: - segment_words = segment.get("words", []) - for word in segment_words: - if "speaker_id" not in word and "speaker" in segment: - word["speaker_id"] = segment["speaker"] - words.extend(segment_words) - else: - words = api_words - segments = self.__create_segments_from_words(words) - - output_data = { - "text": 
data.get("text", ""), - "language_code": data.get("language_code", "pol"), - "segments": segments, - "words": words, - "episode_info": EpisodeManager.get_metadata(episode_info), - } - - json_dir = self.output_dir / "json" - output_file = self.episode_manager.build_output_path(episode_info, json_dir) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(output_data, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Saved transcription: {output_file.name}") diff --git a/preprocessor/transcription/engines/__init__.py b/preprocessor/transcription/engines/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/transcription/engines/elevenlabs_engine.py b/preprocessor/transcription/engines/elevenlabs_engine.py deleted file mode 100644 index 495632a2e..000000000 --- a/preprocessor/transcription/engines/elevenlabs_engine.py +++ /dev/null @@ -1,155 +0,0 @@ -import json -import logging -from pathlib import Path -import time -from typing import ( - Any, - Dict, - Optional, -) - -from elevenlabs.client import ElevenLabs -from elevenlabs.core import ApiError - -from preprocessor.config.config import settings -from preprocessor.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.utils.console import console - - -class ElevenLabsEngine(TranscriptionEngine): - def __init__( - self, - model_id: Optional[str] = None, - language_code: Optional[str] = None, - diarize: Optional[bool] = None, - polling_interval: Optional[int] = None, - ): - if not settings.elevenlabs.api_key: - raise ValueError( - "ElevenLabs API key not provided. 
Set ELEVEN_API_KEY environment variable.", - ) - - self.client = ElevenLabs(api_key=settings.elevenlabs.api_key) - self.model_id = model_id or settings.elevenlabs.model_id - self.language_code = language_code or settings.elevenlabs.language_code - self.diarize = diarize if diarize is not None else settings.elevenlabs.diarize - self.polling_interval = polling_interval or settings.elevenlabs.polling_interval - - self.additional_formats = [ - {"format": "srt"}, - { - "format": "segmented_json", - "include_speakers": True, - "include_timestamps": True, - "segment_on_silence_longer_than_s": 0.5, - "max_segment_duration_s": 10.0, - "max_segment_chars": 200, - }, - ] - - self.logger = logging.getLogger(self.__class__.__name__) - - def transcribe(self, audio_path: Path) -> Dict[str, Any]: - console.print(f"[cyan]Transcribing with 11labs: {audio_path.name}[/cyan]") - - if not audio_path.exists(): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - - transcription_id = self.__submit_job(audio_path) - result = self.__poll_for_results(transcription_id) - - console.print(f"[green]Transcription completed: {audio_path.name}[/green]") - - return self.__convert_to_unified_format(result) - - def __submit_job(self, audio_path: Path) -> str: - try: - with open(audio_path, "rb") as audio_file: - audio_data = audio_file.read() - - submit_response = self.client.speech_to_text.convert( - file=audio_data, - model_id=self.model_id, - language_code=self.language_code, - tag_audio_events=True, - timestamps_granularity="character", - diarize=self.diarize, - use_multi_channel=False, - additional_formats=self.additional_formats, - webhook=True, - ) - - self.logger.info(f"Job submitted. 
ID: {submit_response.transcription_id}") - return submit_response.transcription_id - - except ApiError as e: - self.logger.error(f"API error during job submission: {e.body}") - raise - - def __poll_for_results(self, transcription_id: str): - self.logger.info(f"Polling for results (ID: {transcription_id})...") - - max_attempts = settings.elevenlabs.max_attempts - attempt = 0 - - while attempt < max_attempts: - try: - result = self.client.speech_to_text.transcripts.get( - transcription_id=transcription_id, - ) - - self.logger.info("Transcription complete!") - return result - - except ApiError as e: - if e.status_code == 404: - self.logger.info(" ...Processing. Waiting...") - time.sleep(self.polling_interval) - attempt += 1 - else: - self.logger.error(f"API error during polling: {e.body}") - raise - - raise TimeoutError(f"Transcription timeout after {max_attempts} attempts") - - @staticmethod - def __convert_to_unified_format(result) -> Dict[str, Any]: - unified_data = { - "text": result.text, - "language_code": result.language_code, - "segments": [], - } - - if result.additional_formats: - for fmt in result.additional_formats: - if fmt.requested_format == "segmented_json": - segmented_data = json.loads(fmt.content) - - for seg in segmented_data.get("segments", []): - words = seg.get("words", []) - if not words: - continue - - non_spacing_words = [w for w in words if w.get("type") != "spacing"] - - segment = { - "text": seg.get("text", "").strip(), - "words": words, - } - - if non_spacing_words: - first_word = non_spacing_words[0] - last_word = non_spacing_words[-1] - - segment["start"] = first_word.get("start") - segment["end"] = last_word.get("end") - segment["speaker"] = first_word.get("speaker_id") - - unified_data["segments"].append(segment) - - break - - return unified_data - - def get_name(self) -> str: - return "ElevenLabs" diff --git a/preprocessor/transcription/engines/whisper_engine.py b/preprocessor/transcription/engines/whisper_engine.py deleted file mode 
100644 index 92586a4f8..000000000 --- a/preprocessor/transcription/engines/whisper_engine.py +++ /dev/null @@ -1,73 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from faster_whisper import WhisperModel -import torch - -from preprocessor.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.transcription.whisper_utils import ( - build_transcription_result, - get_language_code, -) -from preprocessor.utils.console import console - - -class WhisperEngine(TranscriptionEngine): - def __init__( - self, - model: str = "large-v3-turbo", - language: str = "Polish", - device: str = "cuda", - ): - self.model_name = model - self.language = language - self.device = device - - self.logger = logging.getLogger(self.__class__.__name__) - - if device != "cuda": - raise ValueError(f"Only GPU (cuda) is supported, got device={device}") - - compute_type = "float16" - console.print(f"[cyan]Loading Whisper model: {model} on {device} with compute_type={compute_type}[/cyan]") - self.model = WhisperModel(model, device=device, compute_type=compute_type) - console.print("[green]✓ Whisper model loaded[/green]") - - def transcribe(self, audio_path: Path) -> Dict[str, Any]: - console.print(f"[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]") - - if not audio_path.exists(): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - - language_code = get_language_code(self.language) - - segments, info = self.model.transcribe( - str(audio_path), - language=language_code, - beam_size=10, - word_timestamps=True, - condition_on_previous_text=False, - ) - - result = build_transcription_result(segments, language=info.language) - - console.print(f"[green]✓ Transcription completed: {audio_path.name}[/green]") - - return result - - def get_name(self) -> str: - return f"Whisper-{self.model_name}" - - def cleanup(self) -> None: - console.print("[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]") - 
if hasattr(self, 'model'): - del self.model - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print("[green]✓ Whisper model unloaded, GPU memory cleared[/green]") diff --git a/preprocessor/transcription/generator.py b/preprocessor/transcription/generator.py deleted file mode 100644 index d6e6767bc..000000000 --- a/preprocessor/transcription/generator.py +++ /dev/null @@ -1,240 +0,0 @@ -import logging -from pathlib import Path -import tempfile -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.transcription.generators.multi_format_generator import MultiFormatGenerator -from preprocessor.transcription.processors.audio_normalizer import AudioNormalizer -from preprocessor.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor -from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer - - -class TranscriptionGenerator(BaseProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=2, - loglevel=logging.DEBUG, - ) - - self.input_videos: Path = Path(self._args["videos"]) - self.series_name_lower: str = self._args.get("name", "unknown").lower() - self.episodes_info_json: Path = Path(self._args["episodes_info_json"]) - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name_lower) - - self.temp_dir = None - self.audio_normalizer = None - self.audio_processor = None - self.multi_format_generator = None - self.unicode_fixer = None - self.final_output_dir = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") - if "episodes_info_json" not in 
args: - raise ValueError("episodes_info_json is required") - - videos_path = Path(args["videos"]) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - def _get_processing_items(self) -> List[ProcessingItem]: - if self.__check_all_transcriptions_exist(): - return [] - - return [ - ProcessingItem( - episode_id="transcription_batch", - input_path=self.input_videos, - metadata={}, - ), - ] - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - video_files = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - outputs = [] - - for video_file in video_files: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - continue - - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - expected_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) - - segmented_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_segmented", - ) - segmented_file = OutputPathBuilder.build_transcription_path( - episode_info, - segmented_filename, - subdir="raw", - ) - - if not expected_file.exists() and not segmented_file.exists(): - outputs.append(OutputSpec(path=expected_file, required=True)) - - return outputs - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - ramdisk_path = self._args.get("ramdisk_path") - if ramdisk_path and Path(ramdisk_path).exists(): - self.temp_dir = tempfile.TemporaryDirectory(dir=str(ramdisk_path)) - else: - self.temp_dir = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with - - try: - missing_video_files = self.__get_missing_video_files(missing_outputs) - self.__init_workers(self._args, missing_video_files) - - self.logger.info("Step 1/3: Normalizing audio from videos...") - self.audio_normalizer() 
- - self.logger.info("Step 2/3: Generating transcriptions with Whisper...") - self.audio_processor() - - self.logger.info("Cleaning up Whisper model...") - self.audio_processor.cleanup() - - self.logger.info("Step 3/4: Generating multi-format output...") - self.multi_format_generator() - - self.logger.info("Step 4/4: Fixing unicode escapes in transcriptions...") - self.unicode_fixer() - - except (RuntimeError, OSError, ValueError) as e: - self.logger.error(f"Error generating transcriptions: {e}") - finally: - if self.temp_dir: - self.temp_dir.cleanup() - - def __check_all_transcriptions_exist(self) -> bool: - if not self.episodes_info_json.exists(): - self.logger.debug(f"Episodes info JSON not found: {self.episodes_info_json}") - return False - - video_files = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - if not video_files: - self.logger.debug("No video files found to check") - return False - - missing_files = [] - for video_file in video_files: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - continue - - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - expected_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) - - segmented_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_segmented", - ) - segmented_file = OutputPathBuilder.build_transcription_path( - episode_info, - segmented_filename, - subdir="raw", - ) - - if not expected_file.exists() and not segmented_file.exists(): - missing_files.append(f"{video_file.name} -> {expected_file}") - - if missing_files: - self.logger.debug(f"Missing {len(missing_files)} transcription(s), first: {missing_files[0]}") - return False - - self.logger.info(f"All transcriptions already exist for {len(video_files)} video(s)") - return True - - def __get_missing_video_files(self, 
missing_outputs: List[OutputSpec]) -> List[Path]: - video_files = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - - missing_video_files = [] - - for video_file in video_files: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - continue - - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - expected_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) - - if any(expected_file == output.path for output in missing_outputs): - missing_video_files.append(video_file) - - return missing_video_files - - def __init_workers(self, args: Dict[str, Any], video_files: List[Path]) -> None: - temp_dir_path: Path = Path(self.temp_dir.name) / "transcription_generator" - normalizer_output: Path = temp_dir_path / "normalizer" - processor_output: Path = temp_dir_path / "processor" - - self.final_output_dir: Path = Path(args["transcription_jsons"]) - - audio_files = [normalizer_output / video.with_suffix(".wav").name for video in video_files] - - self.audio_normalizer: AudioNormalizer = AudioNormalizer( - input_videos=self.input_videos, - output_dir=normalizer_output, - logger=self.logger, - video_files=video_files if video_files else None, - ) - - self.audio_processor: NormalizedAudioProcessor = NormalizedAudioProcessor( - input_audios=normalizer_output, - output_dir=processor_output, - logger=self.logger, - language=args["language"], - model=args["model"], - device=args["device"], - audio_files=audio_files if audio_files else None, - ) - - self.multi_format_generator: MultiFormatGenerator = MultiFormatGenerator( - jsons_dir=processor_output, - episodes_info_json=self.episodes_info_json, - output_base_path=self.final_output_dir, - logger=self.logger, - series_name=args["name"], - ) - - self.unicode_fixer: TranscriptionUnicodeFixer = TranscriptionUnicodeFixer({ - "transcription_jsons": 
self.final_output_dir, - "episodes_info_json": self.episodes_info_json, - "name": args["name"], - }) diff --git a/preprocessor/transcription/generators/__init__.py b/preprocessor/transcription/generators/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/transcription/generators/base_generator.py b/preprocessor/transcription/generators/base_generator.py deleted file mode 100644 index c1b825933..000000000 --- a/preprocessor/transcription/generators/base_generator.py +++ /dev/null @@ -1,45 +0,0 @@ -from abc import ( - ABC, - abstractmethod, -) -import json -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class BaseTranscriptionGenerator(ABC): - def __init__( - self, - input_dir: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - ): - self.input_dir = input_dir - self.output_dir = output_dir - self.logger = logger - - def generate(self) -> None: - self.output_dir.mkdir(parents=True, exist_ok=True) - - for json_file in self.input_dir.rglob("*.json"): - try: - with open(json_file, "r", encoding="utf-8") as f: - data = json.load(f) - - self._process_file(json_file, data) - - except Exception as e: - self.logger.error(f"Failed to generate output for {json_file}: {e}") - - @abstractmethod - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - @abstractmethod - def _get_output_filename(self, json_file: Path) -> str: - pass diff --git a/preprocessor/transcription/generators/full_json_generator.py b/preprocessor/transcription/generators/full_json_generator.py deleted file mode 100644 index 4dd881518..000000000 --- a/preprocessor/transcription/generators/full_json_generator.py +++ /dev/null @@ -1,38 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator -from 
preprocessor.utils.transcription_utils import convert_words_list - - -class FullJsonGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name - - @staticmethod - def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - - full_text = " ".join(seg.get("text", "").strip() for seg in segments) - - language_code = data.get("language", "pol") - if language_code in {"Polish", "polish"}: - language_code = "pol" - - words = [] - for seg in segments: - seg_words = seg.get("words", []) - words.extend(convert_words_list(seg_words)) - - return { - "language_code": language_code, - "language_probability": 1.0, - "text": full_text, - "words": words, - } diff --git a/preprocessor/transcription/generators/json_generator.py b/preprocessor/transcription/generators/json_generator.py deleted file mode 100644 index 77289644d..000000000 --- a/preprocessor/transcription/generators/json_generator.py +++ /dev/null @@ -1,80 +0,0 @@ -import json -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.core.constants import FILE_EXTENSIONS -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class JsonGenerator: - DEFAULT_KEYS_TO_REMOVE: List[str] = [ - "tokens", "no_speech_prob", "compression_ratio", "avg_logprob", "temperature", - ] - - UNICODE_TO_POLISH_MAP: Dict[str, str] = { - '\\u0105': 'ą', '\\u0107': 'ć', '\\u0119': 'ę', '\\u0142': 'ł', - '\\u0144': 'ń', '\\u00F3': 'ó', '\\u015B': 'ś', '\\u017A': 'ź', - '\\u017C': 'ż', '\\u0104': 'Ą', '\\u0106': 'Ć', '\\u0118': 'Ę', - '\\u0141': 'Ł', '\\u0143': 'Ń', '\\u00D3': 'Ó', '\\u015A': 'Ś', - '\\u0179': 'Ź', '\\u017B': 'Ż', - } - - def __init__( - self, - jsons_dir: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - extra_keys_to_remove: List[str], - ): - self.__jsons_dir: Path = 
jsons_dir - self.__output_dir: Path = output_dir - self.__logger: ErrorHandlingLogger = logger - self.__keys_to_remove: List[str] = self.DEFAULT_KEYS_TO_REMOVE + extra_keys_to_remove - - self.__output_dir.mkdir(parents=True, exist_ok=True) - - def __call__(self) -> None: - for item in self.__jsons_dir.rglob("*"): - if item.is_file() and item.suffix == FILE_EXTENSIONS["json"]: - output_path = self.__output_dir / item.name - self.__format_json(item, output_path) - - def __format_json(self, file_path: Path, output_path: Path) -> None: - try: - with file_path.open("r", encoding="utf-8") as file: - data = json.load(file) - - if "segments" in data: - data["segments"] = [self.__process_json_segment(segment) for segment in data["segments"]] - - with output_path.open("w", encoding="utf-8") as file: - json.dump({"segments": data["segments"]}, file, ensure_ascii=False, indent=4) - - self.__logger.info(f"Processed file: {file_path}") - - except Exception as e: - self.__logger.error(f"Error formatting JSON file {file_path}: {e}") - - def __process_json_segment(self, segment: Dict[str, Any]) -> Dict[str, Any]: - for key in self.__keys_to_remove: - segment.pop(key, None) - - segment["text"] = self.__replace_unicode_chars(segment.get("text", "")) - segment.update({ - "author": "", - "comment": "", - "tags": ["", ""], - "location": "", - "actors": ["", ""], - }) - return segment - - @staticmethod - def __replace_unicode_chars(text: str) -> str: - for unicode_char, char in JsonGenerator.UNICODE_TO_POLISH_MAP.items(): - text = text.replace(unicode_char, char) - return text diff --git a/preprocessor/transcription/generators/multi_format_generator.py b/preprocessor/transcription/generators/multi_format_generator.py deleted file mode 100644 index 88a9b76f2..000000000 --- a/preprocessor/transcription/generators/multi_format_generator.py +++ /dev/null @@ -1,159 +0,0 @@ -import json -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.episode_manager 
import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.transcription.generators.full_json_generator import FullJsonGenerator -from preprocessor.transcription.generators.segmented_json_generator import SegmentedJsonGenerator -from preprocessor.transcription.generators.simple_json_generator import SimpleJsonGenerator -from preprocessor.transcription.generators.srt_generator import SrtGenerator -from preprocessor.transcription.generators.txt_generator import TxtGenerator -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class MultiFormatGenerator: - def __init__( - self, - jsons_dir: Path, - episodes_info_json: Path, - output_base_path: Path, - logger: ErrorHandlingLogger, - series_name: str = "", - ): - self.jsons_dir = jsons_dir - self.output_base_path = output_base_path - self.logger = logger - self.series_name = series_name.lower() if series_name else "unknown" - - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def __call__(self) -> None: - self.generate() - - def generate(self) -> None: - for transcription_file in self.jsons_dir.rglob("*.json"): - self.__process_file(transcription_file) - - def __process_file(self, transcription_file: Path) -> None: - try: # pylint: disable=too-many-try-statements - with open(transcription_file, "r", encoding="utf-8") as f: - transcription = json.load(f) - - episode_info = self.episode_manager.parse_filename(transcription_file) - if not episode_info: - self.logger.error(f"Cannot extract episode info from {transcription_file.name}") - return - - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - main_output_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) - - if main_output_file.exists(): - self.logger.info(f"Skipping (already exists): {episode_info.episode_code()}") - return - - episode_metadata = 
EpisodeManager.get_metadata(episode_info) - transcription_with_info = { - "episode_info": episode_metadata, - **transcription, - } - - self.__generate_full_json(transcription_with_info, episode_info) - self.__generate_segmented_json(transcription, episode_info) - self.__generate_simple_json(transcription, episode_info) - self.__generate_srt(transcription, episode_info) - self.__generate_txt(transcription, episode_info) - - except Exception as e: - self.logger.error(f"Error processing file {transcription_file}: {e}") - - def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = FullJsonGenerator(Path("."), output_file.parent, self.logger) - full_json = generator.convert_to_full_format(data) - full_json["episode_info"] = data.get("episode_info", {}) - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(full_json, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Generated full JSON: {output_file}") - - def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="segmented", - ) - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = SegmentedJsonGenerator(Path("."), output_file.parent, self.logger) - segmented_json = generator.convert_to_segmented_format(data) - - segmented_json["episode_info"] = { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - } - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(segmented_json, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Generated 
segmented JSON: {output_file}") - - def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="simple", - ) - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = SimpleJsonGenerator(Path("."), output_file.parent, self.logger) - simple_json = generator.convert_to_simple_format(data) - - simple_json["episode_info"] = { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - } - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(simple_json, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Generated simple JSON: {output_file}") - - def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="srt") - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = SrtGenerator(Path("."), output_file.parent, self.logger) - srt_content = generator.convert_to_srt_format(data) - - with open(output_file, "w", encoding="utf-8") as f: - f.write(srt_content) - - self.logger.info(f"Generated SRT: {output_file}") - - def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="txt") - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = TxtGenerator(Path("."), output_file.parent, self.logger) - txt_content = generator.convert_to_txt_format(data) - - with open(output_file, "w", encoding="utf-8") as f: - f.write(txt_content) - - self.logger.info(f"Generated TXT: {output_file}") diff --git 
a/preprocessor/transcription/generators/segmented_json_generator.py b/preprocessor/transcription/generators/segmented_json_generator.py deleted file mode 100644 index e5f920f08..000000000 --- a/preprocessor/transcription/generators/segmented_json_generator.py +++ /dev/null @@ -1,36 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator -from preprocessor.utils.transcription_utils import convert_words_list - - -class SegmentedJsonGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], f"{FILE_SUFFIXES['segmented']}{FILE_EXTENSIONS['json']}") - - @staticmethod - def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - result_segments = [] - - for seg in segments: - text = seg.get("text", "").strip() - seg_words = seg.get("words", []) - - result_segments.append({ - "text": text, - "words": convert_words_list(seg_words), - }) - - return {"segments": result_segments} diff --git a/preprocessor/transcription/generators/simple_json_generator.py b/preprocessor/transcription/generators/simple_json_generator.py deleted file mode 100644 index d0848cd73..000000000 --- a/preprocessor/transcription/generators/simple_json_generator.py +++ /dev/null @@ -1,39 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator - - -class SimpleJsonGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def 
_get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], f"{FILE_SUFFIXES['simple']}{FILE_EXTENSIONS['json']}") - - @staticmethod - def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - result_segments = [] - - for seg in segments: - text = seg.get("text", "").strip() - seg_words = seg.get("words", []) - - speaker = "speaker_unknown" - if seg_words: - speaker = seg_words[0].get("speaker_id", "speaker_unknown") - - result_segments.append({ - "speaker": speaker, - "text": text, - }) - - return {"segments": result_segments} diff --git a/preprocessor/transcription/generators/srt_generator.py b/preprocessor/transcription/generators/srt_generator.py deleted file mode 100644 index 2c5661020..000000000 --- a/preprocessor/transcription/generators/srt_generator.py +++ /dev/null @@ -1,50 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import FILE_EXTENSIONS -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator - - -class SrtGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], FILE_EXTENSIONS["srt"]) - - def convert_to_srt_format(self, data: Dict[str, Any]) -> str: - segments = data.get("segments", []) - srt_lines = [] - index = 1 - - for seg in segments: - start = seg.get("start", 0.0) - end = seg.get("end", 0.0) - text = seg.get("text", "").strip() - - if not text: - continue - - start_time = self.__format_timestamp(start) - end_time = self.__format_timestamp(end) - - srt_lines.append(f"{index}") - srt_lines.append(f"{start_time} --> {end_time}") - srt_lines.append(text) - srt_lines.append("") - - index += 1 - - return "\n".join(srt_lines) - - @staticmethod - def 
__format_timestamp(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - millis = int((seconds % 1) * 1000) - - return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" diff --git a/preprocessor/transcription/generators/txt_generator.py b/preprocessor/transcription/generators/txt_generator.py deleted file mode 100644 index b966db2d5..000000000 --- a/preprocessor/transcription/generators/txt_generator.py +++ /dev/null @@ -1,28 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import FILE_EXTENSIONS -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator - - -class TxtGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], FILE_EXTENSIONS["txt"]) - - @staticmethod - def convert_to_txt_format(data: Dict[str, Any]) -> str: - segments = data.get("segments", []) - - text_parts = [] - for seg in segments: - text = seg.get("text", "").strip() - if text: - text_parts.append(text) - - return " ".join(text_parts) diff --git a/preprocessor/transcription/importer.py b/preprocessor/transcription/importer.py deleted file mode 100644 index 7c8731ab4..000000000 --- a/preprocessor/transcription/importer.py +++ /dev/null @@ -1,225 +0,0 @@ -import json -import logging -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) - -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.console import ( - console, - create_progress, -) - - -class TranscriptionImporter(BaseProcessor): - def _validate_args(self, args: Dict[str, Any]) -> None: - if "source_dir" not in args: - raise ValueError("source_dir is 
required") - if "output_dir" not in args: - raise ValueError("output_dir is required") - if "series_name" not in args: - raise ValueError("series_name is required") - - source_dir = Path(args["source_dir"]) - if not source_dir.exists(): - raise FileNotFoundError(f"Source directory not found: {source_dir}") - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=4, - loglevel=logging.DEBUG, - ) - - self.source_dir: Path = Path(self._args["source_dir"]) - self.output_dir: Path = Path(self._args["output_dir"]) - self.episodes_info_json: Optional[Path] = self._args.get("episodes_info_json") - self.format_type: str = self._args.get("format_type", "11labs_segmented") - - self.output_dir.mkdir(parents=True, exist_ok=True) - - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name) - - def _execute(self) -> None: - json_files = self.__find_transcription_files() - - if not json_files: - self.logger.warning(f"No transcription files found in {self.source_dir}") - return - - console.print(f"[blue]Found {len(json_files)} transcription files to import[/blue]") - - try: - with create_progress() as progress: - task = progress.add_task("Importing transcriptions...", total=len(json_files)) - - for json_file in json_files: - episode_id = self.__extract_episode_id(json_file) - - if self.state_manager and self.state_manager.is_step_completed("import", episode_id): - console.print(f"[yellow]Skipping (already imported): {episode_id}[/yellow]") - progress.advance(task) - continue - - if self.state_manager: - self.state_manager.mark_step_started("import", episode_id) - - try: - self.__import_single_file(json_file) - if self.state_manager: - self.state_manager.mark_step_completed("import", episode_id) - except Exception as e: - self.logger.error(f"Failed to import {json_file.name}: {e}") - - progress.advance(task) - except KeyboardInterrupt: - console.print("\n[yellow]Import 
interrupted[/yellow]") - raise - - def __find_transcription_files(self) -> List[Path]: - if self.format_type == "11labs_segmented": - pattern = "*_segmented.json" - elif self.format_type == "11labs": - pattern = "*.json" - else: - pattern = "*.json" - - files = sorted(self.source_dir.rglob(pattern)) - files = [f for f in files if not f.name.startswith('.')] - - return files - - @staticmethod - def __extract_episode_id(file_path: Path) -> str: - match = re.search(r'S(\d+)E(\d+)', file_path.name, re.IGNORECASE) - if match: - return f"S{match.group(1)}E{match.group(2)}" - - match = re.search(r'E(\d+)', file_path.stem, re.IGNORECASE) - if match: - return f"E{match.group(1)}" - - return file_path.stem - - def __import_single_file(self, json_file: Path) -> None: - with open(json_file, "r", encoding="utf-8") as f: - source_data = json.load(f) - - if self.format_type == "11labs_segmented": - converted_data = self.__convert_11labs_segmented(source_data, json_file) - elif self.format_type == "11labs": - converted_data = self.__convert_11labs_full(source_data, json_file) - else: - self.logger.error(f"Unknown format type: {self.format_type}") - return - - episode_info = self.episode_manager.parse_filename(json_file) - if not episode_info: - season_num, episode_num = self.__extract_season_episode_fallback(json_file) - episode_info = self.episode_manager.get_episode_by_season_and_relative(season_num, episode_num) - - if episode_info: - converted_data["episode_info"] = EpisodeManager.get_metadata(episode_info) - - output_file = self.episode_manager.build_output_path(episode_info, self.output_dir) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(converted_data, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Imported: {json_file.name} -> {output_file.name}") - - @staticmethod - def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments = [] - - for i, 
segment in enumerate(data.get("segments", [])): - converted_segment = { - "id": i, - "start": segment.get("start"), - "end": segment.get("end"), - "text": segment.get("text", ""), - "speaker": segment.get("speaker", "unknown"), - "words": segment.get("words", []), - } - segments.append(converted_segment) - - return { - "transcription": { - "format": "11labs_segmented", - "source_file": source_file.name, - "segments": segments, - }, - "segments": segments, - } - - @staticmethod - def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments = [] - words = data.get("words", []) - - current_segment = { - "words": [], - "start": None, - "end": None, - "text": "", - "speaker": "unknown", - } - - for word in words: - if current_segment["start"] is None: - current_segment["start"] = word.get("start") - - current_segment["words"].append(word) - current_segment["end"] = word.get("end") - - if word.get("text", "").endswith((".", "!", "?")) or len(current_segment["words"]) >= 20: - current_segment["text"] = " ".join(w.get("text", "") for w in current_segment["words"]) - segments.append(dict(current_segment)) - current_segment = { - "words": [], - "start": None, - "end": None, - "text": "", - "speaker": word.get("speaker_id", "unknown"), - } - - if current_segment["words"]: - current_segment["text"] = " ".join(w.get("text", "") for w in current_segment["words"]) - segments.append(current_segment) - - for i, seg in enumerate(segments): - seg["id"] = i - - return { - "transcription": { - "format": "11labs", - "source_file": source_file.name, - "language_code": data.get("language_code", "pol"), - "language_probability": data.get("language_probability", 1.0), - }, - "segments": segments, - } - - @staticmethod - def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: - match = re.search(r'S(\d+)E(\d+)', file_path.name, re.IGNORECASE) - if match: - return int(match.group(1)), int(match.group(2)) - - parent_match = 
re.search(r'S(\d+)', file_path.parent.name, re.IGNORECASE) - if parent_match: - season = int(parent_match.group(1)) - episode_match = re.search(r'E(\d+)', file_path.name, re.IGNORECASE) - if episode_match: - return season, int(episode_match.group(1)) - - return 1, 1 diff --git a/preprocessor/transcription/processors/__init__.py b/preprocessor/transcription/processors/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/transcription/processors/audio_normalizer.py b/preprocessor/transcription/processors/audio_normalizer.py deleted file mode 100644 index e0b46cc32..000000000 --- a/preprocessor/transcription/processors/audio_normalizer.py +++ /dev/null @@ -1,99 +0,0 @@ -import json -from pathlib import Path -import subprocess -from typing import ( - List, - Optional, -) - -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class AudioNormalizer: - SUPPORTED_VIDEO_EXTENSIONS = BaseProcessor.SUPPORTED_VIDEO_EXTENSIONS - - def __init__( - self, - input_videos: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - video_files: Optional[List[Path]] = None, - ): - self.__input_videos: Path = input_videos - self.__output_dir: Path = output_dir - self.__logger: ErrorHandlingLogger = logger - self.__video_files: Optional[List[Path]] = video_files - - self.__output_dir.mkdir(parents=True, exist_ok=True) - - def __call__(self) -> None: - if self.__video_files is not None: - for video in self.__video_files: - self.__process_video(video) - else: - for video in self.__input_videos.rglob("*"): - if video.suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS: - self.__process_video(video) - - def __process_video(self, video: Path) -> None: - try: - output_path = self.__output_dir / video.with_suffix(".wav").name - - if output_path.exists(): - return - - audio_idx = self.__get_best_audio_stream(video) - if audio_idx is None: - self.__logger.error(f"Cannot find 
audio stream for file: '{video}'") - return - - self.__normalize(video=video, audio_idx=audio_idx, output=output_path) - - except Exception as e: - self.__logger.error(f"Error processing video {video}: {e}") - - def __get_best_audio_stream(self, video: Path) -> Optional[int]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "a", - "-show_entries", "stream=index,bit_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - streams = json.loads(result.stdout).get("streams", []) - - if not streams: - self.__logger.error(f"No audio streams found in file: {video}") - return None - - best_stream = max(streams, key=lambda s: int(s.get("bit_rate", 0))) - return best_stream["index"] - - def __normalize(self, video: Path, audio_idx: int, output: Path) -> None: - tmp_output = output.with_name(output.stem + "_temp.wav") - - extract_cmd = [ - "ffmpeg", "-y", - "-i", str(video), - "-map", f"0:{audio_idx}", - "-acodec", "pcm_s16le", - "-ar", "48000", - "-ac", "1", - str(output), - ] - subprocess.run(extract_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.__logger.info(f"Converted audio: {output}") - - normalize_cmd = [ - "ffmpeg", "-y", - "-i", str(output), - "-af", "dynaudnorm", - str(tmp_output), - ] - subprocess.run(normalize_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.__logger.info(f"Normalized audio: {tmp_output}") - - tmp_output.replace(output) - self.__logger.info(f"Replaced original file with normalized audio: {video} -> {output}") diff --git a/preprocessor/transcription/processors/episode_info_processor.py b/preprocessor/transcription/processors/episode_info_processor.py deleted file mode 100644 index a52f20d9b..000000000 --- a/preprocessor/transcription/processors/episode_info_processor.py +++ /dev/null @@ -1,84 +0,0 @@ -import json -from pathlib import Path -from typing import ( - Any, - Dict, - Tuple, -) - -from 
preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class EpisodeInfoProcessor: - def __init__( - self, - jsons_dir: Path, - episodes_info_json: Path, - output_path: Path, - logger: ErrorHandlingLogger, - series_name: str = "", - ): - self.__jsons_dir: Path = jsons_dir - self.__output_path: Path = output_path - self.__logger: ErrorHandlingLogger = logger - - if not series_name: - series_name = self.__output_path.parent.name.lower() - self.__logger.warning( - f"No series name provided. Using fallback from folder name: '{series_name}'", - ) - - self.__series_name: str = series_name.lower() - self.__output_path.mkdir(parents=True, exist_ok=True) - - self.__episode_manager = EpisodeManager(episodes_info_json, self.__series_name) - - def __call__(self) -> None: - for transcription_file in self.__jsons_dir.rglob("*.json"): - self.__process_file(transcription_file) - - def __process_file(self, transcription_file: Path) -> None: - try: - transcription = self.__load_transcription(transcription_file) - episode_info = self.__episode_manager.parse_filename(transcription_file) - if not episode_info: - self.__logger.error(f"Cannot extract episode info from {transcription_file.name}") - return - - _, new_json_name = self.__write_episode_json(transcription, episode_info) - self.__rename_original_file(transcription_file, new_json_name) - - except Exception as e: - self.__logger.error(f"Error processing file {transcription_file}: {e}") - - @staticmethod - def __load_transcription(path: Path) -> Dict[str, Any]: - with path.open("r", encoding="utf-8") as f: - return json.load(f) - - def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: - new_json_name = self.__episode_manager.file_naming.build_filename(episode_info, extension="json") - output_path = self.__episode_manager.build_output_path(episode_info, self.__output_path) - 
output_path.parent.mkdir(parents=True, exist_ok=True) - - result = { - "episode_info": EpisodeManager.get_metadata(episode_info), - "segments": transcription.get("segments", []), - } - - with output_path.open("w", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False, indent=4) - - self.__logger.info(f"Created episode info {output_path}.") - return output_path, new_json_name - - def __rename_original_file(self, original_path: Path, new_name: str) -> None: - new_src = original_path.parent / new_name - if original_path.name == new_name: - self.__logger.info(f"File {original_path} already has correct name.") - elif new_src.exists(): - self.__logger.error(f"Cannot rename {original_path} -> {new_src}, file already exists!") - else: - original_path.rename(new_src) - self.__logger.info(f"Renamed source transcription file: {original_path} -> {new_src}") diff --git a/preprocessor/transcription/processors/normalized_audio_processor.py b/preprocessor/transcription/processors/normalized_audio_processor.py deleted file mode 100644 index c304462ae..000000000 --- a/preprocessor/transcription/processors/normalized_audio_processor.py +++ /dev/null @@ -1,97 +0,0 @@ -import gc -import json -from pathlib import Path -from typing import ( - List, - Optional, - Tuple, -) - -from faster_whisper import WhisperModel -import torch - -from preprocessor.transcription.whisper_utils import ( - build_transcription_result, - get_language_code, -) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class NormalizedAudioProcessor: - SUPPORTED_AUDIO_EXTENSIONS: Tuple[str, str] = (".wav", ".mp3") - - def __init__( - self, - input_audios: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - language: str, - model: str, - device: str, - audio_files: Optional[List[Path]] = None, - ): - self.__input_audios: Path = input_audios - self.__output_dir: Path = output_dir - self.__logger: ErrorHandlingLogger = logger - self.__audio_files: Optional[List[Path]] = 
audio_files - - self.__language: str = language - - self.__input_audios.mkdir(parents=True, exist_ok=True) - self.__output_dir.mkdir(parents=True, exist_ok=True) - - if device != "cuda": - raise ValueError(f"Only GPU (cuda) is supported, got device={device}") - - compute_type = "float16" - self.__logger.info(f"Loading Whisper model {model} on {device} with compute_type={compute_type}") - self.__whisper_model = WhisperModel(model, device=device, compute_type=compute_type) - - def __call__(self) -> None: - if self.__audio_files is not None: - for audio in self.__audio_files: - self.__process_normalized_audio(audio) - else: - for audio in self.__input_audios.rglob("*"): - if audio.suffix.lower() in self.SUPPORTED_AUDIO_EXTENSIONS: - self.__process_normalized_audio(audio) - - def __process_normalized_audio(self, normalized_audio: Path) -> None: - try: - output_file = self.__output_dir / normalized_audio.with_suffix(".json").name - - if output_file.exists(): - return - - language_code = get_language_code(self.__language) - - segments, info = self.__whisper_model.transcribe( - str(normalized_audio), - language=language_code, - beam_size=10, - word_timestamps=True, - condition_on_previous_text=False, - temperature=0.0, - compression_ratio_threshold=None, - ) - - result = build_transcription_result(segments, language=info.language) - - for segment_dict in result["segments"]: - segment_dict["temperature"] = 0.0 - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False, indent=2) - - self.__logger.info(f"Processed: {normalized_audio}") - except Exception as e: - self.__logger.error(f"Error processing file {normalized_audio}: {e}") - - def cleanup(self) -> None: - self.__logger.info("Unloading Whisper model and clearing GPU memory...") - if hasattr(self, '_NormalizedAudioProcessor__whisper_model'): - del self.__whisper_model - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - self.__logger.info("Whisper model 
unloaded, GPU memory cleared") diff --git a/preprocessor/transcription/processors/sound_separator.py b/preprocessor/transcription/processors/sound_separator.py deleted file mode 100644 index 57d4e2043..000000000 --- a/preprocessor/transcription/processors/sound_separator.py +++ /dev/null @@ -1,386 +0,0 @@ -import json -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Tuple, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.constants import ( - WordKeys, - WordTypeValues, -) - - -class SoundEventSeparator(BaseProcessor): - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=2, - loglevel=args.get("loglevel", 20), - ) - - self.transcription_dir = Path(self._args.get("transcription_dir", settings.transcription.output_dir)) - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, args: Dict[str, Any]) -> None: - pass - - def _get_processing_items(self) -> List[ProcessingItem]: - segmented_files = list(self.transcription_dir.rglob("**/raw/*_segmented.json")) - - items = [] - for trans_file in segmented_files: - episode_info = self.episode_manager.parse_filename(trans_file) - if not episode_info: - self.logger.warning(f"Cannot parse episode info from {trans_file.name}") - continue - - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=trans_file, - metadata={"episode_info": episode_info}, - ), - ) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> 
List[OutputSpec]: - base_name = item.input_path.stem.replace(FILE_SUFFIXES["segmented"], "") - episode_dir = item.input_path.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - clean_json = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" - sound_json = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}" - clean_segmented_json = clean_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}" - sound_segmented_json = sound_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}" - clean_txt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}" - sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}" - clean_srt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}" - sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}" - - return [ - OutputSpec(path=clean_json, required=True), - OutputSpec(path=sound_json, required=True), - OutputSpec(path=clean_segmented_json, required=True), - OutputSpec(path=sound_segmented_json, required=True), - OutputSpec(path=clean_txt, required=True), - OutputSpec(path=sound_txt, required=True), - OutputSpec(path=clean_srt, required=True), - OutputSpec(path=sound_srt, required=True), - ] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals - with open(item.input_path, "r", encoding="utf-8") as f: - data = json.load(f) - - episode_info = data.get("episode_info", {}) - segments = data.get("segments", []) - - dialogue_segments = [] - sound_event_segments = [] - - for segment in segments: - classification = self.__classify_segment(segment) - - if classification == "dialogue": - 
dialogue_segments.append(self.__clean_segment_text(segment)) - elif classification == "sound_event": - sound_event_segments.append(self.__enrich_sound_event(self.__clean_segment_text(segment))) - elif classification == "mixed": - dialogue_parts, sound_parts = self.__split_mixed_segment(segment) - dialogue_segments.extend(dialogue_parts) - sound_event_segments.extend([self.__enrich_sound_event(s) for s in sound_parts]) - - dialogue_segments = self.__renumber_segments(dialogue_segments) - sound_event_segments = self.__renumber_segments(sound_event_segments) - - base_name = item.input_path.stem.replace(FILE_SUFFIXES["segmented"], "") - episode_dir = item.input_path.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - clean_dir.mkdir(parents=True, exist_ok=True) - sound_dir.mkdir(parents=True, exist_ok=True) - - clean_json = clean_dir / f"{base_name}_clean_transcription.json" - sound_json = sound_dir / f"{base_name}_sound_events.json" - clean_segmented_json = clean_dir / f"{base_name}_segmented_clean.json" - sound_segmented_json = sound_dir / f"{base_name}_segmented_sound_events.json" - clean_txt = clean_dir / f"{base_name}_clean_transcription.txt" - sound_txt = sound_dir / f"{base_name}_sound_events.txt" - clean_srt = clean_dir / f"{base_name}_clean_transcription.srt" - sound_srt = sound_dir / f"{base_name}_sound_events.srt" - - raw_txt = episode_dir / settings.output_subdirs.transcription_subdirs.raw / f"{base_name}.txt" - - dialogue_segments_simple = self.__convert_to_simple_format(dialogue_segments) - sound_event_segments_simple = self.__convert_to_simple_format(sound_event_segments) - - with open(clean_json, "w", encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": dialogue_segments_simple}, - f, - ensure_ascii=False, - indent=4, - ) - - with open(sound_json, "w", encoding="utf-8") as f: - json.dump( - 
{"episode_info": episode_info, "segments": sound_event_segments_simple}, - f, - ensure_ascii=False, - indent=4, - ) - - with open(clean_segmented_json, "w", encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": dialogue_segments}, - f, - ensure_ascii=False, - indent=4, - ) - - with open(sound_segmented_json, "w", encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": sound_event_segments}, - f, - ensure_ascii=False, - indent=4, - ) - - self.__generate_txt_files(raw_txt, clean_txt, sound_txt) - self.__generate_srt_files(dialogue_segments, sound_event_segments, clean_srt, sound_srt) - - self.logger.info( - f"Separated {item.episode_id}: " - f"{len(dialogue_segments)} dialogue, {len(sound_event_segments)} sound events", - ) - - def __classify_segment(self, segment: Dict[str, Any]) -> str: - words = segment.get("words", []) - if not words: - return "dialogue" - - has_sound = False - has_dialogue = False - - for word in words: - if self.__is_sound_event(word): - has_sound = True - elif word.get(WordKeys.TYPE) not in [WordTypeValues.SPACING, ""]: - has_dialogue = True - - if has_sound and has_dialogue: - return "mixed" - if has_sound: - return "sound_event" - return "dialogue" - - @staticmethod - def __is_sound_event(word: Dict[str, Any]) -> bool: - if word.get(WordKeys.TYPE) == WordTypeValues.AUDIO_EVENT: - return True - - text = word.get("text", "").strip() - if re.match(r'^\(.*\)$', text): - return True - - return False - - def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - words = segment.get("words", []) - dialogue_sequences = [] - sound_sequences = [] - - current_type = None - current_words = [] - - for word in words: - if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: - if current_words: - current_words.append(word) - continue - - is_sound = self.__is_sound_event(word) - word_type = "sound" if is_sound else "dialogue" - - if word_type != 
current_type: - if current_words: - self.__finalize_sequence( - current_type, current_words, dialogue_sequences, sound_sequences, segment, - ) - current_type = word_type - current_words = [word] - else: - current_words.append(word) - - if current_words: - self.__finalize_sequence( - current_type, current_words, dialogue_sequences, sound_sequences, segment, - ) - - return dialogue_sequences, sound_sequences - - @staticmethod - def __finalize_sequence( - seq_type: str, - words: List[Dict], - dialogue_sequences: List[Dict], - sound_sequences: List[Dict], - original_segment: Dict[str, Any], - ) -> None: - if not words: - return - - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - return - - text = "".join([w.get("text", "") for w in words]) - text = re.sub(r'\s+', ' ', text).strip() - start_time = min((w.get("start") or 0) for w in words) - end_time = max((w.get("end") or 0) for w in words) - - new_segment = { - "text": text, - "start": start_time, - "end": end_time, - "words": words, - } - - for key in original_segment: - if key not in ["text", "start", "end", "words"]: - new_segment[key] = original_segment[key] - - if seq_type == "dialogue": - dialogue_sequences.append(new_segment) - else: - sound_sequences.append(new_segment) - - @staticmethod - def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: - cleaned = segment.copy() - if "text" in cleaned: - text = cleaned["text"] - text = re.sub(r'\s+', ' ', text).strip() - cleaned["text"] = text - - if cleaned.get("start") is None or cleaned.get("end") is None: - words = cleaned.get("words", []) - if words: - starts = [(w.get("start") or 0) for w in words if w.get("start") is not None] - ends = [(w.get("end") or 0) for w in words if w.get("end") is not None] - if starts: - cleaned["start"] = min(starts) - if ends: - cleaned["end"] = max(ends) - - return cleaned - - @staticmethod - def __enrich_sound_event(segment: Dict[str, Any]) -> 
Dict[str, Any]: - enriched = segment.copy() - enriched["sound_type"] = "sound" - return enriched - - @staticmethod - def __renumber_segments(segments: List[Dict]) -> List[Dict]: - for i, segment in enumerate(segments): - segment["id"] = i - return segments - - @staticmethod - def __convert_to_simple_format(segments: List[Dict]) -> List[Dict]: - simple_segments = [] - for seg in segments: - simple_seg = { - "id": seg.get("id"), - "text": seg.get("text", ""), - "start": seg.get("start") or 0.0, - "end": seg.get("end") or 0.0, - } - if "sound_type" in seg: - simple_seg["sound_type"] = seg["sound_type"] - simple_segments.append(simple_seg) - return simple_segments - - def __generate_txt_files(self, original_txt: Path, clean_txt: Path, sound_txt: Path) -> None: - if not original_txt.exists(): - self.logger.warning(f"Original TXT file not found: {original_txt}") - return - - with open(original_txt, "r", encoding="utf-8") as f: - original_content = f.read() - - clean_content = re.sub(r'\([^)]*\)', '', original_content) - clean_content = re.sub(r'\s+', ' ', clean_content).strip() - - sound_matches = re.findall(r'\([^)]*\)', original_content) - sound_content = ' '.join(sound_matches) - - with open(clean_txt, "w", encoding="utf-8") as f: - f.write(clean_content) - - with open(sound_txt, "w", encoding="utf-8") as f: - f.write(sound_content) - - @staticmethod - def __generate_srt_files( - dialogue_segments: List[Dict], - sound_segments: List[Dict], - clean_srt: Path, - sound_srt: Path, - ) -> None: - def format_timestamp(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - millis = int((seconds % 1) * 1000) - return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" - - def _write_srt(segments: List[Dict], output_path: Path) -> None: - with open(output_path, "w", encoding="utf-8") as f: - for idx, seg in enumerate(segments, start=1): - words = seg.get("words", []) - text = seg.get("text", "").strip() - - 
if not text or not words: - continue - - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - continue - - start_time = min((w.get("start") or 0.0) for w in non_spacing_words) - end_time = max((w.get("end") or 0.0) for w in non_spacing_words) - - f.write(f"{idx}\n") - f.write(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n") - f.write(f"{text}\n\n") - - _write_srt(dialogue_segments, clean_srt) - _write_srt(sound_segments, sound_srt) - - def _get_progress_description(self) -> str: - return "Separating sound events from dialogues" diff --git a/preprocessor/transcription/processors/unicode_fixer.py b/preprocessor/transcription/processors/unicode_fixer.py deleted file mode 100644 index 2d84a903c..000000000 --- a/preprocessor/transcription/processors/unicode_fixer.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.transcription_utils import fix_transcription_file_unicode - - -class TranscriptionUnicodeFixer(BaseProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=2, - loglevel=args.get("loglevel", 20), - ) - - self.transcription_jsons = Path(self._args.get("transcription_jsons", settings.transcription.output_dir)) - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, args: Dict[str, Any]) -> None: - pass - - def _get_processing_items(self) -> List[ProcessingItem]: - transcription_files = list(self.transcription_jsons.rglob("*.json")) - - return [ - ProcessingItem( - 
episode_id=f"unicode_fix_{i}", - input_path=trans_file, - metadata={"file": trans_file}, - ) - for i, trans_file in enumerate(transcription_files) - ] - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - return [OutputSpec(path=item.input_path, required=True)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - trans_file = item.metadata["file"] - - try: - was_fixed = fix_transcription_file_unicode(trans_file) - if was_fixed: - self.logger.info(f"Fixed unicode escapes in: {trans_file.name}") - else: - self.logger.debug(f"No unicode escapes found in: {trans_file.name}") - except Exception as e: - self.logger.error(f"Error fixing unicode in {trans_file.name}: {e}") diff --git a/preprocessor/transcription/whisper_utils.py b/preprocessor/transcription/whisper_utils.py deleted file mode 100644 index 8015068b3..000000000 --- a/preprocessor/transcription/whisper_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import ( - Any, - Dict, -) - -LANGUAGE_MAP = { - "polish": "pl", - "english": "en", - "german": "de", - "french": "fr", - "spanish": "es", -} - - -def get_language_code(language: str) -> str: - return LANGUAGE_MAP.get(language.lower(), language.lower()) - - -def _process_whisper_segment(segment) -> Dict[str, Any]: - words = [] - if hasattr(segment, 'words') and segment.words: - for word in segment.words: - words.append({ - "word": word.word, - "start": word.start, - "end": word.end, - "probability": word.probability, - }) - - return { - "id": segment.id, - "seek": 0, - "start": segment.start, - "end": segment.end, - "text": segment.text, - "tokens": [], - "avg_logprob": segment.avg_logprob, - "compression_ratio": segment.compression_ratio, - "no_speech_prob": segment.no_speech_prob, - "words": words, - } - - -def build_transcription_result(segments, language: str = None) -> Dict[str, Any]: - result = { - "text": "", - "segments": [], - } - - if language: - result["language"] = language - - 
for segment in segments: - segment_dict = _process_whisper_segment(segment) - result["segments"].append(segment_dict) - result["text"] += segment.text - - return result diff --git a/preprocessor/utils/__init__.py b/preprocessor/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/utils/batch_processing_utils.py b/preprocessor/utils/batch_processing_utils.py deleted file mode 100644 index 453885b4e..000000000 --- a/preprocessor/utils/batch_processing_utils.py +++ /dev/null @@ -1,220 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -import json -from pathlib import Path -import time -from typing import ( - Any, - Dict, - Iterator, - List, - Optional, - Tuple, -) - -from PIL import Image - -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.hashing.image_hasher import PerceptualHasher -from preprocessor.utils.console import console -from preprocessor.utils.frame_utils import load_frames_from_requests -from preprocessor.utils.time_utils import format_time_hms - - -def _prefetch_batches( - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - batch_size: int, - convert_rgb: bool = False, - prefetch_count: int = 2, -) -> Iterator[Tuple[int, List[Dict[str, Any]], List[Image.Image]]]: - total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - - with ThreadPoolExecutor(max_workers=prefetch_count) as executor: - futures = {} - - for chunk_idx in range(total_chunks): - chunk_start = chunk_idx * batch_size - chunk_end = min(chunk_start + batch_size, len(frame_requests)) - chunk_requests = frame_requests[chunk_start:chunk_end] - - future = executor.submit(load_frames_from_requests, frames_dir, chunk_requests, convert_rgb) - futures[chunk_idx] = (chunk_requests, future) - - if len(futures) >= prefetch_count or chunk_idx == total_chunks - 1: - next_idx = chunk_idx - len(futures) + 1 - chunk_reqs, future = futures.pop(next_idx) - pil_images = future.result() - yield 
next_idx, chunk_reqs, pil_images - - -def compute_hashes_in_batches( - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - hasher: PerceptualHasher, - batch_size: int, -) -> List[Dict[str, Any]]: - total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - results = [] - - console.print(f"[cyan]Computing hashes for {len(frame_requests)} frames in {total_chunks} batches[/cyan]") - - start_time = time.time() - - for chunk_idx in range(total_chunks): - chunk_start = chunk_idx * batch_size - chunk_end = min(chunk_start + batch_size, len(frame_requests)) - chunk_requests = frame_requests[chunk_start:chunk_end] - - pil_images = load_frames_from_requests(frames_dir, chunk_requests) - phashes = hasher.compute_phash_batch(pil_images) - - for request, phash in zip(chunk_requests, phashes): - result = request.copy() - result["perceptual_hash"] = phash - results.append(result) - - del pil_images - - _report_batch_progress( - chunk_idx + 1, - total_chunks, - chunk_idx + 1, - total_chunks, - start_time, - ) - - console.print(f"[green]✓ Computed {len(results)} hashes[/green]") - return results - - -def compute_embeddings_in_batches( # pylint: disable=too-many-locals - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - gpu_processor: GPUBatchProcessor, - batch_size: int, - image_hashes: Dict[int, str], - checkpoint_file: Optional[Path] = None, - checkpoint_interval: int = 20, - prefetch_count: int = 2, -) -> List[Dict[str, Any]]: - total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - embeddings = [] - start_chunk_idx = 0 - - if checkpoint_file and checkpoint_file.exists(): - console.print("[yellow]Found checkpoint file, resuming from last saved batch[/yellow]") - try: - with open(checkpoint_file, "r", encoding="utf-8") as f: - checkpoint_data = json.load(f) - embeddings = checkpoint_data.get("embeddings", []) - start_chunk_idx = checkpoint_data.get("last_batch_idx", 0) + 1 - console.print(f"[cyan]Resuming from batch 
{start_chunk_idx}/{total_chunks}[/cyan]") - except (json.JSONDecodeError, KeyError) as e: - console.print(f"[yellow]Failed to load checkpoint: {e}. Starting from beginning.[/yellow]") - start_chunk_idx = 0 - embeddings = [] - - console.print(f"[cyan]Computing embeddings for {len(frame_requests)} frames in {total_chunks} batches (with prefetch={prefetch_count})[/cyan]") - - actual_checkpoint_interval = min(checkpoint_interval, max(1, total_chunks // 2)) - if actual_checkpoint_interval != checkpoint_interval: - console.print(f"[dim cyan]Adjusted checkpoint interval: {actual_checkpoint_interval} (every ~50% of batches)[/dim cyan]") - - start_time = time.time() - processed_batches = 0 - batches_to_process = total_chunks - start_chunk_idx - - for chunk_idx, chunk_requests, pil_images in _prefetch_batches( - frames_dir, frame_requests, batch_size, convert_rgb=True, prefetch_count=prefetch_count, - ): - if chunk_idx < start_chunk_idx: - continue - - chunk_embeddings = gpu_processor.process_images_batch(pil_images, chunk_idx) - - for request, embedding in zip(chunk_requests, chunk_embeddings): - result = { - **request, - "embedding": embedding, - } - - frame_num = request.get("frame_number") - if frame_num is not None and frame_num in image_hashes: - result["perceptual_hash"] = image_hashes[frame_num] - - embeddings.append(result) - - del pil_images - del chunk_embeddings - - processed_batches += 1 - _report_batch_progress( - processed_batches, - batches_to_process, - chunk_idx + 1, - total_chunks, - start_time, - ) - - if checkpoint_file and (chunk_idx + 1) % actual_checkpoint_interval == 0: - _save_checkpoint(checkpoint_file, chunk_idx, embeddings) - - if checkpoint_file and checkpoint_file.exists(): - checkpoint_file.unlink() - console.print("[cyan]Checkpoint file removed[/cyan]") - - vram_stats = gpu_processor.get_vram_stats() - if vram_stats: - console.print( - f"[cyan]VRAM usage: max={vram_stats['max_vram_gb']}GB, " - f"avg={vram_stats['avg_vram_gb']}GB[/cyan]", - ) 
- suggested_batch = gpu_processor.suggest_optimal_batch_size(target_vram_gb=21.0) - if suggested_batch != batch_size: - console.print( - f"[yellow]Suggested batch_size for 21GB VRAM target: {suggested_batch} " - f"(current: {batch_size})[/yellow]", - ) - - console.print(f"[green]✓ Computed {len(embeddings)} embeddings[/green]") - return embeddings - - -def _report_batch_progress( - processed: int, - total_to_process: int, - current_batch: int, - total_batches: int, - start_time: float, -) -> None: - elapsed = time.time() - start_time - percent = (processed / total_to_process * 100) if total_to_process > 0 else 0 - - if 0 < processed < total_to_process: - rate = processed / elapsed if elapsed > 0 else 0 - remaining = total_to_process - processed - eta_seconds = remaining / rate if rate > 0 else 0 - eta = format_time_hms(eta_seconds) if eta_seconds > 0 else "0:00:00" - rate_str = f"{rate:.2f} batch/s" - elif processed >= total_to_process: - eta = "0:00:00" - rate_str = f"{processed / elapsed:.2f} batch/s" if elapsed > 0 else "N/A" - else: - eta = "-:--:--" - rate_str = "N/A" - - console.print( - f" [dim cyan]Batch {current_batch}/{total_batches} " - f"({percent:.1f}%) | {rate_str} | ETA: {eta}[/dim cyan]", - ) - - -def _save_checkpoint(checkpoint_file: Path, last_batch_idx: int, embeddings: List[Dict[str, Any]]) -> None: - checkpoint_file.parent.mkdir(parents=True, exist_ok=True) - checkpoint_data = { - "last_batch_idx": last_batch_idx, - "embeddings": embeddings, - } - with open(checkpoint_file, "w", encoding="utf-8") as f: - json.dump(checkpoint_data, f) - console.print(f"[dim cyan]Checkpoint saved at batch {last_batch_idx + 1}[/dim cyan]") diff --git a/preprocessor/utils/console.py b/preprocessor/utils/console.py deleted file mode 100644 index a23e4718e..000000000 --- a/preprocessor/utils/console.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import sys -import time - -from rich.console import Console - -from preprocessor.utils.time_utils import format_time_hms - 
-_console_instance = None - - -def _get_console() -> Console: - global _console_instance # pylint: disable=global-statement - if _console_instance is None: - in_docker = os.path.exists('/.dockerenv') or os.getenv('DOCKER_CONTAINER', 'false') == 'true' - - _console_instance = Console( - force_terminal=True, - file=sys.stderr, - color_system="standard" if in_docker else "auto", - ) - return _console_instance - - -class SimpleProgress: - def __init__(self): - self.tasks = {} - self.task_counter = 0 - self.console = console - - def add_task(self, description: str, total: int): - task_id = self.task_counter - self.task_counter += 1 - self.tasks[task_id] = { - 'description': description, - 'total': total, - 'completed': 0, - 'start_time': time.time(), - 'last_print': 0, - } - self.__print_progress(task_id) - return task_id - - def advance(self, task_id: int, advance: int = 1): - if task_id not in self.tasks: - return - - task = self.tasks[task_id] - task['completed'] += advance - - current_time = time.time() - if current_time - task['last_print'] >= 1.0 or task['completed'] >= task['total']: - self.__print_progress(task_id) - task['last_print'] = current_time - - def __print_progress(self, task_id: int): - task = self.tasks[task_id] - completed = task['completed'] - total = task['total'] - percent = (completed / total * 100) if total > 0 else 0 - - elapsed = time.time() - task['start_time'] - if 0 < completed < total: - eta_seconds = (elapsed / completed) * (total - completed) - eta = format_time_hms(eta_seconds) - elif completed >= total: - eta = "0:00:00" - else: - eta = "-:--:--" - - bar_width = 30 - filled = int(bar_width * completed / total) if total > 0 else 0 - progress_bar = "━" * filled + "╸" + "─" * (bar_width - filled - 1) if filled < bar_width else "━" * bar_width - - console.print( - f"[bold blue]{task['description']}[/bold blue] " - f"[cyan]{progress_bar}[/cyan] " - f"[green]{percent:3.0f}%[/green] " - f"[yellow]{completed}/{total}[/yellow] " - f"[dim]ETA: 
{eta}[/dim]", - highlight=False, - ) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - -def create_progress() -> SimpleProgress: - return SimpleProgress() - - -console = _get_console() diff --git a/preprocessor/utils/constants.py b/preprocessor/utils/constants.py deleted file mode 100644 index e00986f2a..000000000 --- a/preprocessor/utils/constants.py +++ /dev/null @@ -1,200 +0,0 @@ -# pylint: disable=duplicate-code - -class SegmentKeys: - START_TIME = "start_time" - END_TIME = "end_time" - TEXT = "text" - VIDEO_PATH = "video_path" - SEGMENT_ID = "segment_id" - ID = "id" - START = "start" - END = "end" - - -class EpisodeMetadataKeys: - EPISODE_METADATA = "episode_metadata" - EPISODE_INFO = "episode_info" - SEASON = "season" - EPISODE_NUMBER = "episode_number" - SERIES_NAME = "series_name" - TITLE = "title" - PREMIERE_DATE = "premiere_date" - VIEWERSHIP = "viewership" - - -class ElasticsearchKeys: - SOURCE = "_source" - SCORE = "_score" - HITS = "hits" - TOTAL = "total" - AGGREGATIONS = "aggregations" - BUCKETS = "buckets" - KEY = "key" - - -class ElasticsearchAggregationKeys: - UNIQUE_EPISODES = "unique_episodes" - SEASONS = "seasons" - VALUE = "value" - - -class TranscriptionContextKeys: - TARGET = "target" - CONTEXT = "context" - OVERALL_START_TIME = "overall_start_time" - OVERALL_END_TIME = "overall_end_time" - - -class ElasticsearchQueryKeys: - QUERY = "query" - TERM = "term" - MATCH = "match" - BOOL = "bool" - MUST = "must" - FILTER = "filter" - RANGE = "range" - SIZE = "size" - SORT = "sort" - ORDER = "order" - ASC = "asc" - DESC = "desc" - FUZZINESS = "fuzziness" - AUTO = "AUTO" - TERMS = "terms" - FIELD = "field" - AGGS = "aggs" - CARDINALITY = "cardinality" - TOP_HITS = "top_hits" - INCLUDES = "includes" - LT = "lt" - GT = "gt" - SOURCE = "_source" - KEY = "_key" - - -class EpisodesDataKeys: - SEASONS = "seasons" - SEASON_NUMBER = "season_number" - EPISODES = "episodes" - - -class FfprobeKeys: - STREAMS = 
"streams" - FORMAT = "format" - - -class FfprobeStreamKeys: - R_FRAME_RATE = "r_frame_rate" - BIT_RATE = "bit_rate" - CODEC_NAME = "codec_name" - WIDTH = "width" - HEIGHT = "height" - DURATION = "duration" - - -class FfprobeFormatKeys: - DURATION = "duration" - SIZE = "size" - - -class DetectionKeys: - DETECTIONS = "detections" - CHARACTERS = "characters" - FRAME_NUMBER = "frame_number" - FRAME = "frame" - FRAME_NAME = "frame_name" - FRAME_FILE = "frame_file" - - -class CharacterDetectionKeys: - NAME = "name" - CONFIDENCE = "confidence" - EMOTION = "emotion" - BBOX = "bbox" - - -class EmotionKeys: - LABEL = "label" - CONFIDENCE = "confidence" - - -class ObjectDetectionKeys: - CLASS_NAME = "class_name" - CLASS_ID = "class_id" - CONFIDENCE = "confidence" - BBOX = "bbox" - - -class SceneKeys: - SCENES = "scenes" - START = "start" - END = "end" - SCENE_NUMBER = "scene_number" - SCENE_START_FRAME = "scene_start_frame" - SCENE_END_FRAME = "scene_end_frame" - SCENE_START_TIME = "scene_start_time" - SCENE_END_TIME = "scene_end_time" - - -class SceneTimeKeys: - SECONDS = "seconds" - FRAME = "frame" - - -class ElasticDocKeys: - SCENE_INFO = "scene_info" - CHARACTER_APPEARANCES = "character_appearances" - DETECTED_OBJECTS = "detected_objects" - PERCEPTUAL_HASH = "perceptual_hash" - PERCEPTUAL_HASH_INT = "perceptual_hash_int" - - -class EmbeddingKeys: - EPISODE_ID = "episode_id" - TITLE = "title" - TITLE_EMBEDDING = "title_embedding" - EPISODE_METADATA = "episode_metadata" - FRAME_NUMBER = "frame_number" - PERCEPTUAL_HASH = "perceptual_hash" - FRAME_PATH = "frame_path" - TIMESTAMP = "timestamp" - EMBEDDING = "embedding" - SCENE_NUMBER = "scene_number" - - -class ValidationMetadataKeys: - WIDTH = "width" - HEIGHT = "height" - FORMAT = "format" - SIZE_MB = "size_mb" - SIZE_BYTES = "size_bytes" - LINE_COUNT = "line_count" - CODEC = "codec" - DURATION = "duration" - - -class WordKeys: - TYPE = "type" - START = "start" - END = "end" - WORD = "word" - - -class WordTypeValues: - 
SPACING = "spacing" - AUDIO_EVENT = "audio_event" - - -class GoogleSearchKeys: - ENGINE = "engine" - Q = "q" - HL = "hl" - GL = "gl" - API_KEY = "api_key" - IMAGES_RESULTS = "images_results" - - -class ImageResultKeys: - ORIGINAL = "original" - THUMBNAIL = "thumbnail" - IMAGE = "image" diff --git a/preprocessor/utils/detection_io.py b/preprocessor/utils/detection_io.py deleted file mode 100644 index 6826786d9..000000000 --- a/preprocessor/utils/detection_io.py +++ /dev/null @@ -1,90 +0,0 @@ -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.characters.face_detection_utils import detect_characters_in_frame -from preprocessor.config.config import settings -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_minimal_episode_info - - -def _parse_frame_number(frame_filename: str) -> Optional[int]: - match = re.search(r'frame_(\d+)', frame_filename) - if match: - return int(match.group(1)) - return None - - -def save_character_detections( - episode_info, - results: List[Dict[str, Any]], - fps: float = 25.0, -) -> None: - detections_data = { - "episode_info": create_minimal_episode_info(episode_info), - "video_metadata": { - "fps": fps, - }, - "detections": results, - } - - file_naming = FileNamingConventions(episode_info.series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_output = OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.character_detections, - detections_filename, - ) - atomic_write_json(detections_output, detections_data, indent=2, ensure_ascii=False) - - frames_with_chars = sum(1 for r in results if r["characters"]) - 
console.print( - f"[green]✓ {episode_info.episode_code()}: {len(results)} frames, " - f"{frames_with_chars} with characters[/green]", - ) - - -def process_frames_for_detection( - frame_files: List[Path], - face_app, - character_vectors: Dict[str, Any], - threshold: float, - fps: float = 25.0, -) -> List[Dict[str, Any]]: - results = [] - for idx, frame_path in enumerate(frame_files): - detected_chars = detect_characters_in_frame( - frame_path, - face_app, - character_vectors, - threshold, - ) - - frame_number = _parse_frame_number(frame_path.name) - timestamp = frame_number / fps if frame_number is not None else None - - frame_result = { - "frame_number": frame_number, - "timestamp": timestamp, - "frame_file": frame_path.name, - "characters": detected_chars, - } - - results.append(frame_result) - - if (idx + 1) % 100 == 0: - console.print(f" Processed {idx + 1}/{len(frame_files)} frames") - - return results diff --git a/preprocessor/utils/emotion_utils.py b/preprocessor/utils/emotion_utils.py deleted file mode 100644 index 3e09bf26d..000000000 --- a/preprocessor/utils/emotion_utils.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import ( - Dict, - List, - Optional, - Tuple, -) - -from hsemotion_onnx.facial_emotions import HSEmotionRecognizer -import numpy as np - -from preprocessor.config.config import settings -from preprocessor.utils.console import console - -EMOTION_LABELS = [ - 'anger', - 'contempt', - 'disgust', - 'fear', - 'happiness', - 'neutral', - 'sadness', - 'surprise', -] - - -def init_emotion_model() -> HSEmotionRecognizer: - model_name = settings.emotion_detection.model_name - - console.print(f"[cyan]Loading HSEmotion model: {model_name}...[/cyan]") - - try: - fer = HSEmotionRecognizer(model_name=model_name) - console.print(f"[green]✓ HSEmotion model loaded: {model_name}[/green]") - return fer - except Exception as e: - raise RuntimeError(f"Failed to load HSEmotion model {model_name}: {e}") from e - - -def detect_emotion( - face_image: np.ndarray, - 
model: HSEmotionRecognizer, -) -> Tuple[str, float, Dict[str, float]]: - try: - emotion, scores = model.predict_emotions(face_image, logits=False) - - emotion_scores = { - EMOTION_LABELS[i]: float(scores[i]) - for i in range(len(EMOTION_LABELS)) - } - - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - - return dominant_emotion, confidence, emotion_scores - - except Exception as e: - raise RuntimeError(f"Emotion detection failed: {e}") from e - - -def crop_face_from_frame(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: - try: - x1, y1 = bbox['x1'], bbox['y1'] - x2, y2 = bbox['x2'], bbox['y2'] - - if x1 < 0 or y1 < 0 or x2 > frame.shape[1] or y2 > frame.shape[0]: - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(frame.shape[1], x2) - y2 = min(frame.shape[0], y2) - - if x2 <= x1 or y2 <= y1: - return None - - face_crop = frame[y1:y2, x1:x2] - - if face_crop.size == 0: - return None - - return face_crop - - except Exception: - return None - - -def detect_emotions_batch( - face_images: List[np.ndarray], - model: HSEmotionRecognizer, - batch_size: int = 32, -) -> List[Tuple[str, float, Dict[str, float]]]: - results = [] - total = len(face_images) - - for batch_start in range(0, total, batch_size): - batch_end = min(batch_start + batch_size, total) - batch = face_images[batch_start:batch_end] - - progress_pct = int((batch_end / total) * 100) - console.print(f"[cyan] Processing batch {batch_start}-{batch_end}/{total} ({progress_pct}%)[/cyan]") - - try: - batch_results = model.predict_multi_emotions(batch, logits=False) - - for emotion, scores in batch_results: - emotion_scores = { - EMOTION_LABELS[i]: float(scores[i]) - for i in range(len(EMOTION_LABELS)) - } - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - - results.append((dominant_emotion, confidence, emotion_scores)) - - except Exception: - for face_img in batch: - try: - emotion, scores = model.predict_emotions(face_img, logits=False) - emotion_scores = { - 
EMOTION_LABELS[i]: float(scores[i]) - for i in range(len(EMOTION_LABELS)) - } - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - results.append((dominant_emotion, confidence, emotion_scores)) - except Exception: - results.append(None) - - return results diff --git a/preprocessor/utils/error_handling_logger.py b/preprocessor/utils/error_handling_logger.py deleted file mode 100644 index 3f2cf1628..000000000 --- a/preprocessor/utils/error_handling_logger.py +++ /dev/null @@ -1,95 +0,0 @@ -import logging -from typing import List - -from rich.logging import RichHandler -from rich.panel import Panel - -from preprocessor.utils.console import console - - -class LoggerNotFinalizedException(Exception): - def __init__(self): - super().__init__("Logger destroyed without finalize() being called.") - - -class ErrorHandlingLogger: - def __init__(self, class_name: str, loglevel: int, error_exit_code: int) -> None: - self.__class_name: str = class_name - self.__error_exit_code: int = error_exit_code - self.__errors: List[str] = [] - self.__is_finalized: bool = False - - self.__setup_logger(loglevel) - - def __del__(self) -> None: - if not self.__is_finalized: - self.__logger.error( - f"ErrorHandlingLogger for '{self.__class_name}' destroyed without finalize().", - ) - if self.__errors: - self.__logger.error("Logged errors:") - for error in self.__errors: - self.__logger.error(f"- {error}") - raise LoggerNotFinalizedException - - def __setup_logger(self, level: int) -> None: - logging.basicConfig( - level=level, - format="%(message)s", - handlers=[ - RichHandler( - console=console, - rich_tracebacks=True, - show_time=True, - show_path=False, - ), - ], - force=True, - ) - self.__logger: logging.Logger = logging.getLogger(self.__class_name) - - def log(self, level: int, message: str) -> None: - if level == logging.ERROR: - self.__logger.error(message) - elif level == logging.INFO: - self.__logger.info(message) - elif level == logging.WARNING: - 
self.__logger.warning(message) - elif level == logging.DEBUG: - self.__logger.debug(message) - else: - raise RuntimeError(f"Logging level {level} is not supported.") - - def info(self, message: str) -> None: - self.__logger.info(message) - - def error(self, message: str) -> None: - self.__logger.error(message) - self.__errors.append(message) - - def warning(self, message: str) -> None: - self.__logger.warning(message) - - def debug(self, message: str) -> None: - self.__logger.debug(message) - - def finalize(self) -> int: - self.__is_finalized = True - if self.__errors: - console.print( - Panel( - f"[bold red]Processing for '{self.__class_name}' completed with {len(self.__errors)} error(s)[/bold red]", - title="Errors Occurred", - border_style="red", - ), - ) - return self.__error_exit_code - - console.print( - Panel( - f"[bold green]Processing for '{self.__class_name}' completed successfully[/bold green]", - title="Success", - border_style="green", - ), - ) - return 0 diff --git a/preprocessor/utils/file_utils.py b/preprocessor/utils/file_utils.py deleted file mode 100644 index b2747a91e..000000000 --- a/preprocessor/utils/file_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -import json -from pathlib import Path -from typing import Any - - -def atomic_write_json(output_path: Path, data: Any, **kwargs) -> None: - kwargs.setdefault('ensure_ascii', False) - temp_path = output_path.with_suffix(output_path.suffix + '.tmp') - with open(temp_path, 'w', encoding='utf-8') as f: - json.dump(data, f, **kwargs) - temp_path.replace(output_path) - - -def atomic_write_text(output_path: Path, content: str) -> None: - temp_path = output_path.with_suffix(output_path.suffix + '.tmp') - with open(temp_path, 'w', encoding='utf-8') as f: - f.write(content) - temp_path.replace(output_path) diff --git a/preprocessor/utils/frame_utils.py b/preprocessor/utils/frame_utils.py deleted file mode 100644 index d1f2fe32f..000000000 --- a/preprocessor/utils/frame_utils.py +++ /dev/null @@ -1,40 +0,0 @@ 
-from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from PIL import Image - - -def _load_single_frame(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: - if "frame_path" in request: - frame_path = frames_dir / request["frame_path"] - else: - frame_num = request["frame_number"] - frame_path = frames_dir / f"frame_{frame_num:06d}.jpg" - - if frame_path.exists(): - img = Image.open(frame_path) - if convert_rgb and img.mode != 'RGB': - img = img.convert('RGB') - return img - return Image.new('RGB', (1, 1)) - - -def load_frames_from_requests( - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - convert_rgb: bool = False, - num_workers: int = 4, -) -> List[Image.Image]: - with ThreadPoolExecutor(max_workers=num_workers) as executor: - images = list( - executor.map( - lambda req: _load_single_frame(frames_dir, req, convert_rgb), - frame_requests, - ), - ) - return images diff --git a/preprocessor/utils/image_hash_utils.py b/preprocessor/utils/image_hash_utils.py deleted file mode 100644 index e788b5d37..000000000 --- a/preprocessor/utils/image_hash_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -import json -from typing import ( - Any, - Dict, -) - -from preprocessor.config.config import settings -from preprocessor.core.episode_manager import EpisodeInfo -from preprocessor.core.output_path_builder import OutputPathBuilder - - -def load_image_hashes_for_episode(episode_info_dict: Dict[str, Any], logger=None) -> Dict[int, str]: - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - if season is None or episode is None: - return {} - - episode_info = EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", - ) - hashes_episode_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.image_hashes) - - hash_files = list(hashes_episode_dir.glob("*_image_hashes.json")) - if not 
hash_files: - if logger: - logger.debug(f"Image hashes not found in: {hashes_episode_dir}") - return {} - - hashes_file = hash_files[0] - - if not hashes_file.exists(): - if logger: - logger.debug(f"Image hashes not found: {hashes_file}") - return {} - - try: - with open(hashes_file, "r", encoding="utf-8") as f: - data = json.load(f) - - hash_map = {} - for item in data.get("image_hashes", []): - frame_num = item.get("frame_number") - phash = item.get("perceptual_hash") - if frame_num is not None and phash: - hash_map[frame_num] = phash - - return hash_map - except Exception as e: - if logger: - logger.error(f"Failed to load image hashes: {e}") - return {} diff --git a/preprocessor/utils/metadata_utils.py b/preprocessor/utils/metadata_utils.py deleted file mode 100644 index b1082a99a..000000000 --- a/preprocessor/utils/metadata_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from datetime import datetime -from typing import ( - Any, - Dict, - List, -) - - -def create_minimal_episode_info(episode_info) -> Dict[str, Any]: - return { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - } - - -def create_processing_metadata( - episode_info, - processing_params: Dict[str, Any], - statistics: Dict[str, Any], - results_key: str, - results_data: List[Any], -) -> Dict[str, Any]: - return { - "generated_at": datetime.now().isoformat(), - "episode_info": create_minimal_episode_info(episode_info), - "processing_parameters": processing_params, - "statistics": statistics, - results_key: results_data, - } diff --git a/preprocessor/utils/progress_tracker.py b/preprocessor/utils/progress_tracker.py deleted file mode 100644 index b5ff79d18..000000000 --- a/preprocessor/utils/progress_tracker.py +++ /dev/null @@ -1,78 +0,0 @@ -from contextlib import contextmanager -import time -from typing import Optional - -from preprocessor.utils.console import console -from preprocessor.utils.time_utils import ( - format_time_hms, - format_time_human, -) - - -class 
ProgressTracker: - def __init__(self): - self.current_operation: Optional[str] = None - self.start_time: Optional[float] = None - - @contextmanager - def track_operation(self, operation_name: str, total: int): - self.current_operation = operation_name - self.start_time = time.time() - console.print(f" [cyan]{operation_name} (total: {total})...[/cyan]") - - tracker = OperationTracker( - operation_name=operation_name, - total=total, - start_time=self.start_time, - ) - - try: - yield tracker - finally: - if tracker.completed > 0: - elapsed = time.time() - self.start_time - console.print( - f" [green]✓ {operation_name} completed: " - f"{tracker.completed}/{total} in {format_time_human(elapsed)}[/green]", - ) - - -class OperationTracker: - def __init__(self, operation_name: str, total: int, start_time: float): - self.operation_name = operation_name - self.total = total - self.completed = 0 - self.start_time = start_time - self.last_report = 0 - - def update(self, completed: int, interval: int = 10): - self.completed = completed - - should_report = ( - completed % interval == 0 or - completed == self.total or - completed == 1 - ) - - if should_report and completed != self.last_report: - self.__report_progress() - self.last_report = completed - - def __report_progress(self): - elapsed = time.time() - self.start_time - percent = (self.completed / self.total * 100) if self.total > 0 else 0 - - if 0 < self.completed < self.total: - rate = self.completed / elapsed if elapsed > 0 else 0 - remaining = self.total - self.completed - eta_seconds = remaining / rate if rate > 0 else 0 - eta = format_time_hms(eta_seconds) if eta_seconds > 0 else "0:00:00" - elif self.completed >= self.total: - eta = "0:00:00" - else: - eta = "-:--:--" - - console.print( - f" [dim]{self.operation_name}: {self.completed}/{self.total} " - f"({percent:.0f}%) ETA: {eta}[/dim]", - ) diff --git a/preprocessor/utils/resolution.py b/preprocessor/utils/resolution.py deleted file mode 100644 index 
df14f73b5..000000000 --- a/preprocessor/utils/resolution.py +++ /dev/null @@ -1,42 +0,0 @@ -from enum import Enum -from typing import ( - List, - Type, - TypeVar, -) - -# pylint: disable=duplicate-code - -T = TypeVar("T", bound="Resolution") - - -class Resolution(Enum): - R4320P = (7680, 4320) - R2160P = (3840, 2160) - R1440P = (2560, 1440) - R1080P = (1920, 1080) - R720P = (1280, 720) - R480P = (854, 480) - R360P = (640, 360) - R240P = (426, 240) - R144P = (256, 144) - - def __init__(self, width: int, height: int): - self.width = width - self.height = height - - def __str__(self): - return f"{self.height}p" - - @classmethod - def from_str(cls: Type[T], init: str) -> T: - init = init.strip() - if not init[0].isalpha(): - init = "R" + init.upper() - else: - init = init.upper() - return cls[init] - - @classmethod - def get_all_choices(cls) -> List[str]: - return [str(r) for r in cls] diff --git a/preprocessor/utils/time_utils.py b/preprocessor/utils/time_utils.py deleted file mode 100644 index a9ebf84e2..000000000 --- a/preprocessor/utils/time_utils.py +++ /dev/null @@ -1,17 +0,0 @@ -def format_time_hms(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - return f"{hours}:{minutes:02d}:{secs:02d}" - - -def format_time_human(seconds: float) -> str: - if seconds < 60: - return f"{seconds:.1f}s" - minutes = int(seconds // 60) - secs = int(seconds % 60) - if minutes < 60: - return f"{minutes}m {secs}s" - hours = minutes // 60 - minutes = minutes % 60 - return f"{hours}h {minutes}m {secs}s" diff --git a/preprocessor/utils/transcription_utils.py b/preprocessor/utils/transcription_utils.py deleted file mode 100644 index a8c9c4b13..000000000 --- a/preprocessor/utils/transcription_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -import codecs -import json -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, -) - - -def _convert_word_to_standard_format(word: Dict[str, Any]) -> Dict[str, 
Any]: - return { - "text": word.get("word", word.get("text", "")).strip(), - "start": word.get("start", 0.0), - "end": word.get("end", 0.0), - "type": "word", - "speaker_id": word.get("speaker_id", "speaker_unknown"), - "logprob": word.get("probability", 0.0), - } - - -def convert_words_list(seg_words: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - return [_convert_word_to_standard_format(word) for word in seg_words] - - -def _fix_unicode_escapes(text: str) -> str: - def replace_unicode(match): - unicode_str = match.group(0) - try: - return codecs.decode(unicode_str, 'unicode_escape') - except Exception: - return unicode_str - - pattern = r'\\u[0-9a-fA-F]{4}' - return re.sub(pattern, replace_unicode, text) - - -def fix_transcription_file_unicode(file_path: Path) -> bool: - try: - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - - if '\\u' not in content: - return False - - fixed_content = _fix_unicode_escapes(content) - - if fixed_content != content: - data = json.loads(fixed_content) - with open(file_path, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2, ensure_ascii=False) - return True - - return False - except Exception: - return False diff --git a/preprocessor/utils/video_utils.py b/preprocessor/utils/video_utils.py deleted file mode 100644 index 721fc8ceb..000000000 --- a/preprocessor/utils/video_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import ( - Generator, - Tuple, -) - -import decord -import numpy as np - - -def iterate_frames_with_histogram( - video_path: str, - sample_interval: int = 5, -) -> Generator[Tuple[int, np.ndarray, np.ndarray], None, None]: - vr = decord.VideoReader(video_path, ctx=decord.cpu(0)) - total_frames = len(vr) - - for frame_num in range(0, total_frames, sample_interval): - try: - frame_tensor = vr[frame_num] - frame_np = frame_tensor.numpy() - - gray = np.dot(frame_np[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8) - hist, _ = np.histogram(gray, bins=256, range=(0, 256)) - hist = 
hist / (hist.sum() + 1e-7) - - yield frame_num, frame_np, hist - - except (RuntimeError, ValueError, OSError): - break diff --git a/preprocessor/validation/__init__.py b/preprocessor/validation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/validation/episode_stats.py b/preprocessor/validation/episode_stats.py deleted file mode 100644 index db2109cff..000000000 --- a/preprocessor/validation/episode_stats.py +++ /dev/null @@ -1,507 +0,0 @@ -from dataclasses import ( - dataclass, - field, -) -import json -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) - -from preprocessor.config.config import settings -from preprocessor.core.constants import ( - OUTPUT_FILE_NAMES, - OUTPUT_FILE_PATTERNS, -) -from preprocessor.core.episode_manager import ( - EpisodeInfo, - EpisodeManager, -) -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.validation.base_result import ValidationStatusMixin -from preprocessor.validation.file_validators import ( - validate_image_file, - validate_json_file, - validate_jsonl_file, - validate_video_file, -) - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -@dataclass -class EpisodeStats(ValidationStatusMixin): # pylint: disable=too-many-instance-attributes - episode_info: EpisodeInfo - series_name: str - errors: List[str] = field(default_factory=list) - warnings: List[str] = field(default_factory=list) - - transcription_chars: Optional[int] = None - transcription_duration: Optional[float] = None - transcription_words: Optional[int] = None - - exported_frames_count: Optional[int] = None - exported_frames_total_size_mb: Optional[float] = None - exported_frames_avg_resolution: Optional[Tuple[int, int]] = None - - video_size_mb: Optional[float] = None - video_duration: Optional[float] = None - video_codec: Optional[str] = None - video_resolution: Optional[Tuple[int, int]] = None - - scenes_count: Optional[int] = None - 
scenes_avg_duration: Optional[float] = None - - image_hashes_count: Optional[int] = None - object_detections_count: Optional[int] = None - object_visualizations_count: Optional[int] = None - character_visualizations_count: Optional[int] = None - face_clusters_count: Optional[int] = None - face_clusters_total_faces: Optional[int] = None - - def collect_stats(self): - self.__validate_transcription() - self.__validate_exported_frames() - self.__validate_video() - self.__validate_scenes() - self.__validate_image_hashes() - self.__validate_character_visualizations() - self.__validate_face_clusters() - self.__validate_object_detections() - self.__validate_object_visualizations() - self.__validate_other_files() - - def __validate_transcription(self): - transcriptions_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.transcriptions) - base_name = f"{self.series_name}_{self.episode_info.episode_code()}" - - raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events - - transcription_files = { - "main": raw_dir / f"{base_name}.json", - "segmented": raw_dir / f"{base_name}_segmented.json", - "simple": raw_dir / f"{base_name}_simple.json", - "clean": clean_dir / f"{base_name}_clean_transcription.json", - "clean_txt": clean_dir / f"{base_name}_clean_transcription.txt", - "sound_events": sound_events_dir / f"{base_name}_sound_events.json", - } - - if not any(f.exists() for f in transcription_files.values()): - self.errors.append("No transcription files found in any format") - return - - self.__validate_raw_transcription(transcription_files) - self.__validate_clean_transcription(transcription_files["clean"]) - self.__validate_clean_txt(transcription_files["clean_txt"]) - self.__validate_sound_events(transcription_files["sound_events"]) - - 
def __validate_raw_transcription(self, transcription_files: Dict[str, Any]): - raw_transcription = None - for key in ("main", "segmented", "simple"): - if transcription_files[key].exists(): - raw_transcription = transcription_files[key] - break - - if not raw_transcription: - self.warnings.append("Missing raw transcription file (checked: .json, _segmented.json, _simple.json)") - return - - result = validate_json_file(raw_transcription) - if not result.is_valid: - self.errors.append(f"Invalid transcription JSON: {result.error_message}") - return - - self.__extract_transcription_stats(raw_transcription) - - def __extract_transcription_stats(self, raw_transcription): - try: - with open(raw_transcription, "r", encoding="utf-8") as f: - data = json.load(f) - - text = data.get("text", "") - if not text: - segments = data.get("segments", []) - if segments: - text = " ".join(seg.get("text", "") for seg in segments) - - self.transcription_chars = len(text) - self.transcription_words = len(text.split()) - - words = data.get("words", []) - if words: - self.transcription_duration = words[-1].get("end", 0.0) - else: - segments = data.get("segments", []) - if segments and segments[-1].get("end"): - self.transcription_duration = segments[-1].get("end", 0.0) - except Exception as e: - self.errors.append(f"Error reading transcription: {e}") - - def __validate_clean_transcription(self, clean_transcription_file): - if not clean_transcription_file.exists(): - self.warnings.append(f"Missing clean transcription file: {clean_transcription_file.name}") - return - - result = validate_json_file(clean_transcription_file) - if not result.is_valid: - self.warnings.append(f"Invalid clean transcription JSON: {result.error_message}") - - def __validate_clean_txt(self, clean_txt_file): - if not clean_txt_file.exists(): - self.warnings.append(f"Missing clean transcription txt: {clean_txt_file.name}") - - def __validate_sound_events(self, sound_events_file): - if not sound_events_file.exists(): - 
self.warnings.append(f"Missing sound events file: {sound_events_file.name}") - return - - result = validate_json_file(sound_events_file) - if not result.is_valid: - self.warnings.append(f"Invalid sound events JSON: {result.error_message}") - - def __validate_exported_frames(self): - frames_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.frames) - if not frames_dir.exists(): - self.warnings.append(f"Missing {settings.output_subdirs.frames} directory: {frames_dir}") - return - - frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS["frame"])) - if not frame_files: - self.warnings.append(f"No frames found in {settings.output_subdirs.frames}/") - return - - self.exported_frames_count = len(frame_files) - - total_size = 0 - resolutions = [] - invalid_count = 0 - - for frame_file in frame_files: - result = validate_image_file(frame_file) - if result.is_valid: - total_size += result.metadata["size_mb"] - resolutions.append((result.metadata["width"], result.metadata["height"])) - else: - invalid_count += 1 - self.errors.append(f"Invalid frame {frame_file.name}: {result.error_message}") - - if invalid_count > 0: - self.warnings.append(f"{invalid_count} invalid frames found") - - self.exported_frames_total_size_mb = round(total_size, 2) - - if resolutions: - most_common_res = max(set(resolutions), key=resolutions.count) - self.exported_frames_avg_resolution = most_common_res - - def __validate_video(self): - video_file = OutputPathBuilder.build_video_path(self.episode_info, self.series_name) - if not video_file.exists(): - self.warnings.append(f"Missing video file: {video_file}") - return - - result = validate_video_file(video_file) - if not result.is_valid: - self.errors.append(f"Invalid video: {result.error_message}") - return - - self.video_size_mb = result.metadata["size_mb"] - self.video_duration = result.metadata["duration"] - self.video_codec = result.metadata["codec"] - self.video_resolution = (result.metadata["width"], 
result.metadata["height"]) - - def __validate_scenes(self): - scenes_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.scenes) - scenes_file = scenes_dir / f"{self.series_name}_{self.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" - if not scenes_file.exists(): - self.errors.append(f"Missing scenes file: {scenes_file}") - return - - result = validate_json_file(scenes_file) - if not result.is_valid: - self.errors.append(f"Invalid scenes JSON: {result.error_message}") - return - - try: - with open(scenes_file, "r", encoding="utf-8") as f: - data = json.load(f) - - self.scenes_count = data.get("total_scenes", 0) - scenes = data.get("scenes", []) - if scenes: - durations = [scene.get("duration", 0) for scene in scenes] - self.scenes_avg_duration = round(sum(durations) / len(durations), 2) - except Exception as e: - self.errors.append(f"Error reading scenes: {e}") - - def __validate_image_hashes(self): - hashes_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.image_hashes) - if not hashes_dir.exists(): - self.warnings.append(f"Missing {settings.output_subdirs.image_hashes} directory") - return - - json_files = list(hashes_dir.glob("*.json")) - if not json_files: - self.warnings.append(f"No JSON files in {settings.output_subdirs.image_hashes}/") - return - - self.image_hashes_count = len(json_files) - sizes = [] - - for json_file in json_files: - result = validate_json_file(json_file) - if not result.is_valid: - self.errors.append(f"Invalid image hash JSON {json_file.name}: {result.error_message}") - else: - sizes.append(json_file.stat().st_size) - - self.__check_size_anomalies(sizes, "image_hashes") - - def __validate_character_visualizations(self): - viz_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.character_visualizations) - if not viz_dir.exists(): - return - - image_files = list(viz_dir.glob("*.jpg")) + list(viz_dir.glob("*.png")) - if not 
image_files: - self.warnings.append(f"No visualization images in {settings.output_subdirs.character_visualizations}/") - return - - self.character_visualizations_count = len(image_files) - invalid_count = 0 - - for img_file in image_files: - result = validate_image_file(img_file) - if not result.is_valid: - invalid_count += 1 - self.errors.append(f"Invalid character visualization {img_file.name}: {result.error_message}") - - if invalid_count > 0: - self.warnings.append(f"{invalid_count} invalid character visualization images found") - - def __validate_face_clusters(self): - clusters_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.face_clusters) - if not clusters_dir.exists(): - return - - metadata_files = list(clusters_dir.glob("*_face_clusters.json")) - metadata_file = metadata_files[0] if metadata_files else None - - if not metadata_file or not metadata_file.exists(): - self.warnings.append("Missing face clustering metadata file") - return - - result = validate_json_file(metadata_file) - if not result.is_valid: - self.errors.append(f"Invalid face clustering metadata: {result.error_message}") - return - - try: - with open(metadata_file, "r", encoding="utf-8") as f: - data = json.load(f) - - clusters = data.get("clusters", {}) - - if isinstance(clusters, dict): - self.face_clusters_count = len(clusters) - total_faces = 0 - for _, cluster_info in clusters.items(): - total_faces += cluster_info.get("face_count", 0) - elif isinstance(clusters, list): - self.face_clusters_count = len(clusters) - total_faces = 0 - for cluster_info in clusters: - total_faces += cluster_info.get("face_count", 0) - else: - self.warnings.append("Unexpected clusters format in face clustering metadata") - return - - noise_info = data.get("noise", {}) - if noise_info: - total_faces += noise_info.get("face_count", 0) - - self.face_clusters_total_faces = total_faces - - except Exception as e: - self.errors.append(f"Error reading face clustering metadata: {e}") 
- - def __validate_object_detections(self): - detections_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.object_detections) - if not detections_dir.exists(): - self.warnings.append(f"Missing {settings.output_subdirs.object_detections} directory") - return - - json_files = [f for f in detections_dir.glob("*.json") if "visualizations" not in str(f)] - if not json_files: - self.warnings.append(f"No JSON files in {settings.output_subdirs.object_detections}/") - return - - self.object_detections_count = len(json_files) - sizes = [] - - for json_file in json_files: - result = validate_json_file(json_file) - if not result.is_valid: - self.errors.append(f"Invalid object detection JSON {json_file.name}: {result.error_message}") - else: - sizes.append(json_file.stat().st_size) - - self.__check_size_anomalies(sizes, "object_detections") - - def __validate_object_visualizations(self): - viz_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.object_visualizations) - if not viz_dir.exists(): - return - - image_files = list(viz_dir.glob("*.jpg")) + list(viz_dir.glob("*.png")) - if not image_files: - self.warnings.append(f"No visualization images in {settings.output_subdirs.object_visualizations}/") - return - - self.object_visualizations_count = len(image_files) - invalid_count = 0 - - for img_file in image_files: - result = validate_image_file(img_file) - if not result.is_valid: - invalid_count += 1 - self.errors.append(f"Invalid visualization {img_file.name}: {result.error_message}") - - if invalid_count > 0: - self.warnings.append(f"{invalid_count} invalid visualization images found") - - def __validate_embedding_dimensions(self, jsonl_file, subdir: str): - embedding_fields = { - ELASTIC_SUBDIRS.text_embeddings: "text_embedding", - ELASTIC_SUBDIRS.video_frames: "video_embedding", - ELASTIC_SUBDIRS.episode_names: "title_embedding", - ELASTIC_SUBDIRS.full_episode_embeddings: "full_episode_embedding", - 
ELASTIC_SUBDIRS.sound_event_embeddings: "sound_event_embedding", - } - - if subdir not in embedding_fields: - return - - embedding_field = embedding_fields[subdir] - expected_dim = settings.embedding_model.embedding_dim - - try: - with open(jsonl_file, "r", encoding="utf-8") as f: - for line_num, line in enumerate(f, 1): - if not line.strip(): - continue - doc = json.loads(line) - if embedding_field in doc: - embedding = doc[embedding_field] - if isinstance(embedding, list): - actual_dim = len(embedding) - if actual_dim != expected_dim: - self.errors.append( - f"{jsonl_file.name} line {line_num}: " - f"{embedding_field} has {actual_dim} dimensions, expected {expected_dim}", - ) - return - except Exception as e: - self.errors.append(f"Error validating embeddings in {jsonl_file.name}: {e}") - - def __check_size_anomalies(self, sizes: List[int], folder_name: str, threshold: float = 0.2): - if len(sizes) < 2: - return - - avg_size = sum(sizes) / len(sizes) - if avg_size == 0: - return - - for i, size in enumerate(sizes): - deviation = abs(size - avg_size) / avg_size - if deviation > threshold: - self.warnings.append( - f"{folder_name} file #{i+1} size deviation: {deviation*100:.1f}% from average", - ) - - def __validate_other_files(self): - char_detections_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.character_detections) - detections_file = char_detections_dir / OUTPUT_FILE_NAMES["detections"] - if detections_file.exists(): - result = validate_json_file(detections_file) - if not result.is_valid: - self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") - - embeddings_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.embeddings) - if embeddings_dir.exists(): - embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES["embeddings_text"] - if embeddings_file.exists(): - result = validate_json_file(embeddings_file) - if not result.is_valid: - self.errors.append(f"Invalid 
{OUTPUT_FILE_NAMES['embeddings_text']}: {result.error_message}") - - elastic_subdirs = [ - ELASTIC_SUBDIRS.text_segments, - ELASTIC_SUBDIRS.text_embeddings, - ELASTIC_SUBDIRS.video_frames, - ELASTIC_SUBDIRS.episode_names, - ELASTIC_SUBDIRS.text_statistics, - ELASTIC_SUBDIRS.full_episode_embeddings, - ELASTIC_SUBDIRS.sound_events, - ELASTIC_SUBDIRS.sound_event_embeddings, - ] - found_elastic_docs = False - for subdir in elastic_subdirs: - elastic_docs_dir = EpisodeManager.get_episode_subdir( - self.episode_info, - f"{settings.output_subdirs.elastic_documents}/{subdir}", - ) - if elastic_docs_dir.exists(): - found_elastic_docs = True - for jsonl_file in elastic_docs_dir.glob("*.jsonl"): - result = validate_jsonl_file(jsonl_file) - if not result.is_valid: - self.errors.append(f"Invalid JSONL {jsonl_file.name}: {result.error_message}") - else: - self.__validate_embedding_dimensions(jsonl_file, subdir) - - if not found_elastic_docs: - self.warnings.append(f"Missing {settings.output_subdirs.elastic_documents} directory") - - transcriptions_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.transcriptions) - if transcriptions_dir.exists(): - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - text_stats_file = clean_dir / f"{self.series_name}_{self.episode_info.episode_code()}_text_stats.json" - if text_stats_file.exists(): - result = validate_json_file(text_stats_file) - if not result.is_valid: - self.errors.append(f"Invalid text_stats JSON: {result.error_message}") - else: - self.warnings.append(f"Missing text statistics file: {text_stats_file.name}") - - def to_dict(self) -> Dict[str, Any]: - return { - "status": self.status, - "errors": self.errors, - "warnings": self.warnings, - "stats": { - "transcription_chars": self.transcription_chars, - "transcription_duration": self.transcription_duration, - "transcription_words": self.transcription_words, - "exported_frames_count": self.exported_frames_count, - 
"exported_frames_total_size_mb": self.exported_frames_total_size_mb, - "exported_frames_avg_resolution": self.exported_frames_avg_resolution, - "video_size_mb": self.video_size_mb, - "video_duration": self.video_duration, - "video_codec": self.video_codec, - "video_resolution": self.video_resolution, - "scenes_count": self.scenes_count, - "scenes_avg_duration": self.scenes_avg_duration, - "image_hashes_count": self.image_hashes_count, - "character_visualizations_count": self.character_visualizations_count, - "face_clusters_count": self.face_clusters_count, - "face_clusters_total_faces": self.face_clusters_total_faces, - "object_detections_count": self.object_detections_count, - "object_visualizations_count": self.object_visualizations_count, - }, - } diff --git a/preprocessor/validation/file_validators.py b/preprocessor/validation/file_validators.py deleted file mode 100644 index bb3c7bff4..000000000 --- a/preprocessor/validation/file_validators.py +++ /dev/null @@ -1,178 +0,0 @@ -from dataclasses import dataclass -import json -from pathlib import Path -import subprocess -from typing import ( - Any, - Dict, - Optional, -) -import zipfile - -from PIL import Image - -from preprocessor.utils.constants import ( - FfprobeFormatKeys, - FfprobeKeys, - FfprobeStreamKeys, - ValidationMetadataKeys, -) - - -@dataclass -class ValidationResult: - is_valid: bool - error_message: Optional[str] = None - metadata: Optional[Dict[str, Any]] = None - - -def validate_json_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - with open(path, "r", encoding="utf-8") as f: - json.load(f) - return ValidationResult(is_valid=True, metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size}) - except json.JSONDecodeError as e: - return ValidationResult(is_valid=False, error_message=f"Invalid JSON: {e}") - except Exception as e: - return ValidationResult(is_valid=False, 
error_message=f"Error reading file: {e}") - - -def validate_jsonl_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - line_count = 0 - with open(path, "r", encoding="utf-8") as f: - for line_num, line in enumerate(f, 1): - line = line.strip() - if not line: - continue - try: - json.loads(line) - line_count += 1 - except json.JSONDecodeError as e: - return ValidationResult( - is_valid=False, - error_message=f"Invalid JSON at line {line_num}: {e}", - ) - return ValidationResult( - is_valid=True, - metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size, ValidationMetadataKeys.LINE_COUNT: line_count}, - ) - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error reading file: {e}") - - -def validate_image_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - with Image.open(path) as img: - img.verify() - with Image.open(path) as img: - width, height = img.size - format_type = img.format - size_mb = path.stat().st_size / (1024 * 1024) - - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.WIDTH: width, - ValidationMetadataKeys.HEIGHT: height, - ValidationMetadataKeys.FORMAT: format_type, - ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), - }, - ) - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Invalid image: {e}") - - -def validate_video_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - result = subprocess.run( - [ - "ffprobe", - "-v", - "error", - "-select_streams", - "v:0", - "-show_entries", - "stream=codec_name,width,height,duration", - "-show_entries", - "format=duration,size", - "-of", - "json", - str(path), - ], - capture_output=True, - 
text=True, - check=True, - ) - - probe_data = json.loads(result.stdout) - stream = probe_data.get(FfprobeKeys.STREAMS, [{}])[0] - format_info = probe_data.get(FfprobeKeys.FORMAT, {}) - - duration = float(stream.get(FfprobeStreamKeys.DURATION, format_info.get(FfprobeFormatKeys.DURATION, 0))) - size_bytes = int(format_info.get(FfprobeFormatKeys.SIZE, 0)) - size_mb = size_bytes / (1024 * 1024) - - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.CODEC: stream.get(FfprobeStreamKeys.CODEC_NAME), - ValidationMetadataKeys.WIDTH: stream.get(FfprobeStreamKeys.WIDTH), - ValidationMetadataKeys.HEIGHT: stream.get(FfprobeStreamKeys.HEIGHT), - ValidationMetadataKeys.DURATION: round(duration, 2), - ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), - }, - ) - except subprocess.CalledProcessError as e: - return ValidationResult(is_valid=False, error_message=f"ffprobe error: {e.stderr}") - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error validating video: {e}") - - -def validate_archive_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - with zipfile.ZipFile(path, "r") as zip_ref: - bad_file = zip_ref.testzip() - if bad_file: - return ValidationResult( - is_valid=False, - error_message=f"Corrupt file in archive: {bad_file}", - ) - - file_count = len(zip_ref.namelist()) - compressed_size = sum(info.compress_size for info in zip_ref.infolist()) - uncompressed_size = sum(info.file_size for info in zip_ref.infolist()) - - compression_ratio = 0 - if uncompressed_size > 0: - compression_ratio = (1 - compressed_size / uncompressed_size) * 100 - - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.SIZE_MB: round(path.stat().st_size / (1024 * 1024), 2), - "file_count": file_count, - "compressed_size_mb": round(compressed_size / (1024 * 1024), 2), - "uncompressed_size_mb": 
round(uncompressed_size / (1024 * 1024), 2), - "compression_ratio": round(compression_ratio, 2), - }, - ) - except zipfile.BadZipFile as e: - return ValidationResult(is_valid=False, error_message=f"Invalid ZIP file: {e}") - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error validating archive: {e}") diff --git a/preprocessor/validation/global_validator.py b/preprocessor/validation/global_validator.py deleted file mode 100644 index eef9cd9cd..000000000 --- a/preprocessor/validation/global_validator.py +++ /dev/null @@ -1,118 +0,0 @@ -from pathlib import Path -from typing import List - -from preprocessor.config.config import BASE_OUTPUT_DIR -from preprocessor.validation.base_result import BaseValidationResult -from preprocessor.validation.file_validators import ( - validate_image_file, - validate_json_file, -) - - -class GlobalValidationResult(BaseValidationResult): - pass - - -class GlobalValidator: - def __init__( - self, - series_name: str, - base_output_dir: Path = BASE_OUTPUT_DIR, - ): - self.series_name = series_name - self.base_output_dir = base_output_dir - self.result = GlobalValidationResult() - - def validate(self) -> GlobalValidationResult: - self.__validate_main_json_files() - self.__validate_characters_folder() - self.__validate_processing_metadata() - return self.result - - def __validate_main_json_files(self): - episodes_file = self.base_output_dir / f"{self.series_name}_episodes.json" - if episodes_file.exists(): - result = validate_json_file(episodes_file) - if not result.is_valid: - self.result.errors.append(f"Invalid {episodes_file.name}: {result.error_message}") - else: - self.result.stats["episodes_json_valid"] = True - else: - self.result.warnings.append(f"Missing {episodes_file.name}") - - characters_file = self.base_output_dir / f"{self.series_name}_characters.json" - if characters_file.exists(): - result = validate_json_file(characters_file) - if not result.is_valid: - self.result.errors.append(f"Invalid 
{characters_file.name}: {result.error_message}") - else: - self.result.stats["characters_json_valid"] = True - else: - self.result.warnings.append(f"Missing {characters_file.name}") - - def __validate_characters_folder(self): - characters_dir = self.base_output_dir / "characters" - if not characters_dir.exists(): - self.result.warnings.append("Missing characters/ directory") - return - - character_folders = [d for d in characters_dir.iterdir() if d.is_dir()] - if not character_folders: - self.result.warnings.append("No character folders in characters/") - return - - self.result.stats["character_folders_count"] = len(character_folders) - - total_images = 0 - invalid_images = 0 - characters_without_images: List[str] = [] - - for char_folder in character_folders: - image_files = ( - list(char_folder.glob("*.jpg")) + - list(char_folder.glob("*.jpeg")) + - list(char_folder.glob("*.png")) + - list(char_folder.glob("*.webp")) - ) - - if not image_files: - characters_without_images.append(char_folder.name) - continue - - total_images += len(image_files) - - for img_file in image_files: - result = validate_image_file(img_file) - if not result.is_valid: - invalid_images += 1 - self.result.errors.append( - f"Invalid character image {char_folder.name}/{img_file.name}: {result.error_message}", - ) - - self.result.stats["character_images_count"] = total_images - self.result.stats["invalid_character_images"] = invalid_images - - if characters_without_images: - self.result.warnings.append( - f"{len(characters_without_images)} characters without reference images", - ) - - def __validate_processing_metadata(self): - metadata_dir = self.base_output_dir / "processing_metadata" - if not metadata_dir.exists(): - self.result.warnings.append("Missing processing_metadata/ directory") - return - - json_files = list(metadata_dir.glob("*.json")) - if not json_files: - self.result.warnings.append("No JSON files in processing_metadata/") - return - - 
self.result.stats["processing_metadata_files"] = len(json_files) - - for json_file in json_files: - result = validate_json_file(json_file) - if not result.is_valid: - self.result.errors.append( - f"Invalid processing metadata {json_file.name}: {result.error_message}", - ) diff --git a/preprocessor/validation/report_generator.py b/preprocessor/validation/report_generator.py deleted file mode 100644 index d28d7421e..000000000 --- a/preprocessor/validation/report_generator.py +++ /dev/null @@ -1,41 +0,0 @@ -from datetime import datetime -import json -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.validation.episode_stats import EpisodeStats -from preprocessor.validation.season_comparator import SeasonComparison - - -class ReportGenerator: - def __init__(self, season: str, anomaly_threshold: float): - self.season = season - self.anomaly_threshold = anomaly_threshold - self.timestamp = datetime.now().isoformat() - - def generate_report( - self, - episodes_stats: Dict[str, EpisodeStats], - season_comparison: SeasonComparison, - output_path: Path, - ): - report = { - "validation_timestamp": self.timestamp, - "season": self.season, - "anomaly_threshold": self.anomaly_threshold, - "episodes": {episode_id: stats.to_dict() for episode_id, stats in episodes_stats.items()}, - "season_comparison": season_comparison.to_dict(), - } - - self.__save_report(report, output_path) - return report - - @staticmethod - def __save_report(report: Dict[str, Any], output_path: Path): - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(report, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/validation/season_comparator.py b/preprocessor/validation/season_comparator.py deleted file mode 100644 index 4ac621d17..000000000 --- a/preprocessor/validation/season_comparator.py +++ /dev/null @@ -1,127 +0,0 @@ -from dataclasses import ( - dataclass, - field, -) -from typing import ( - 
Any, - Dict, - List, - Optional, -) - -from preprocessor.validation.episode_stats import EpisodeStats - - -@dataclass -class MetricComparison: - metric_name: str - min_value: Optional[float] - max_value: Optional[float] - avg_value: Optional[float] - difference_percent: Optional[float] - - -@dataclass -class Anomaly: - episode: str - metric: str - value: float - avg: float - deviation_percent: float - severity: str - - -@dataclass -class SeasonComparison: - season: str - anomaly_threshold: float - metrics: Dict[str, MetricComparison] = field(default_factory=dict) - anomalies: List[Anomaly] = field(default_factory=list) - - def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]): - metric_keys = [ - "transcription_duration", - "transcription_chars", - "transcription_words", - "exported_frames_count", - "exported_frames_total_size_mb", - "video_size_mb", - "video_duration", - "scenes_count", - ] - - for metric_key in metric_keys: - self.__compare_metric(metric_key, episodes_stats) - - def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeStats]): - values = [] - episode_values = {} - - for episode_id, stats in episodes_stats.items(): - value = getattr(stats, metric_key, None) - if value is not None: - values.append(value) - episode_values[episode_id] = value - - if not values: - return - - min_val = min(values) - max_val = max(values) - avg_val = sum(values) / len(values) - - if min_val > 0: - diff_percent = ((max_val - min_val) / min_val) * 100 - else: - diff_percent = 0.0 - - self.metrics[metric_key] = MetricComparison( - metric_name=metric_key, - min_value=round(min_val, 2), - max_value=round(max_val, 2), - avg_value=round(avg_val, 2), - difference_percent=round(diff_percent, 2), - ) - - for episode_id, value in episode_values.items(): - if avg_val > 0: - deviation_percent = abs((value - avg_val) / avg_val) * 100 - else: - deviation_percent = 0.0 - - if deviation_percent > self.anomaly_threshold: - severity = "ERROR" if 
deviation_percent > self.anomaly_threshold * 2 else "WARNING" - self.anomalies.append( - Anomaly( - episode=episode_id, - metric=metric_key, - value=round(value, 2), - avg=round(avg_val, 2), - deviation_percent=round(deviation_percent, 2), - severity=severity, - ), - ) - - def to_dict(self) -> Dict[str, Any]: - return { - "metrics": { - metric_name: { - "min": metric.min_value, - "max": metric.max_value, - "avg": metric.avg_value, - "difference_percent": metric.difference_percent, - } - for metric_name, metric in self.metrics.items() - }, - "anomalies": [ - { - "episode": anomaly.episode, - "metric": anomaly.metric, - "value": anomaly.value, - "avg": anomaly.avg, - "deviation_percent": anomaly.deviation_percent, - "severity": anomaly.severity, - } - for anomaly in self.anomalies - ], - } diff --git a/preprocessor/validation/validator.py b/preprocessor/validation/validator.py deleted file mode 100644 index ef160e6b4..000000000 --- a/preprocessor/validation/validator.py +++ /dev/null @@ -1,155 +0,0 @@ -from datetime import datetime -from pathlib import Path -from typing import ( - Dict, - Optional, -) - -from rich.console import Console -from rich.progress import track - -from preprocessor.config.config import ( - BASE_OUTPUT_DIR, - settings, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.validation.episode_stats import EpisodeStats -from preprocessor.validation.report_generator import ReportGenerator -from preprocessor.validation.season_comparator import SeasonComparison - -console = Console() - - -class Validator: - def __init__( - self, - season: str, - series_name: str = "ranczo", - anomaly_threshold: float = 20.0, - base_output_dir: Path = BASE_OUTPUT_DIR, - episodes_info_json: Optional[Path] = None, - ): - self.season = season - self.series_name = series_name - self.anomaly_threshold = 
anomaly_threshold - self.base_output_dir = base_output_dir - self.episode_manager = EpisodeManager(episodes_info_json, series_name) - self.validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports - - def validate(self) -> int: - transcriptions_season_path = self.base_output_dir / "transcriptions" / self.season - if not transcriptions_season_path.exists(): - console.print(f"[red]Season directory not found: {transcriptions_season_path}[/red]") - return 1 - - console.print(f"[bold cyan]Validating season {self.season}...[/bold cyan]") - - episodes_stats = self.__collect_episodes_stats(transcriptions_season_path) - - if not episodes_stats: - console.print(f"[red]No episodes found in {transcriptions_season_path}[/red]") - return 1 - - self.validation_reports_dir.mkdir(parents=True, exist_ok=True) - - self.__generate_episode_reports(episodes_stats) - - season_comparison = SeasonComparison( - season=self.season, - anomaly_threshold=self.anomaly_threshold, - ) - season_comparison.compare_episodes(episodes_stats) - - report_generator = ReportGenerator( - season=self.season, - anomaly_threshold=self.anomaly_threshold, - ) - season_report_path = self.validation_reports_dir / f"{self.series_name}_{self.season}_season.json" - report_generator.generate_report(episodes_stats, season_comparison, season_report_path) - - self.__print_summary(episodes_stats, season_comparison) - - console.print(f"\n[green]Validation reports saved to: {self.validation_reports_dir}[/green]") - - return 0 - - def __collect_episodes_stats(self, transcriptions_season_path: Path) -> Dict[str, EpisodeStats]: - episode_dirs = sorted([d for d in transcriptions_season_path.iterdir() if d.is_dir() and d.name.startswith("E")]) - - episodes_stats = {} - for episode_dir in track(episode_dirs, description="Collecting episode stats"): - episode_num = int(episode_dir.name[1:]) - season_num = int(self.season[1:]) - - episode_info = 
self.episode_manager.get_episode_by_season_and_relative(season_num, episode_num) - if not episode_info: - console.print(f"[yellow]Skipping {episode_dir.name}: could not parse episode info[/yellow]") - continue - - episode_id = episode_info.episode_code() - stats = EpisodeStats( - episode_info=episode_info, - series_name=self.series_name, - ) - stats.collect_stats() - episodes_stats[episode_id] = stats - - return episodes_stats - - def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]): - for stats in episodes_stats.values(): - episode_report = { - "validation_timestamp": datetime.now().isoformat(), - "episode_id": stats.episode_info.episode_code(), - "episode_title": stats.episode_info.title, - "status": stats.status, - "errors": stats.errors, - "warnings": stats.warnings, - "stats": stats.to_dict()["stats"], - } - - file_naming = FileNamingConventions(self.series_name) - report_filename = file_naming.build_filename(stats.episode_info, extension="json") - report_path = self.validation_reports_dir / report_filename - atomic_write_json(report_path, episode_report) - - def __print_summary(self, episodes_stats: Dict[str, EpisodeStats], season_comparison: SeasonComparison): - console.print(f"\n[bold]Validation Summary for {self.season}[/bold]") - console.print(f"Total episodes: {len(episodes_stats)}") - - pass_count = sum(1 for stats in episodes_stats.values() if stats.status == "PASS") - warning_count = sum(1 for stats in episodes_stats.values() if stats.status == "WARNING") - fail_count = sum(1 for stats in episodes_stats.values() if stats.status == "FAIL") - - console.print(f" [green]PASS:[/green] {pass_count}") - console.print(f" [yellow]WARNING:[/yellow] {warning_count}") - console.print(f" [red]FAIL:[/red] {fail_count}") - - if season_comparison.anomalies: - console.print(f"\n[bold yellow]Anomalies detected: {len(season_comparison.anomalies)}[/bold yellow]") - for anomaly in season_comparison.anomalies[:5]: - color = "red" if 
anomaly.severity == "ERROR" else "yellow" - console.print( - f" [{color}]{anomaly.episode}[/{color}]: " - f"{anomaly.metric} = {anomaly.value} " - f"(avg: {anomaly.avg}, deviation: {anomaly.deviation_percent:.1f}%)", - ) - if len(season_comparison.anomalies) > 5: - console.print(f" ... and {len(season_comparison.anomalies) - 5} more") - - for episode_id, stats in episodes_stats.items(): - if stats.errors: - console.print(f"\n[red]Errors in {episode_id}:[/red]") - for error in stats.errors[:3]: - console.print(f" - {error}") - if len(stats.errors) > 3: - console.print(f" ... and {len(stats.errors) - 3} more") - - if stats.warnings: - console.print(f"\n[yellow]Warnings in {episode_id}:[/yellow]") - for warning in stats.warnings[:3]: - console.print(f" - {warning}") - if len(stats.warnings) > 3: - console.print(f" ... and {len(stats.warnings) - 3} more") diff --git a/preprocessor/video/__init__.py b/preprocessor/video/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/video/base_video_processor.py b/preprocessor/video/base_video_processor.py deleted file mode 100644 index c706faaab..000000000 --- a/preprocessor/video/base_video_processor.py +++ /dev/null @@ -1,45 +0,0 @@ -from abc import ABC -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.core.base_processor import ( - BaseProcessor, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager - - -class BaseVideoProcessor(BaseProcessor, ABC): - def __init__( - self, - args: Dict[str, Any], - class_name: str, - error_exit_code: int, - input_videos_key: str = "videos", - subdirectory_filter: str = None, - ): - super().__init__( - args=args, - class_name=class_name, - error_exit_code=error_exit_code, - loglevel=logging.DEBUG, - ) - - self.input_videos: Path = Path(self._args[input_videos_key]) - self.subdirectory_filter: str = subdirectory_filter - episodes_json_path = 
self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._create_video_processing_items( - source_path=self.input_videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=True, - subdirectory_filter=self.subdirectory_filter, - ) diff --git a/preprocessor/video/emotion_detection_subprocessor.py b/preprocessor/video/emotion_detection_subprocessor.py deleted file mode 100644 index 9ae007dc3..000000000 --- a/preprocessor/video/emotion_detection_subprocessor.py +++ /dev/null @@ -1,168 +0,0 @@ -import json -import logging -from pathlib import Path -from typing import ( - List, - Optional, -) - -import cv2 -from hsemotion_onnx.facial_emotions import HSEmotionRecognizer - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.utils.console import console -from preprocessor.utils.emotion_utils import ( - crop_face_from_frame, - detect_emotions_batch, - init_emotion_model, -) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.video.frame_processor import FrameSubProcessor - - -class EmotionDetectionSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Emotion Detection") - self.model: Optional[HSEmotionRecognizer] = None - self.logger = ErrorHandlingLogger("EmotionDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.model is None: - self.model = init_emotion_model() - - def cleanup(self) -> None: - self.model = None - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def 
get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - marker_file = episode_dir / ".emotion_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - series_name = item.metadata["series_name"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_file = episode_dir / detections_filename - - if not detections_file.exists(): - console.print( - f"[yellow]No character detections found for emotion analysis: {detections_file}[/yellow]", - ) - return False - - marker_file = episode_dir / ".emotion_complete" - return any(output.path == marker_file for output in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals,too-many-statements - self.initialize() - - episode_info = item.metadata["episode_info"] - series_name = item.metadata["series_name"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_file = episode_dir / detections_filename - - if not detections_file.exists(): - console.print(f"[yellow]No detections file: {detections_file}[/yellow]") - return - - with open(detections_file, "r", encoding="utf-8") as f: - detections_data = json.load(f) - - detections = detections_data.get("detections", 
[]) - - total_characters = sum(len(d.get("characters", [])) for d in detections) - console.print(f"[cyan]Collecting {total_characters} faces for batch emotion analysis[/cyan]") - - face_crops = [] - face_metadata = [] - - for detection_idx, detection in enumerate(detections): - frame_file = detection.get("frame_file") - if not frame_file: - continue - - frame_path = ramdisk_frames_dir / frame_file - - if not frame_path.exists(): - continue - - frame = cv2.imread(str(frame_path)) - if frame is None: - continue - - characters = detection.get("characters", []) - - for char_idx, char in enumerate(characters): - bbox = char.get("bbox") - if not bbox: - continue - - face_crop = crop_face_from_frame(frame, bbox) - if face_crop is None: - continue - - face_crops.append(face_crop) - face_metadata.append({ - "detection_idx": detection_idx, - "char_idx": char_idx, - }) - - if not face_crops: - console.print("[yellow]No valid face crops found[/yellow]") - return - - console.print(f"[cyan]Processing {len(face_crops)} faces with HSEmotion model[/cyan]") - - emotion_results = detect_emotions_batch(face_crops, self.model) - - processed = 0 - for result, metadata in zip(emotion_results, face_metadata): - if result is None: - continue - - dominant_emotion, confidence, emotion_scores = result - detection_idx = metadata["detection_idx"] - char_idx = metadata["char_idx"] - - char = detections[detection_idx]["characters"][char_idx] - char["emotion"] = { - "label": dominant_emotion, - "confidence": confidence, - "scores": emotion_scores, - } - processed += 1 - - atomic_write_json(detections_file, detections_data, indent=2, ensure_ascii=False) - - marker_file = detections_file.parent / ".emotion_complete" - marker_file.write_text("completed", encoding="utf-8") - - console.print( - f"[green]✓ Emotion analysis complete: {processed}/{total_characters} characters processed[/green]", - ) diff --git a/preprocessor/video/face_clustering_subprocessor.py 
b/preprocessor/video/face_clustering_subprocessor.py deleted file mode 100644 index ecff9593a..000000000 --- a/preprocessor/video/face_clustering_subprocessor.py +++ /dev/null @@ -1,287 +0,0 @@ -from collections import defaultdict -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from cuml.cluster import HDBSCAN as cuHDBSCAN -import cupy as cp -import cv2 -from insightface.app import FaceAnalysis -import numpy as np -import torch - -from preprocessor.characters.utils import init_face_detection -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor - - -class FaceClusteringSubProcessor(FrameSubProcessor): - def __init__( - self, - min_cluster_size: int, - min_samples: int, - save_noise: bool, - save_full_frames: bool, - ): - super().__init__("Face Clustering") - self.min_cluster_size = min_cluster_size - self.min_samples = min_samples - self.save_noise = save_noise - self.save_full_frames = save_full_frames - self.face_app: Optional[FaceAnalysis] = None - self.logger = ErrorHandlingLogger("FaceClusteringSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.face_app is None: - console.print("[cyan]Initializing face detection for clustering...[/cyan]") - self.face_app = init_face_detection() - console.print("[green]✓ Face detection initialized[/green]") - - def cleanup(self) -> None: - self.face_app = None - self.__cleanup_memory() - - def finalize(self) -> None: 
- if hasattr(self, 'logger'): - self.logger.finalize() - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.face_clusters) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - metadata_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="_face_clusters", - ) - metadata_output = episode_dir / metadata_filename - return [OutputSpec(path=metadata_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - episode_info = item.metadata["episode_info"] - - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - if not frame_files: - console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") - return - - console.print(f"[cyan]Extracting faces and vectors from {len(frame_files)} frames[/cyan]") - - face_data = self.__extract_faces_with_vectors(frame_files) - - if len(face_data) == 0: - console.print("[yellow]No faces detected, skipping clustering[/yellow]") - return - - console.print(f"[cyan]Clustering {len(face_data)} faces[/cyan]") - labels = self.__cluster_faces(face_data) - - console.print("[cyan]Saving clusters[/cyan]") - series_name = item.metadata["series_name"] - self.__save_clusters(episode_info, face_data, labels, frame_files, series_name) - - def __extract_faces_with_vectors(self, frame_files: List[Path]) -> List[Dict[str, Any]]: - face_data = [] - - for idx, frame_path in enumerate(frame_files): - if idx % 50 
== 0: - console.print(f"[cyan]Processing frame {idx}/{len(frame_files)}[/cyan]") - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - faces = self.face_app.get(img) - - for face_idx, face in enumerate(faces): - bbox = face.bbox.astype(int) - x1, y1, x2, y2 = bbox - - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(img.shape[1], x2) - y2 = min(img.shape[0], y2) - - face_img = img[y1:y2, x1:x2] - - if face_img.size == 0: - continue - - face_data.append({ - 'vector': face.normed_embedding, - 'frame_path': frame_path, - 'bbox': bbox, - 'face_img': face_img, - 'face_idx': face_idx, - }) - - console.print(f"[green]✓ Found {len(face_data)} faces in {len(frame_files)} frames[/green]") - return face_data - - def __cluster_faces(self, face_data: List[Dict[str, Any]]) -> np.ndarray: - vectors = np.array([fd['vector'] for fd in face_data]) - - console.print(f"[cyan]Clustering with GPU HDBSCAN (min_cluster_size={self.min_cluster_size}, min_samples={self.min_samples})[/cyan]") - vectors_gpu = cp.asarray(vectors) - - clusterer = cuHDBSCAN( - min_cluster_size=self.min_cluster_size, - min_samples=self.min_samples, - metric='euclidean', - cluster_selection_method='eom', - ) - labels = clusterer.fit_predict(vectors_gpu) - labels = cp.asnumpy(labels) - - n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - n_noise = list(labels).count(-1) - - console.print(f"[green]✓ Found {n_clusters} clusters[/green]") - console.print(f"[green]✓ {n_noise} faces marked as noise[/green]") - - return labels - - def __save_clusters( # pylint: disable=too-many-locals - self, - episode_info, - face_data: List[Dict[str, Any]], - labels: np.ndarray, - all_frame_files: List[Path], - series_name: str, - ) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.face_clusters) - episode_dir.mkdir(parents=True, exist_ok=True) - - clusters = defaultdict(list) - for face_info, label in zip(face_data, labels): - clusters[label].append(face_info) - - 
cluster_stats = [] - - for cluster_id, faces in sorted(clusters.items()): - if cluster_id == -1: - if not self.save_noise: - continue - cluster_dir = episode_dir / "noise" - else: - cluster_dir = episode_dir / f"cluster_{cluster_id}" - - faces_dir = cluster_dir / "faces" - faces_dir.mkdir(parents=True, exist_ok=True) - - if self.save_full_frames: - frames_dir = cluster_dir / "frames" - frames_dir.mkdir(parents=True, exist_ok=True) - - saved_frames = set() - cluster_frames = [] - - for face_info in faces: - frame_name = face_info['frame_path'].stem - face_idx = face_info['face_idx'] - face_output_path = faces_dir / f"{frame_name}_face{face_idx}.jpg" - - if face_info['face_img'].size > 0: - cv2.imwrite(str(face_output_path), face_info['face_img']) - - if self.save_full_frames and frame_name not in saved_frames: - frame_output_path = frames_dir / f"{frame_name}.jpg" - img = cv2.imread(str(face_info['frame_path'])) - if img is not None: - cv2.imwrite(str(frame_output_path), img) - saved_frames.add(frame_name) - cluster_frames.append(f"{frame_name}.jpg") - - cluster_label = "noise" if cluster_id == -1 else f"cluster_{cluster_id}" - console.print(f"[green]✓ Saved {len(faces)} faces to {cluster_label}[/green]") - - cluster_stats.append({ - "cluster_id": cluster_label, - "face_count": len(faces), - "frame_count": len(saved_frames), - "frames": sorted(cluster_frames), - "character_name": None, - }) - - self.__save_metadata(episode_info, face_data, labels, cluster_stats, all_frame_files, series_name) - - def __save_metadata( - self, - episode_info, - face_data: List[Dict[str, Any]], - labels: np.ndarray, - cluster_stats: List[Dict[str, Any]], - all_frame_files: List[Path], - series_name: str, - ) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.face_clusters) - - n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - n_noise = list(labels).count(-1) - frames_with_faces = len(set(fd['frame_path'] for fd in face_data)) - - 
metadata = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "min_cluster_size": self.min_cluster_size, - "min_samples": self.min_samples, - "metric": "euclidean", - "algorithm": "hdbscan", - "model": settings.face_recognition.model_name, - }, - statistics={ - "total_faces_detected": len(face_data), - "total_clusters": n_clusters, - "noise_faces": n_noise, - "frames_processed": len(all_frame_files), - "frames_with_faces": frames_with_faces, - }, - results_key="clusters", - results_data=cluster_stats, - ) - file_naming = FileNamingConventions(series_name) - metadata_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="_face_clusters", - ) - metadata_output = episode_dir / metadata_filename - atomic_write_json(metadata_output, metadata, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved cluster metadata to: {metadata_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/video/frame_exporter.py b/preprocessor/video/frame_exporter.py deleted file mode 100644 index f505e66d8..000000000 --- a/preprocessor/video/frame_exporter.py +++ /dev/null @@ -1,271 +0,0 @@ -from datetime import datetime -import json -from pathlib import Path -import shutil -import subprocess -from typing import ( - Any, - Dict, - List, - Optional, -) - -from PIL import Image -import decord - -from bot.types import FrameRequest -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.enums import KeyframeStrategy -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.embeddings.strategies.strategy_factory import KeyframeStrategyFactory -from preprocessor.utils.console import console -from preprocessor.utils.file_utils 
import atomic_write_json -from preprocessor.video.base_video_processor import BaseVideoProcessor - - -class FrameExporter(BaseVideoProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=10, - input_videos_key="videos", - ) - decord.bridge.set_bridge('native') - - self.output_frames: Path = Path(self._args.get("output_frames", settings.frame_export.output_dir)) - self.output_frames.mkdir(parents=True, exist_ok=True) - - self.scene_timestamps_dir: Path = Path(self._args.get("scene_timestamps_dir", settings.scene_detection.output_dir)) - - resolution = self._args.get("resolution", settings.frame_export.resolution) - self.resize_width: int = resolution.width - self.resize_height: int = resolution.height - - keyframe_strategy_str = self._args.get("keyframe_strategy", settings.keyframe_extraction.strategy) - self.keyframe_strategy = KeyframeStrategy(keyframe_strategy_str) - self.frames_per_scene: int = self._args.get("frames_per_scene", settings.keyframe_extraction.scene_changes.frames_per_scene) - - self.strategy = KeyframeStrategyFactory.create( - self.keyframe_strategy, - self.frames_per_scene, - ) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") - - if "scene_timestamps_dir" in args: - scene_path = Path(args["scene_timestamps_dir"]) - if scene_path and not scene_path.exists(): - console.print(f"[yellow]Warning: Scene timestamps directory does not exist: {scene_path}[/yellow]") - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.frames) - - metadata_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_frame_metadata", - ) - metadata_file = episode_dir / metadata_filename - 
return [OutputSpec(path=metadata_file, required=True)] - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return [] - temp_metadata = expected_outputs[0].path.with_suffix('.json.tmp') - return [str(temp_metadata)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - episode_info = item.metadata["episode_info"] - episode_dir = self.__get_episode_dir(episode_info) - - if episode_dir.exists(): - metadata_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_frame_metadata", - ) - metadata_file = episode_dir / metadata_filename - if not metadata_file.exists(): - console.print(f"[yellow]Cleaning incomplete frames from previous run: {episode_dir}[/yellow]") - shutil.rmtree(episode_dir, ignore_errors=True) - - episode_dir.mkdir(parents=True, exist_ok=True) - - data = self.__prepare_data(episode_info) - frame_requests = self.strategy.extract_frame_requests(item.input_path, data) - - if not frame_requests: - console.print(f"[yellow]No frames to extract for {item.input_path.name}[/yellow]") - return - - console.print(f"[cyan]Extracting {len(frame_requests)} keyframes from {item.input_path.name}[/cyan]") - - try: - self.__extract_frames(item.input_path, frame_requests, episode_dir, episode_info) - self.__write_metadata(episode_dir, frame_requests, episode_info, item.input_path) - console.print(f"[green]✓ Exported {len(frame_requests)} frames to {episode_dir}[/green]") - except Exception as e: - self.logger.error(f"Failed to extract frames from {item.input_path}: {e}") - console.print(f"[yellow]Cleaning incomplete frames due to error: {episode_dir}[/yellow]") - shutil.rmtree(episode_dir, ignore_errors=True) - raise - - def __get_episode_dir(self, episode_info) -> Path: - return self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.frames) - - def 
__prepare_data(self, episode_info) -> Dict[str, Any]: - data = {} - scene_timestamps = self.__load_scene_timestamps(episode_info) - if scene_timestamps: - data["scene_timestamps"] = scene_timestamps - return data - - def __extract_frames(self, video_file: Path, frame_requests: List[FrameRequest], episode_dir: Path, episode_info) -> None: - metadata = self.__get_video_metadata(video_file) - self.current_video_dar = self.__calculate_display_aspect_ratio(metadata) - - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - frame_numbers = [req["frame_number"] for req in frame_requests] - - with self.progress.track_operation(f"Keyframes ({len(frame_numbers)} frames)", len(frame_numbers)) as tracker: - for idx, frame_num in enumerate(frame_numbers, 1): - self.__extract_and_save_frame(vr, frame_num, episode_dir, episode_info) - tracker.update(idx, interval=50) - - del vr - - def __extract_and_save_frame(self, vr, frame_num: int, episode_dir: Path, episode_info) -> None: - frame_np = vr[frame_num].asnumpy() - frame_pil = Image.fromarray(frame_np) - - resized = self.__resize_frame(frame_pil, self.current_video_dar) - base_filename = self.episode_manager.file_naming.build_base_filename(episode_info) - filename = f"{base_filename}_frame_{frame_num:06d}.jpg" - resized.save(episode_dir / filename, quality=90) - - @staticmethod - def __get_video_metadata(video_path: Path) -> Dict[str, Any]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=width,height,sample_aspect_ratio,display_aspect_ratio", - "-of", "json", - str(video_path), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get("streams", []) - if not streams: - raise ValueError(f"No video streams found in {video_path}") - return streams[0] - - @staticmethod - def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: - width = 
metadata.get("width", 0) - height = metadata.get("height", 0) - if width == 0 or height == 0: - raise ValueError("Invalid video dimensions") - - sar_str = metadata.get("sample_aspect_ratio", "1:1") - if sar_str == "N/A" or not sar_str: - sar_str = "1:1" - - try: - sar_num, sar_denom = [int(x) for x in sar_str.split(":")] - sar = sar_num / sar_denom if sar_denom != 0 else 1.0 - except (ValueError, ZeroDivisionError): - sar = 1.0 - - return (width / height) * sar - - def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: - target_aspect = self.resize_width / self.resize_height - - if abs(display_aspect_ratio - target_aspect) < 0.01: - return frame.resize((self.resize_width, self.resize_height), Image.Resampling.LANCZOS) - - if display_aspect_ratio > target_aspect: - new_height = self.resize_height - new_width = int(self.resize_height * display_aspect_ratio) - resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) - - x_crop = (new_width - self.resize_width) // 2 - cropped = resized.crop((x_crop, 0, x_crop + self.resize_width, self.resize_height)) - return cropped - - new_width = self.resize_width - new_height = int(self.resize_width / display_aspect_ratio) - resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) - - result = Image.new('RGB', (self.resize_width, self.resize_height), (0, 0, 0)) - y_offset = (self.resize_height - new_height) // 2 - result.paste(resized, (0, y_offset)) - return result - - @staticmethod - def __calculate_total_scenes(frame_requests: List[FrameRequest]) -> int: - scene_numbers = set(f.get("scene_number", -1) for f in frame_requests) - has_invalid = -1 in scene_numbers - return len(scene_numbers) - (1 if has_invalid else 0) - - def __write_metadata(self, episode_dir: Path, frame_requests: List[FrameRequest], episode_info, source_video: Path) -> None: - frame_types_count = {} - frames_with_paths = [] - - for frame in frame_requests: - frame_type = frame.get("type", 
"unknown") - frame_types_count[frame_type] = frame_types_count.get(frame_type, 0) + 1 - - frame_with_path = frame.copy() - frame_num = frame["frame_number"] - base_filename = self.episode_manager.file_naming.build_base_filename(episode_info) - frame_with_path["frame_path"] = f"{base_filename}_frame_{frame_num:06d}.jpg" - frames_with_paths.append(frame_with_path) - - metadata = { - "generated_at": datetime.now().isoformat(), - "episode_info": { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - "absolute_episode": episode_info.absolute_episode, - }, - "source_video": str(source_video), - "processing_parameters": { - "frame_width": self.resize_width, - "frame_height": self.resize_height, - "keyframe_strategy": self.keyframe_strategy.value, - "frames_per_scene": self.frames_per_scene, - }, - "statistics": { - "total_frames": len(frame_requests), - "frame_types": frame_types_count, - "total_scenes": self.__calculate_total_scenes(frame_requests), - "timestamp_range": { - "start": min((f.get("timestamp", 0) for f in frame_requests), default=0), - "end": max((f.get("timestamp", 0) for f in frame_requests), default=0), - }, - }, - "frames": frames_with_paths, - } - metadata_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_frame_metadata", - ) - metadata_file = episode_dir / metadata_filename - atomic_write_json(metadata_file, metadata, indent=2, ensure_ascii=False) - - def __load_scene_timestamps(self, episode_info) -> Optional[Dict[str, Any]]: - if not self.scene_timestamps_dir or not self.scene_timestamps_dir.exists(): - return None - return EpisodeManager.load_scene_timestamps(episode_info, self.scene_timestamps_dir, self.logger) diff --git a/preprocessor/video/frame_processor.py b/preprocessor/video/frame_processor.py deleted file mode 100644 index 717b17766..000000000 --- a/preprocessor/video/frame_processor.py +++ /dev/null @@ -1,142 +0,0 @@ -import logging -from pathlib 
import Path -import shutil -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.utils.console import console - - -class FrameProcessor(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=15, - loglevel=logging.DEBUG, - ) - - self.frames_dir: Path = Path(self._args.get("frames_dir", settings.frame_export.output_dir)) - self.ramdisk_path: Path = Path(self._args.get("ramdisk_path", "/dev/shm")) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.sub_processors: List['FrameSubProcessor'] = [] - - def _validate_args(self, args: Dict[str, Any]) -> None: - pass - - def add_sub_processor(self, processor: 'FrameSubProcessor') -> None: - self.sub_processors.append(processor) - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._get_episode_processing_items_from_metadata( - "**/*_frame_metadata.json", - self.frames_dir, - self.episode_manager, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - outputs = [] - for sub_processor in self.sub_processors: - outputs.extend(sub_processor.get_expected_outputs(item)) - return outputs - - def cleanup(self) -> None: - for sub_processor in self.sub_processors: - sub_processor.finalize() - console.print("[green]✓ All sub-processors finalized[/green]") - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - frames_episode_dir = metadata_file.parent - season = episode_info.season - episode = episode_info.relative_episode - - any_sub_processor_will_run = 
any( - sub_processor.should_run(item, missing_outputs) - for sub_processor in self.sub_processors - ) - - if not any_sub_processor_will_run: - for sub_processor in self.sub_processors: - console.print(f"[yellow]Skipping: {sub_processor.name} (output exists)[/yellow]") - return - - any_sub_processor_needs_ramdisk = any( - sub_processor.should_run(item, missing_outputs) and sub_processor.needs_ramdisk() - for sub_processor in self.sub_processors - ) - - if any_sub_processor_needs_ramdisk: - ramdisk_episode_dir = self.ramdisk_path / "frames" / f"S{season:02d}" / f"E{episode:02d}" - try: - self.__copy_frames_to_ramdisk(frames_episode_dir, ramdisk_episode_dir) - - for sub_processor in self.sub_processors: - if sub_processor.should_run(item, missing_outputs): - console.print(f"[cyan]Running: {sub_processor.name}[/cyan]") - sub_processor.process(item, ramdisk_episode_dir) - else: - console.print(f"[yellow]Skipping: {sub_processor.name} (output exists)[/yellow]") - - finally: - self.__cleanup_ramdisk(ramdisk_episode_dir) - else: - for sub_processor in self.sub_processors: - if sub_processor.should_run(item, missing_outputs): - console.print(f"[cyan]Running: {sub_processor.name}[/cyan]") - sub_processor.process(item, frames_episode_dir) - else: - console.print(f"[yellow]Skipping: {sub_processor.name} (output exists)[/yellow]") - - @staticmethod - def __copy_frames_to_ramdisk(source_dir: Path, dest_dir: Path) -> None: - dest_dir.mkdir(parents=True, exist_ok=True) - - frame_files = list(source_dir.glob("*frame_*.jpg")) - console.print(f"[cyan]Copying {len(frame_files)} frames to RAMdisk: {dest_dir}[/cyan]") - - for frame_file in frame_files: - shutil.copy2(frame_file, dest_dir / frame_file.name) - - console.print("[green]✓ Frames copied to RAMdisk[/green]") - - @staticmethod - def __cleanup_ramdisk(ramdisk_dir: Path) -> None: - if ramdisk_dir.exists(): - shutil.rmtree(ramdisk_dir) - console.print(f"[green]✓ RAMdisk cleaned: {ramdisk_dir}[/green]") - - -class 
FrameSubProcessor: - def __init__(self, name: str): - self.name = name - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - raise NotImplementedError - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - raise NotImplementedError - - def needs_ramdisk(self) -> bool: - return True - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - raise NotImplementedError - - def finalize(self) -> None: - pass diff --git a/preprocessor/video/frame_subprocessors.py b/preprocessor/video/frame_subprocessors.py deleted file mode 100644 index 85b4c9e6a..000000000 --- a/preprocessor/video/frame_subprocessors.py +++ /dev/null @@ -1,769 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, - Set, - Tuple, -) - -from insightface.app import FaceAnalysis -import numpy as np -import torch - -from preprocessor.characters.face_detection_utils import load_character_references -from preprocessor.characters.utils import init_face_detection -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.hashing.image_hasher import PerceptualHasher -from preprocessor.utils.batch_processing_utils import ( - compute_embeddings_in_batches, - compute_hashes_in_batches, -) -from preprocessor.utils.console import console -from preprocessor.utils.detection_io import ( - process_frames_for_detection, - save_character_detections, -) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode -from 
preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor - -# pylint: disable=duplicate-code - - - -class ImageHashSubProcessor(FrameSubProcessor): - def __init__(self, device: str, batch_size: int): - super().__init__("Image Hashing") - self.device = device - self.batch_size = batch_size - self.hasher: Optional[PerceptualHasher] = None - self.logger = ErrorHandlingLogger("ImageHashSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.hasher is None: - self.hasher = PerceptualHasher(device=self.device, hash_size=8) - - def cleanup(self) -> None: - self.hasher = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.image_hashes) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - hash_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - return [OutputSpec(path=hash_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return - - hash_results = 
compute_hashes_in_batches(ramdisk_frames_dir, frame_requests, self.hasher, self.batch_size) - series_name = item.metadata["series_name"] - self.__save_hashes(episode_info, hash_results, series_name) - - def __save_hashes(self, episode_info, hash_results: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.image_hashes) - episode_dir.mkdir(parents=True, exist_ok=True) - - hash_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "device": self.device, - "batch_size": self.batch_size, - "hash_size": 8, - }, - statistics={ - "total_hashes": len(hash_results), - "unique_hashes": len(set(h.get("perceptual_hash") for h in hash_results if "perceptual_hash" in h)), - }, - results_key="image_hashes", - results_data=hash_results, - ) - - file_naming = FileNamingConventions(series_name) - hash_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - atomic_write_json(hash_output, hash_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved hashes to: {hash_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -class VideoEmbeddingSubProcessor(FrameSubProcessor): - def __init__(self, device: str, batch_size: int, model_name: str, model_revision: str): - super().__init__("Video Embeddings") - self.device = device - self.batch_size = batch_size - self.model_name = model_name - self.model_revision = model_revision - self.model = None - self.gpu_processor: Optional[GPUBatchProcessor] = None - self.logger = ErrorHandlingLogger("VideoEmbeddingSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.model is None: - from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder # pylint: disable=import-outside-toplevel - 
console.print(f"[cyan]Loading embedding model: {self.model_name}[/cyan]") - self.model = Qwen3VLEmbedder( - model_name_or_path=self.model_name, - torch_dtype=torch.bfloat16, - ) - self.gpu_processor = GPUBatchProcessor( - self.model, - self.batch_size, - self.logger, - self.device, - progress_sub_batch_size=settings.embedding.progress_sub_batch_size, - ) - console.print("[green]✓ Qwen3-VL-Embedding model loaded[/green]") - - def cleanup(self) -> None: - self.model = None - self.gpu_processor = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - video_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="embeddings_video", - ) - video_output = episode_dir / video_filename - return [OutputSpec(path=video_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return - - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) - checkpoint_file = episode_dir / "embeddings_video_checkpoint.json" - - 
image_hashes = load_image_hashes_for_episode( - {"season": episode_info.season, "episode_number": episode_info.relative_episode}, - self.logger, - ) - video_embeddings = compute_embeddings_in_batches( - ramdisk_frames_dir, - frame_requests, - self.gpu_processor, - self.batch_size, - image_hashes, - checkpoint_file=checkpoint_file, - checkpoint_interval=20, - prefetch_count=settings.embedding.prefetch_chunks, - ) - series_name = item.metadata["series_name"] - self.__save_embeddings(episode_info, video_embeddings, series_name) - - def __save_embeddings(self, episode_info, video_embeddings: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) - episode_dir.mkdir(parents=True, exist_ok=True) - - video_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "batch_size": self.batch_size, - "device": self.device, - }, - statistics={ - "total_embeddings": len(video_embeddings), - "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, - "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), - }, - results_key="video_embeddings", - results_data=video_embeddings, - ) - file_naming = FileNamingConventions(series_name) - video_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="embeddings_video", - ) - video_output = episode_dir / video_filename - atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved embeddings to: {video_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -class CharacterDetectionSubProcessor(FrameSubProcessor): - def __init__(self, characters_dir: Path, use_gpu: bool, threshold: float): - super().__init__("Character 
Detection") - self.characters_dir = characters_dir - self.use_gpu = use_gpu - self.threshold = threshold - self.face_app: Optional[FaceAnalysis] = None - self.character_vectors: Dict[str, np.ndarray] = {} - self.logger = ErrorHandlingLogger("CharacterDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.face_app is None: - console.print("[cyan]Initializing face detection...[/cyan]") - self.face_app = init_face_detection() - self.character_vectors = load_character_references(self.characters_dir, self.face_app) - console.print("[green]✓ Face detection initialized[/green]") - - def cleanup(self) -> None: - self.face_app = None - self.character_vectors = {} - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_output = episode_dir / detections_filename - return [OutputSpec(path=detections_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - if not self.characters_dir.exists(): - console.print(f"[yellow]Characters directory not found: {self.characters_dir}, skipping[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - if not self.character_vectors: - console.print("[yellow]No character references loaded, skipping detection[/yellow]") - return - - episode_info = 
item.metadata["episode_info"] - - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - console.print(f"[cyan]Detecting characters in {len(frame_files)} frames[/cyan]") - - fps = 25.0 - - results = process_frames_for_detection( - frame_files, - self.face_app, - self.character_vectors, - self.threshold, - fps=fps, - ) - save_character_detections(episode_info, results, fps=fps) - - -class ObjectDetectionSubProcessor(FrameSubProcessor): - def __init__(self, model_name: str = "ustc-community/dfine-xlarge-obj2coco", conf_threshold: float = 0.25): - super().__init__("Object Detection") - self.model_name = model_name - self.conf_threshold = conf_threshold - self.model: Optional[Any] = None - self.image_processor: Optional[Any] = None - self.logger = ErrorHandlingLogger("ObjectDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.model is None: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
Object detection requires GPU.") - - from transformers import ( # pylint: disable=import-outside-toplevel - AutoImageProcessor, - DFineForObjectDetection, - ) - - console.print(f"[cyan]Loading D-FINE model: {self.model_name}[/cyan]") - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) - self.model = DFineForObjectDetection.from_pretrained(self.model_name) - self.model.to("cuda") - console.print("[green]✓ D-FINE model loaded on GPU[/green]") - - def cleanup(self) -> None: - self.model = None - self.image_processor = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="_object_detections", - ) - detections_output = episode_dir / detections_filename - return [OutputSpec(path=detections_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals - self.initialize() - - from PIL import Image # pylint: disable=import-outside-toplevel - - episode_info = item.metadata["episode_info"] - - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - if not frame_files: - console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") - return - - console.print(f"[cyan]Detecting objects in 
{len(frame_files)} frames[/cyan]") - - detections_data = { - "episode_code": episode_info.episode_code(), - "model": self.model_name, - "confidence_threshold": self.conf_threshold, - "frames": [], - } - - batch_size = 8 - for batch_start in range(0, len(frame_files), batch_size): - batch_paths = frame_files[batch_start:batch_start + batch_size] - batch_images = [Image.open(fp) for fp in batch_paths] - target_sizes = [(img.height, img.width) for img in batch_images] - - inputs = self.image_processor(images=batch_images, return_tensors="pt") - inputs = {k: v.to("cuda") for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - - results = self.image_processor.post_process_object_detection( - outputs, - target_sizes=target_sizes, - threshold=self.conf_threshold, - ) - - for frame_path, result in zip(batch_paths, results): - frame_result = { - "frame_name": frame_path.name, - "detections": [], - } - - for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): - score_value = score.item() - label = label_id.item() - box_coords = [float(i) for i in box.tolist()] - - detection = { - "class_id": label, - "class_name": self.model.config.id2label[label], - "confidence": score_value, - "bbox": { - "x1": box_coords[0], - "y1": box_coords[1], - "x2": box_coords[2], - "y2": box_coords[3], - }, - } - frame_result["detections"].append(detection) - - frame_result["detection_count"] = len(frame_result["detections"]) - detections_data["frames"].append(frame_result) - - for img in batch_images: - img.close() - - total_detections = sum(f['detection_count'] for f in detections_data['frames']) - frames_with_detections = len([f for f in detections_data['frames'] if f['detection_count'] > 0]) - - console.print(f"[green]✓ Total detections: {total_detections}[/green]") - console.print(f"[green]✓ Frames with detections: {frames_with_detections}/{len(frame_files)}[/green]") - - class_counts = {} - for frame in detections_data["frames"]: 
- for det in frame["detections"]: - class_name = det["class_name"] - class_counts[class_name] = class_counts.get(class_name, 0) + 1 - - if class_counts: - top_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:5] - console.print(f"[cyan]Top 5 classes: {', '.join(f'{cls}:{cnt}' for cls, cnt in top_classes)}[/cyan]") - - series_name = item.metadata["series_name"] - self.__save_detections(episode_info, detections_data, series_name) - - def __save_detections(self, episode_info, detections_data: Dict[str, Any], series_name: str) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) - episode_dir.mkdir(parents=True, exist_ok=True) - - output_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "model": self.model_name, - "confidence_threshold": self.conf_threshold, - }, - statistics={ - "total_frames": len(detections_data["frames"]), - "total_detections": sum(f['detection_count'] for f in detections_data['frames']), - "frames_with_detections": len([f for f in detections_data['frames'] if f['detection_count'] > 0]), - }, - results_key="detections", - results_data=detections_data["frames"], - ) - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="_object_detections", - ) - detections_output = episode_dir / detections_filename - atomic_write_json(detections_output, output_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved object detections to: {detections_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -class ObjectDetectionVisualizationSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Object Detection Visualization") - self.logger = ErrorHandlingLogger("ObjectDetectionVisualizationSubProcessor", logging.DEBUG, 15) - - 
def initialize(self) -> None: - pass - - def cleanup(self) -> None: - pass - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_visualizations) - marker_file = episode_dir / ".visualization_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No object detections found for {episode_info.episode_code()}, skipping visualization[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - import cv2 # pylint: disable=import-outside-toplevel - - episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") - return - - if not ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") - return - 
- with open(detection_file, 'r', encoding='utf-8') as f: - detection_data = json.load(f) - - frames_with_detections = [f for f in detection_data.get("detections", []) if f['detection_count'] > 0] - if not frames_with_detections: - console.print(f"[yellow]No frames with detections for {episode_info.episode_code()}[/yellow]") - return - - output_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_visualizations) - output_dir.mkdir(parents=True, exist_ok=True) - colors = self.__generate_colors() - conf_threshold = detection_data.get("processing_params", {}).get("confidence_threshold", 0.25) - - console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames for {episode_info.episode_code()}[/cyan]") - - for frame_data in frames_with_detections: - output_path = output_dir / frame_data['frame_name'] - if output_path.exists(): - continue - - frame_path = ramdisk_frames_dir / frame_data['frame_name'] - if not frame_path.exists(): - continue - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - self.__draw_detections_on_frame(img, frame_data['detections'], colors, conf_threshold) - cv2.imwrite(str(output_path), img) - - marker_file = output_dir / ".visualization_complete" - marker_file.write_text(f"completed: {len(frames_with_detections)} frames") - console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") - - @staticmethod - def __draw_detections_on_frame(img, detections: List[Dict[str, Any]], colors: Dict[int, Tuple[int, int, int]], conf_threshold: float) -> None: - import cv2 # pylint: disable=import-outside-toplevel - - for detection in detections: - if detection['confidence'] < conf_threshold: - continue - - class_id = detection['class_id'] - bbox = detection['bbox'] - x1, y1 = int(bbox['x1']), int(bbox['y1']) - x2, y2 = int(bbox['x2']), int(bbox['y2']) - color = colors.get(class_id, (0, 255, 0)) - - cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - - label = 
f"{detection['class_name']} {detection['confidence']:.2f}" - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 = max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - - @staticmethod - def __generate_colors(num_colors: int = 80) -> Dict[int, Tuple[int, int, int]]: - np.random.seed(42) - colors = {} - for i in range(num_colors): - colors[i] = tuple(int(x) for x in np.random.randint(50, 255, 3)) - return colors - - -class CharacterDetectionVisualizationSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Character Detection Visualization") - self.logger = ErrorHandlingLogger("CharacterDetectionVisualizationSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - pass - - def cleanup(self) -> None: - pass - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_visualizations) - marker_file = episode_dir / ".visualization_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - detection_files = list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No character detections found for {episode_info.episode_code()}, skipping 
visualization[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals - import cv2 # pylint: disable=import-outside-toplevel - - episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) - detection_files = list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") - return - - if not ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") - return - - with open(detection_file, 'r', encoding='utf-8') as f: - detection_data = json.load(f) - - frames_with_detections = [f for f in detection_data.get("detections", []) if f.get('characters')] - if not frames_with_detections: - console.print(f"[yellow]No frames with character detections for {episode_info.episode_code()}[/yellow]") - return - - output_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_visualizations) - output_dir.mkdir(parents=True, exist_ok=True) - - all_character_names = set() - for frame_data in frames_with_detections: - for char in frame_data.get('characters', []): - all_character_names.add(char['name']) - colors = self.__generate_character_colors(all_character_names) - - console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames with characters for {episode_info.episode_code()}[/cyan]") - - for frame_data in frames_with_detections: - frame_name = frame_data.get('frame_file') or frame_data.get('frame') - if not frame_name: - continue - - output_path = output_dir / frame_name - if 
output_path.exists(): - continue - - frame_path = ramdisk_frames_dir / frame_name - if not frame_path.exists(): - continue - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - self.__draw_characters_on_frame(img, frame_data['characters'], colors) - cv2.imwrite(str(output_path), img) - - marker_file = output_dir / ".visualization_complete" - marker_file.write_text(f"completed: {len(frames_with_detections)} frames") - console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") - - @staticmethod - def __draw_characters_on_frame(img, characters: List[Dict[str, Any]], colors: Dict[str, Tuple[int, int, int]]) -> None: - import cv2 # pylint: disable=import-outside-toplevel - - for character in characters: - name = character['name'] - confidence = character['confidence'] - bbox = character['bbox'] - - x1, y1 = bbox['x1'], bbox['y1'] - x2, y2 = bbox['x2'], bbox['y2'] - color = colors.get(name, (0, 255, 0)) - - cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - - label = f"{name} {confidence:.2f}" - if "emotion" in character: - emotion_label = character["emotion"]["label"] - emotion_conf = character["emotion"]["confidence"] - label += f" | {emotion_label} {emotion_conf:.2f}" - - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 = max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - - @staticmethod - def __generate_character_colors(character_names: Set[str]) -> Dict[str, Tuple[int, int, int]]: - np.random.seed(42) - colors = {} - sorted_names = sorted(character_names) - for _, name in enumerate(sorted_names): - colors[name] = tuple(int(x) for x in np.random.randint(50, 255, 3)) - return colors diff --git a/preprocessor/video/scene_detector.py b/preprocessor/video/scene_detector.py deleted file mode 100644 
index 0f06c7e4e..000000000 --- a/preprocessor/video/scene_detector.py +++ /dev/null @@ -1,205 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import decord -import numpy as np -import torch -from transnetv2_pytorch import TransNetV2 - -from bot.types import SceneDict -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json - - -class SceneDetector(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=8, - loglevel=logging.DEBUG, - ) - - self.videos: Path = self._args["videos"] - self.output_dir: Path = self._args.get("output_dir", settings.scene_detection.output_dir) - self.threshold: float = self._args.get("threshold", settings.scene_detection.threshold) - self.min_scene_len: int = self._args.get("min_scene_len", settings.scene_detection.min_scene_len) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.model = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
TransNetV2 requires GPU.") - - def cleanup(self) -> None: - console.print("[cyan]Unloading TransNetV2 model and clearing GPU memory...[/cyan]") - if hasattr(self, 'model') and self.model is not None: - del self.model - self.model = None - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print("[green]✓ TransNetV2 model unloaded, GPU memory cleared[/green]") - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._create_video_processing_items( - source_path=self.videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=False, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata.get("episode_info") - - if episode_info: - output_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="scenes", - ) - output_path = OutputPathBuilder.build_scene_path(episode_info, output_filename) - else: - output_filename = f"{item.input_path.stem}_scenes.json" - output_path = OutputPathBuilder.get_episode_dir(None, settings.output_subdirs.scenes) / output_filename - - return [OutputSpec(path=output_path, required=True)] - - def _get_processing_info(self) -> List[str]: - return ["[cyan]Scene detection using TransNetV2 on CUDA[/cyan]"] - - def _load_resources(self) -> bool: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
TransNetV2 requires GPU.") - - console.print("[cyan]Loading TransNetV2 model on CUDA...[/cyan]") - self.model = TransNetV2().cuda() - console.print("[green]✓ TransNetV2 ready on CUDA[/green]") - return True - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - video_file = item.input_path - output_file = missing_outputs[0].path - - console.print(f"[cyan]Processing: {video_file.name}[/cyan]") - - video_info = self.__get_video_info(video_file) - if not video_info: - self.logger.error(f"Failed to get video info for {video_file}") - return - - scene_list = self.__detect_scenes_transnetv2(video_file, video_info) - - if not scene_list: - console.print(f"[yellow]No scenes detected in {video_file.name}[/yellow]") - return - - result = { - "total_scenes": len(scene_list), - "video_info": video_info, - "detection_settings": { - "threshold": self.threshold, - "min_scene_len": self.min_scene_len, - "method": "transnetv2", - }, - "scenes": scene_list, - } - - output_file.parent.mkdir(parents=True, exist_ok=True) - - atomic_write_json(output_file, result, indent=2, ensure_ascii=False) - - console.print(f"[green]{video_file.name}: {len(scene_list)} scenes -> {output_file}[/green]") - - def __detect_scenes_transnetv2( - self, video_file: Path, video_info: Dict[str, Any], - ) -> List[SceneDict]: - try: # pylint: disable=too-many-try-statements - _, single_frame_predictions, _ = self.model.predict_video(str(video_file)) - - scene_changes = np.where(single_frame_predictions > self.threshold)[0] - - scenes = [] - fps = video_info["fps"] - prev_frame = 0 - - for frame_num in scene_changes: - if frame_num - prev_frame < self.min_scene_len: - continue - - scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps) - scenes.append(scene) - prev_frame = frame_num - - total_frames = video_info["total_frames"] - if total_frames - prev_frame > self.min_scene_len: - scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, 
total_frames, fps) - scenes.append(scene) - - return scenes - - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"TransNetV2 detection failed: {e}") - return [] - - def __get_video_info(self, video_file: Path) -> Optional[Dict[str, Any]]: - try: - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - fps = vr.get_avg_fps() - total_frames = len(vr) - duration = total_frames / fps if fps > 0 else 0 - - return { - "fps": fps, - "duration": duration, - "total_frames": total_frames, - } - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Error reading video info: {e}") - return None - - def __create_scene_dict(self, scene_number: int, start_frame: int, end_frame: int, fps: float) -> SceneDict: - return { - "scene_number": scene_number, - "start": { - "frame": int(start_frame), - "seconds": float(start_frame / fps), - "timecode": self.__frame_to_timecode(start_frame, fps), - }, - "end": { - "frame": int(end_frame), - "seconds": float(end_frame / fps), - "timecode": self.__frame_to_timecode(end_frame, fps), - }, - "duration": float((end_frame - start_frame) / fps), - "frame_count": int(end_frame - start_frame), - } - - @staticmethod - def __frame_to_timecode(frame: int, fps: float) -> str: - seconds = frame / fps - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - frames = int((seconds % 1) * fps) - return f"{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}" diff --git a/preprocessor/video/transcoder.py b/preprocessor/video/transcoder.py deleted file mode 100644 index ad44fc9d3..000000000 --- a/preprocessor/video/transcoder.py +++ /dev/null @@ -1,257 +0,0 @@ -import json -import os -from pathlib import Path -import subprocess -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION -from preprocessor.core.output_path_builder 
import OutputPathBuilder -from preprocessor.utils.constants import ( - FfprobeKeys, - FfprobeStreamKeys, -) -from preprocessor.utils.resolution import Resolution -from preprocessor.video.base_video_processor import BaseVideoProcessor - - -class VideoTranscoder(BaseVideoProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=3, - input_videos_key="videos", - ) - - self.resolution: Resolution = self._args["resolution"] - self.codec: str = str(self._args["codec"]) - self.preset: str = "p7" - self.video_bitrate_mbps: Optional[float] = self._args.get("video_bitrate_mbps") - self.minrate_mbps: Optional[float] = self._args.get("minrate_mbps") - self.maxrate_mbps: Optional[float] = self._args.get("maxrate_mbps") - self.bufsize_mbps: Optional[float] = self._args.get("bufsize_mbps") - self.audio_bitrate_kbps: int = int(self._args.get("audio_bitrate_kbps", 128)) - self.gop_size: float = float(self._args["gop_size"]) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") - if "resolution" not in args: - raise ValueError("resolution is required") - if "codec" not in args: - raise ValueError("codec is required") - if "gop_size" not in args: - raise ValueError("gop_size is required") - if "transcoded_videos" not in args: - raise ValueError("transcoded_videos is required") - if "video_bitrate_mbps" not in args or args["video_bitrate_mbps"] is None: - raise ValueError("video_bitrate_mbps is required for VBR mode") - if "minrate_mbps" not in args or args["minrate_mbps"] is None: - raise ValueError("minrate_mbps is required for VBR mode") - if "maxrate_mbps" not in args or args["maxrate_mbps"] is None: - raise ValueError("maxrate_mbps is required for VBR mode") - if "bufsize_mbps" not in args or args["bufsize_mbps"] is None: - raise ValueError("bufsize_mbps is required for VBR mode") - - videos_path = 
Path(args["videos"]) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - output_path = OutputPathBuilder.build_video_path(episode_info, self.series_name, extension=DEFAULT_VIDEO_EXTENSION) - return [OutputSpec(path=output_path, required=True)] - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return [] - temp_path = expected_outputs[0].path.with_suffix('.mp4.tmp') - return [str(temp_path)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - video_file = item.input_path - output_path = missing_outputs[0].path - temp_path = output_path.with_suffix('.mp4.tmp') - - try: - temp_path.parent.mkdir(parents=True, exist_ok=True) - self.__transcode_video(video_file, temp_path) - temp_path.replace(output_path) - self.logger.info(f"Processed: {video_file} -> {output_path}") - except subprocess.CalledProcessError as e: - self.logger.error(f"FFmpeg failed for {video_file}: {e}") - if temp_path.exists(): - temp_path.unlink() - raise - except Exception as e: - self.logger.error(f"Unexpected error during transcoding {video_file}: {e}") - if temp_path.exists(): - temp_path.unlink() - raise - - def __transcode_video(self, input_video: Path, output_video: Path) -> None: - input_fps = self.__get_framerate(input_video) - input_video_bitrate = self.__get_video_bitrate(input_video) - input_audio_bitrate = self.__get_audio_bitrate(input_video) - - target_fps = min(input_fps, 30.0) - if target_fps < input_fps: - self.logger.info( - f"Input FPS ({input_fps}) > 30. 
Limiting to {target_fps} FPS for compatibility and smaller file size.", - ) - - video_bitrate = self.video_bitrate_mbps - minrate = self.minrate_mbps - maxrate = self.maxrate_mbps - bufsize = self.bufsize_mbps - - if input_video_bitrate and input_video_bitrate < video_bitrate: - adjusted_bitrate = min(input_video_bitrate * 1.05, video_bitrate) - ratio = adjusted_bitrate / video_bitrate - video_bitrate = adjusted_bitrate - minrate = round(minrate * ratio, 2) - maxrate = round(maxrate * ratio, 2) - bufsize = round(bufsize * ratio, 2) - self.logger.info( - f"Input video bitrate ({input_video_bitrate} Mbps) < target ({self.video_bitrate_mbps} Mbps). " - f"Adjusted to {video_bitrate} Mbps to avoid quality loss.", - ) - - audio_bitrate = self.audio_bitrate_kbps - if input_audio_bitrate and input_audio_bitrate < audio_bitrate: - adjusted_audio_bitrate = min(int(input_audio_bitrate * 1.05), audio_bitrate) - audio_bitrate = adjusted_audio_bitrate - self.logger.info( - f"Input audio bitrate ({input_audio_bitrate} kbps) < target ({self.audio_bitrate_kbps} kbps). 
" - f"Adjusted to {audio_bitrate} kbps to avoid quality loss.", - ) - - vf_filter = ( - "scale='iw*sar:ih'," - f"scale={self.resolution.width}:{self.resolution.height}:force_original_aspect_ratio=decrease," - f"pad={self.resolution.width}:{self.resolution.height}:(ow-iw)/2:(oh-ih)/2:black," - "setsar=1" - ) - - command = [ - "ffmpeg", - "-v", "error", - "-stats", - "-hide_banner", - "-y", - "-i", str(input_video), - "-c:v", self.codec, - "-preset", self.preset, - "-profile:v", "main", - "-level", "4.1", - "-pix_fmt", "yuv420p", - ] - - if target_fps < input_fps: - command.extend(["-r", str(target_fps)]) - - command.extend([ - "-rc", "vbr_hq", - "-b:v", f"{video_bitrate}M", - "-minrate", f"{minrate}M", - "-maxrate", f"{maxrate}M", - "-bufsize", f"{bufsize}M", - "-bf", "2", - "-b_adapt", "1", - "-2pass", "1", - "-rc-lookahead", "32", - "-aq-strength", "15", - ]) - - command.extend([ - "-g", str(int(target_fps * self.gop_size)), - "-spatial-aq", "1", - "-temporal-aq", "1", - "-multipass", "fullres", - "-c:a", "aac", - "-b:a", f"{audio_bitrate}k", - "-ac", "2", - "-vf", vf_filter, - "-movflags", "+faststart", - "-f", "mp4", - str(output_video), - ]) - - self.logger.debug(f"Transcoding: {input_video.name} -> {output_video.name}") - self.logger.debug(f"FFmpeg command: {' '.join(command)}") - self.logger.debug(f"LD_LIBRARY_PATH: {os.environ.get('LD_LIBRARY_PATH', 'not set')[:200]}") - - try: - subprocess.run(command, check=True, capture_output=False, text=True) - except subprocess.CalledProcessError as e: - self.logger.error(f"FFmpeg failed with exit code: {e.returncode}") - raise - - @staticmethod - def __get_framerate(video: Path) -> float: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=r_frame_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = 
probe_data.get(FfprobeKeys.STREAMS, []) - if not streams: - raise ValueError(f"No video streams found in {video}") - r_frame_rate: Optional[str] = streams[0].get(FfprobeStreamKeys.R_FRAME_RATE) - if not r_frame_rate: - raise ValueError(f"Frame rate not found in {video}") - num, denom = [int(x) for x in r_frame_rate.split("/")] - - return num / denom - - @staticmethod - def __get_video_bitrate(video: Path) -> Optional[float]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=bit_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get(FfprobeKeys.STREAMS, []) - if not streams: - return None - bit_rate = streams[0].get(FfprobeStreamKeys.BIT_RATE) - if not bit_rate: - return None - return round(int(bit_rate) / 1_000_000, 2) - - @staticmethod - def __get_audio_bitrate(video: Path) -> Optional[int]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "a:0", - "-show_entries", "stream=bit_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get(FfprobeKeys.STREAMS, []) - if not streams: - return None - bit_rate = streams[0].get(FfprobeStreamKeys.BIT_RATE) - if not bit_rate: - return None - return int(int(bit_rate) / 1000)